N°7571 - ⬆️ Bump HTML2Text library version

2026-04-23 18:48:51 +02:00 · 2024-06-13 14:45:06 +02:00
parent bfbb046b10
commit 47d65931e6
24 changed files with 2529 additions and 381 deletions
--- a/lib/soundasleep/html2text/src/Html2Text.php
+++ b/lib/soundasleep/html2text/src/Html2Text.php
@@ -0,0 +1,505 @@
+<?php
+
+namespace Soundasleep;
+
+class Html2Text {
+
+	/**
+	 * Baseline settings for the converter. The keys returned here are also
+	 * the complete list of option names that convert() accepts.
+	 *
+	 * @return array option name => default value
+	 */
+	public static function defaultOptions() {
+		$defaults = array();
+		$defaults['ignore_errors'] = false;
+		$defaults['drop_links'] = false;
+
+		return $defaults;
+	}
+
+	/**
+	 * Tries to convert the given HTML into a plain text format - best suited for
+	 * e-mail display, etc.
+	 *
+	 * <p>In particular, it tries to maintain the following features:
+	 * <ul>
+	 *   <li>Links are maintained, with the 'href' copied over
+	 *   <li>Information in the &lt;head&gt; is lost
+	 * </ul>
+	 *
+	 * @param string $html the input HTML
+	 * @param array|boolean $options option overrides, keyed as in defaultOptions();
+	 *                               a bare boolean is accepted as the legacy (< 1.0)
+	 *                               form and is used as the 'ignore_errors' value
+	 * @return string the HTML converted, as best as possible, to text
+	 * @throws \InvalidArgumentException if $options contains a key that is not in defaultOptions()
+	 * @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
+	 */
+	public static function convert($html, $options = array()) {
+
+		if ($options === false || $options === true) {
+			// Using old style (< 1.0) of passing in options
+			$options = array('ignore_errors' => $options);
+		}
+
+		$defaults = static::defaultOptions();
+
+		// check all options are valid; the lookup is by exact key so that an
+		// integer key such as 0 can never loosely match a string option name
+		foreach (array_keys($options) as $key) {
+			if (!array_key_exists($key, $defaults)) {
+				throw new \InvalidArgumentException("Unknown html2text option '$key'");
+			}
+		}
+
+		$options = array_merge($defaults, $options);
+
+		$is_office_document = static::isOfficeDocument($html);
+
+		if ($is_office_document) {
+			// remove office namespace
+			$html = str_replace(array("<o:p>", "</o:p>"), "", $html);
+		}
+
+		$html = static::fixNewlines($html);
+		if (mb_detect_encoding($html, "UTF-8", true)) {
+			// Turn every non-ASCII character into a numeric entity so the
+			// parser does not have to guess the input encoding. This replaces
+			// mb_convert_encoding(..., "HTML-ENTITIES", ...), which is
+			// deprecated since PHP 8.2; markup characters are left untouched.
+			$html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), "UTF-8");
+		}
+
+		$doc = static::getDocument($html, $options['ignore_errors']);
+
+		$output = static::iterateOverNode($doc, null, false, $is_office_document, $options);
+
+		// process output for whitespace/newlines
+		$output = static::processWhitespaceNewlines($output);
+
+		return $output;
+	}
+
+	/**
+	 * Normalise line endings to "\n": every "\r\n" pair is collapsed first,
+	 * then each "\r" that is still left is converted as well, so Unix,
+	 * Windows and old Mac newlines all end up as "\n".
+	 *
+	 * @param string $text text with any number of \r, \r\n and \n combinations
+	 * @return string the text with "\n" as its only line terminator
+	 */
+	static function fixNewlines($text) {
+		// str_replace() works through the search entries in order, so the
+		// "\r\n" pairs are gone before the lone "\r" pass runs
+		return str_replace(array("\r\n", "\r"), "\n", $text);
+	}
+
+	/**
+	 * Character sequences that renderText() replaces with a regular space.
+	 *
+	 * @return string[] the UTF-8 bytes of U+00A0, followed by the six literal
+	 *                  characters `\u00a0` (without braces, `\u` is not an
+	 *                  escape sequence in a PHP double-quoted string)
+	 */
+	static function nbspCodes() {
+		$utf8_bytes = "\xc2\xa0";
+		$literal_escape = "\u00a0";
+
+		return array($utf8_bytes, $literal_escape);
+	}
+
+	/**
+	 * Character sequences that renderText() removes entirely.
+	 *
+	 * @return string[] the UTF-8 bytes of U+200C, followed by the six literal
+	 *                  characters `\u200c` (without braces, `\u` is not an
+	 *                  escape sequence in a PHP double-quoted string)
+	 */
+	static function zwnjCodes() {
+		$utf8_bytes = "\xe2\x80\x8c";
+		$literal_escape = "\u200c";
+
+		return array($utf8_bytes, $literal_escape);
+	}
+
+	/**
+	 * Tidy up rendered multiline text: strips spaces around tabs, leading and
+	 * trailing whitespace (for the whole text and for each line), and squeezes
+	 * runs of empty lines down to a single empty line.
+	 *
+	 * @param string $text multiline text any number of leading or trailing spaces or excess lines
+	 * @return string the cleaned-up text
+	 */
+	static function processWhitespaceNewlines($text) {
+
+		// drop spaces hugging a tab, then whitespace at the very start
+		$text = ltrim(preg_replace("/ *\t */im", "\t", $text));
+
+		// strip indentation at the start of every following line
+		$text = preg_replace("/\n[ \t]*/im", "\n", $text);
+
+		// Non-breaking spaces become regular spaces only now, after the
+		// indentation pass, so that nbsp-based indentation is kept.
+		// Whitespace at the very end is trimmed straight afterwards.
+		$text = rtrim(static::renderText($text));
+
+		// strip spaces and tabs sitting at the end of each line
+		$text = preg_replace("/[ \t]*\n/im", "\n", $text);
+
+		// Newlines inside <pre> were armored as "\r" by iterateOverNode();
+		// turn them back into "\n", then collapse runs of empty lines
+		return preg_replace("/\n\n\n*/im", "\n\n", static::fixNewlines($text));
+	}
+
+	/**
+	 * Parse HTML into a DOMDocument
+	 *
+	 * Input that is empty after trimming yields an empty document. Input that
+	 * does not start with a tag is wrapped in a body element before parsing.
+	 *
+	 * @param string $html the input HTML
+	 * @param boolean $ignore_error Ignore xml parsing errors
+	 * @return \DOMDocument the parsed document tree
+	 * @throws Html2TextException if loadHTML() reports a failure
+	 */
+	static function getDocument($html, $ignore_error = false) {
+
+		$doc = new \DOMDocument();
+
+		$html = trim($html);
+
+		// Compare against the empty string explicitly: a plain truthiness
+		// test would also discard the valid input "0"
+		if ($html === '') {
+			// DOMDocument doesn't support empty value and throws an error
+			// Return empty document instead
+			return $doc;
+		}
+
+		if ($html[0] !== '<') {
+			// If HTML does not begin with a tag, we put a body tag around it.
+			// If we do not do this, PHP will insert a paragraph tag around
+			// the first block of text for some reason which can mess up
+			// the newlines. See pre.html test for an example.
+			$html = '<body>' . $html . '</body>';
+		}
+
+		if ($ignore_error) {
+			$doc->strictErrorChecking = false;
+			$doc->recover = true;
+			$doc->xmlStandalone = true;
+			// Collect libxml errors internally while loading, then restore
+			// whatever setting the caller had before
+			$old_internal_errors = libxml_use_internal_errors(true);
+			$load_result = $doc->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET | LIBXML_PARSEHUGE);
+			libxml_use_internal_errors($old_internal_errors);
+		}
+		else {
+			$load_result = $doc->loadHTML($html);
+		}
+
+		if (!$load_result) {
+			throw new Html2TextException("Could not load HTML - badly formed?", $html);
+		}
+
+		return $doc;
+	}
+
+	/**
+	 * Guess whether the HTML was generated by Microsoft Office, based on the
+	 * presence of the Office schema namespace anywhere in the markup.
+	 *
+	 * @param string $html the input HTML
+	 * @return boolean true if the Office namespace URN occurs in $html
+	 */
+	static function isOfficeDocument($html) {
+		// strpos() gives an integer offset on a match and false otherwise
+		$offset = strpos($html, "urn:schemas-microsoft-com:office");
+
+		return is_int($offset);
+	}
+
+	/**
+	 * Swap special characters for simple text equivalents, to prevent output
+	 * issues and to stay close to what a browser would display:
+	 * - each sequence from nbspCodes() becomes a regular space; and
+	 * - each sequence from zwnjCodes() is removed.
+	 *
+	 * @param string $text the text to clean
+	 * @return string the text after both replacements
+	 */
+	static function renderText($text) {
+		$without_nbsp = str_replace(static::nbspCodes(), " ", $text);
+
+		return str_replace(static::zwnjCodes(), "", $without_nbsp);
+	}
+
+	/**
+	 * Tell whether the text holds nothing but whitespace once it has been
+	 * passed through renderText().
+	 *
+	 * @param string $text the text to inspect
+	 * @return boolean true if only newlines, carriage returns, tabs or spaces remain
+	 */
+	static function isWhitespace($text) {
+		$rendered = static::renderText($text);
+
+		return trim($rendered, "\n\r\t ") === '';
+	}
+
+	/**
+	 * Find the lower-cased node name of the next sibling that matters for
+	 * layout: the first following sibling that is either an element or a
+	 * text node with non-whitespace content. Other siblings are skipped.
+	 *
+	 * @param \DOMNode $node the node whose following siblings are scanned
+	 * @return string|null the sibling's node name, or null if no such sibling exists
+	 */
+	static function nextChildName($node) {
+		for ($sibling = $node->nextSibling; $sibling != null; $sibling = $sibling->nextSibling) {
+			if ($sibling instanceof \DOMElement) {
+				break;
+			}
+
+			if ($sibling instanceof \DOMText && !static::isWhitespace($sibling->wholeText)) {
+				break;
+			}
+		}
+
+		// $sibling is null here when the scan ran off the end of the list
+		if ($sibling instanceof \DOMElement || $sibling instanceof \DOMText) {
+			return strtolower($sibling->nodeName);
+		}
+
+		return null;
+	}
+
+	/**
+	 * Recursively render a DOM node and its descendants as plain text.
+	 *
+	 * Child nodes are detached from $node while they are processed, so the
+	 * tree that is passed in is consumed by the call.
+	 *
+	 * $options has a default value so that the signature no longer declares a
+	 * required parameter after optional ones (deprecated since PHP 8.0).
+	 *
+	 * @param \DOMNode $node the node to render
+	 * @param string|null $prevName lower-cased node name of the previous visible sibling, or null if there is none
+	 * @param boolean $in_pre whether $node is inside a pre element
+	 * @param boolean $is_office_document whether the source was detected as Microsoft Office HTML
+	 * @param array $options conversion options; only 'drop_links' is read here, a missing key counts as false
+	 * @return string the text rendering of $node
+	 */
+	static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false, $options = array()) {
+		if ($node instanceof \DOMText) {
+			if ($in_pre) {
+				$text = "\n" . trim(static::renderText($node->wholeText), "\n\r\t ") . "\n";
+
+				// Remove trailing whitespace only
+				$text = preg_replace("/[ \t]*\n/im", "\n", $text);
+
+				// armor newlines with \r.
+				return str_replace("\n", "\r", $text);
+
+			} else {
+				// Replace whitespace characters with a space (equivalent to \s)
+				$text = static::renderText($node->wholeText);
+				$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text);
+
+				if (!static::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
+					return "\n" . $text;
+				}
+				return $text;
+			}
+		}
+
+		if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
+			// ignore
+			return "";
+		}
+
+		$name = strtolower($node->nodeName);
+		$nextName = static::nextChildName($node);
+
+		// start whitespace
+		switch ($name) {
+			case "hr":
+				$prefix = '';
+				if ($prevName != null) {
+					$prefix = "\n";
+				}
+				return $prefix . "---------------------------------------------------------------\n";
+
+			case "style":
+			case "head":
+			case "title":
+			case "meta":
+			case "script":
+				// ignore these tags
+				return "";
+
+			case "h1":
+			case "h2":
+			case "h3":
+			case "h4":
+			case "h5":
+			case "h6":
+			case "ol":
+			case "ul":
+			case "pre":
+				// add two newlines
+				$output = "\n\n";
+				break;
+
+			case "td":
+			case "th":
+				// add tab char to separate table fields
+				$output = "\t";
+				break;
+
+			case "p":
+				// Microsoft exchange emails often include HTML which, when passed through
+				// html2text, results in lots of double line returns everywhere.
+				//
+				// To fix this, for any p element with a className of `MsoNormal` (the standard
+				// classname in any Microsoft export or outlook for a paragraph that behaves
+				// like a line return) we skip the first line returns and set the name to br.
+				if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') {
+					$output = "";
+					$name = 'br';
+					break;
+				}
+
+				// add two lines
+				$output = "\n\n";
+				break;
+
+			case "tr":
+				// add one line
+				$output = "\n";
+				break;
+
+			case "div":
+				$output = "";
+				if ($prevName !== null) {
+					// add one line
+					$output .= "\n";
+				}
+				break;
+
+			case "li":
+				$output = "- ";
+				break;
+
+			default:
+				// print out contents of unknown tags
+				$output = "";
+				break;
+		}
+
+		// debug
+		//$output .= "[$name,$nextName]";
+
+		if (isset($node->childNodes)) {
+
+			$n = $node->childNodes->item(0);
+			$previousSiblingNames = array();
+			$previousSiblingName = null;
+
+			$parts = array();
+			$trailing_whitespace = 0;
+
+			while ($n != null) {
+
+				$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);
+
+				// Pass current node name to next child, as previousSibling does not appear to get populated
+				if ($n instanceof \DOMDocumentType
+					|| $n instanceof \DOMProcessingInstruction
+					|| ($n instanceof \DOMText && static::isWhitespace($text))) {
+					// Keep current previousSiblingName, these are invisible
+					$trailing_whitespace++;
+				}
+				else {
+					$previousSiblingName = strtolower($n->nodeName);
+					$previousSiblingNames[] = $previousSiblingName;
+					$trailing_whitespace = 0;
+				}
+
+				// Detach the processed child; the next one then becomes item(0)
+				$node->removeChild($n);
+				$n = $node->childNodes->item(0);
+
+				$parts[] = $text;
+			}
+
+			// Remove trailing whitespace, important for the br check below
+			while ($trailing_whitespace-- > 0) {
+				array_pop($parts);
+			}
+
+			// suppress last br tag inside a node list if follows text
+			$last_name = array_pop($previousSiblingNames);
+			if ($last_name === 'br') {
+				$last_name = array_pop($previousSiblingNames);
+				if ($last_name === '#text') {
+					array_pop($parts);
+				}
+			}
+
+			$output .= implode('', $parts);
+		}
+
+		// end whitespace
+		switch ($name) {
+			case "h1":
+			case "h2":
+			case "h3":
+			case "h4":
+			case "h5":
+			case "h6":
+			case "pre":
+			case "p":
+				// add two lines
+				$output .= "\n\n";
+				break;
+
+			case "br":
+				// add one line
+				$output .= "\n";
+				break;
+
+			case "div":
+				break;
+
+			case "a":
+				// links are returned in [text](link) format
+				$href = $node->getAttribute("href");
+				$drop_links = !empty($options['drop_links']);
+
+				$output = trim($output);
+
+				// remove double [[ ]] s from linking images
+				if (substr($output, 0, 1) == "[" && substr($output, -1) == "]") {
+					$output = substr($output, 1, strlen($output) - 2);
+
+					// for linking images, the title of the <a> overrides the title of the <img>
+					if ($node->getAttribute("title")) {
+						$output = $node->getAttribute("title");
+					}
+				}
+
+				// if there is no link text, but a title attr
+				// (compared against '' so that the link text "0" is not mistaken for empty)
+				if ($output === '' && $node->getAttribute("title")) {
+					$output = $node->getAttribute("title");
+				}
+
+				if ($href == null) {
+					// it doesn't link anywhere
+					if ($node->getAttribute("name") != null) {
+						if (!$drop_links) {
+							$output = "[$output]";
+						}
+					}
+				} else {
+					if ($href == $output || $href == "mailto:$output" || $href == "http://$output" || $href == "https://$output") {
+						// link to the same address: just use link
+						$output = "$output";
+					} else {
+						// replace it
+						if ($output !== '') {
+							if (!$drop_links) {
+								$output = "[$output]($href)";
+							}
+						} else {
+							// empty string
+							$output = "$href";
+						}
+					}
+				}
+
+				// does the next node require additional whitespace?
+				switch ($nextName) {
+					case "h1": case "h2": case "h3": case "h4": case "h5": case "h6":
+						$output .= "\n";
+						break;
+				}
+				break;
+
+			case "img":
+				if ($node->getAttribute("title")) {
+					$output = "[" . $node->getAttribute("title") . "]";
+				} elseif ($node->getAttribute("alt")) {
+					$output = "[" . $node->getAttribute("alt") . "]";
+				} else {
+					$output = "";
+				}
+				break;
+
+			case "li":
+				$output .= "\n";
+				break;
+
+			case "blockquote":
+				// process quoted text for whitespace/newlines
+				$output = static::processWhitespaceNewlines($output);
+
+				// add leading newline
+				$output = "\n" . $output;
+
+				// prepend '> ' at the beginning of all lines
+				$output = preg_replace("/\n/im", "\n> ", $output);
+
+				// replace leading '> >' with '>>'
+				$output = preg_replace("/\n> >/im", "\n>>", $output);
+
+				// add another leading newline and trailing newlines
+				$output = "\n" . $output . "\n\n";
+				break;
+			default:
+				// do nothing
+		}
+
+		return $output;
+	}
+}