✅ fix unit tests

2026-04-23 10:38:45 +02:00 · 2024-06-17 14:52:55 +02:00
parent 7dc1672b24
commit 71e36d264e
14 changed files with 191 additions and 1693 deletions
--- a/lib/soundasleep/html2text/src/Html2Text.php
+++ b/lib/soundasleep/html2text/src/Html2Text.php
@@ -4,11 +4,13 @@ namespace Soundasleep;

 class Html2Text {

-	public static function defaultOptions() {
-		return array(
+	/** @return array<string, bool | string> default value for every option that convert() accepts */
+	public static function defaultOptions(): array {
+		return [
 			'ignore_errors' => false,
 			'drop_links'    => false,
-		);
+			'char_set'      => 'auto'
+		];
 	}

 	/**
@@ -22,15 +24,15 @@ class Html2Text {
 	 * </ul>
 	 *
 	 * @param string $html the input HTML
-	 * @param boolean $ignore_error Ignore xml parsing errors
+	 * @param boolean|array<string, bool | string> $options legacy (< 1.0) boolean meaning 'ignore_errors', or an options array; the defaults are ['ignore_errors' => false, 'drop_links' => false, 'char_set' => 'auto']
 	 * @return string the HTML converted, as best as possible, to text
 	 * @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
 	 */
-	public static function convert($html, $options = array()) {
+	public static function convert(string $html, $options = []): string {

 		if ($options === false || $options === true) {
 			// Using old style (< 1.0) of passing in options
-			$options = array('ignore_errors' => $options);
+			$options = ['ignore_errors' => $options];
 		}

 		$options = array_merge(static::defaultOptions(), $options);
@@ -38,28 +40,30 @@ class Html2Text {
 		// check all options are valid
 		foreach ($options as $key => $value) {
 			if (!in_array($key, array_keys(static::defaultOptions()))) {
-				throw new \InvalidArgumentException("Unknown html2text option '$key'");
+				throw new \InvalidArgumentException("Unknown html2text option '$key'. Valid options are " . implode(',', array_keys(static::defaultOptions())));
 			}
 		}

-		$is_office_document = static::isOfficeDocument($html);
+		$is_office_document = self::isOfficeDocument($html);

 		if ($is_office_document) {
 			// remove office namespace
-			$html = str_replace(array("<o:p>", "</o:p>"), "", $html);
+			$html = str_replace(["<o:p>", "</o:p>"], "", $html);
 		}

-		$html = static::fixNewlines($html);
-		if (mb_detect_encoding($html, "UTF-8", true)) {
+		$html = self::fixNewlines($html);
+
+		// use mb_convert_encoding for legacy versions of php
+		if (PHP_MAJOR_VERSION * 10 + PHP_MINOR_VERSION < 81 && mb_detect_encoding($html, "UTF-8", true)) {
 			$html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8");
 		}

-		$doc = static::getDocument($html, $options['ignore_errors']);
+		$doc = self::getDocument($html, $options);

-		$output = static::iterateOverNode($doc, null, false, $is_office_document, $options);
+		$output = self::iterateOverNode($doc, null, false, $is_office_document, $options);

 		// process output for whitespace/newlines
-		$output = static::processWhitespaceNewlines($output);
+		$output = self::processWhitespaceNewlines($output);

 		return $output;
 	}
@@ -72,7 +76,7 @@ class Html2Text {
 	 * @param string $text text with any number of \r, \r\n and \n combinations
 	 * @return string the fixed text
 	 */
-	static function fixNewlines($text) {
+	public static function fixNewlines(string $text): string {
 		// replace \r\n to \n
 		$text = str_replace("\r\n", "\n", $text);
 		// remove \rs
@@ -81,18 +85,20 @@ class Html2Text {
 		return $text;
 	}

-	static function nbspCodes() {
-		return array(
+	/** @return array<string> non-breaking-space sequences that renderText() replaces with a regular space */
+	public static function nbspCodes(): array {
+		return [
 			"\xc2\xa0",
 			"\u00a0",
-		);
+		];
 	}

-	static function zwnjCodes() {
-		return array(
+	/** @return array<string> zero-width non-joiner sequences that renderText() removes from the text */
+	public static function zwnjCodes(): array {
+		return [
 			"\xe2\x80\x8c",
 			"\u200c",
-		);
+		];
 	}

 	/**
@@ -101,7 +107,7 @@ class Html2Text {
 	 * @param string $text multiline text any number of leading or trailing spaces or excess lines
 	 * @return string the fixed text
 	 */
-	static function processWhitespaceNewlines($text) {
+	public static function processWhitespaceNewlines(string $text): string {

 		// remove excess spaces around tabs
 		$text = preg_replace("/ *\t */im", "\t", $text);
@@ -115,7 +121,7 @@ class Html2Text {
 		// convert non-breaking spaces to regular spaces to prevent output issues,
 		// do it here so they do NOT get removed with other leading spaces, as they
 		// are sometimes used for indentation
-		$text = static::renderText($text);
+		$text = self::renderText($text);

 		// remove trailing whitespace
 		$text = rtrim($text);
@@ -124,7 +130,7 @@ class Html2Text {
 		$text = preg_replace("/[ \t]*\n/im", "\n", $text);

 		// unarmor pre blocks
-		$text = static::fixNewLines($text);
+		$text = self::fixNewLines($text);

 		// remove unnecessary empty lines
 		$text = preg_replace("/\n\n\n*/im", "\n\n", $text);
@@ -132,14 +138,25 @@ class Html2Text {
 		return $text;
 	}

+	/**
+	 * Can we guess that this HTML is generated by Microsoft Office?
+	 */
+	public static function isOfficeDocument(string $html): bool {
+		return strpos($html, "urn:schemas-microsoft-com:office") !== false;
+	}
+
+	public static function isWhitespace(string $text): bool {
+		return strlen(trim(self::renderText($text), "\n\r\t ")) === 0;
+	}
+
 	/**
 	 * Parse HTML into a DOMDocument
 	 *
 	 * @param string $html the input HTML
-	 * @param boolean $ignore_error Ignore xml parsing errors
+	 * @param array<string, bool | string> $options
 	 * @return \DOMDocument the parsed document tree
 	 */
-	static function getDocument($html, $ignore_error = false) {
+	private static function getDocument(string $html, array $options): \DOMDocument {

 		$doc = new \DOMDocument();

@@ -159,16 +176,34 @@ class Html2Text {
 			$html = '<body>' . $html . '</body>';
 		}

-		if ($ignore_error) {
+		$header = '';
+		// use char sets for modern versions of php
+		if (PHP_MAJOR_VERSION * 10 + PHP_MINOR_VERSION >= 81) {
+			// use specified char_set, or auto detect if not set; detection yields false on failure
+			$char_set = ! empty($options['char_set']) ? (string) $options['char_set'] : 'auto';
+			if ('auto' === $char_set) {
+				$char_set = mb_detect_encoding($html);
+			} else if (strpos($char_set, ',') !== false) {
+				// comma-separated candidates: pass them directly instead of mutating the global detect order
+				$char_set = mb_detect_encoding($html, $char_set);
+			}
+			// turn off error detection for Windows-1252 legacy html
+			if (is_string($char_set) && strpos($char_set, '1252') !== false) {
+				$options['ignore_errors'] = true;
+			}
+			$header = is_string($char_set) ? '<?xml version="1.0" encoding="' . $char_set . '">' : '';
+		}
+
+		if (! empty($options['ignore_errors'])) {
 			$doc->strictErrorChecking = false;
 			$doc->recover = true;
 			$doc->xmlStandalone = true;
 			$old_internal_errors = libxml_use_internal_errors(true);
-			$load_result = $doc->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET | LIBXML_PARSEHUGE);
+			$load_result = $doc->loadHTML($header . $html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET | LIBXML_PARSEHUGE);
 			libxml_use_internal_errors($old_internal_errors);
 		}
 		else {
-			$load_result = $doc->loadHTML($html);
+			$load_result = $doc->loadHTML($header . $html);
 		}

 		if (!$load_result) {
@@ -178,13 +213,6 @@ class Html2Text {
 		return $doc;
 	}

-	/**
-	 * Can we guess that this HTML is generated by Microsoft Office?
-	 */
-	static function isOfficeDocument($html) {
-		return strpos($html, "urn:schemas-microsoft-com:office") !== false;
-	}
-
 	/**
 	 * Replace any special characters with simple text versions, to prevent output issues:
 	 * - Convert non-breaking spaces to regular spaces; and
@@ -193,22 +221,18 @@ class Html2Text {
 	 * This is to match our goal of rendering documents as they would be rendered
 	 * by a browser.
 	 */
-	static function renderText($text) {
-		$text = str_replace(static::nbspCodes(), " ", $text);
-		$text = str_replace(static::zwnjCodes(), "", $text);
+	private static function renderText(string $text): string {
+		$text = str_replace(self::nbspCodes(), " ", $text);
+		$text = str_replace(self::zwnjCodes(), "", $text);
 		return $text;
 	}

-	static function isWhitespace($text) {
-		return strlen(trim(static::renderText($text), "\n\r\t ")) === 0;
-	}
-
-	static function nextChildName($node) {
+	private static function nextChildName(?\DOMNode $node): ?string {
 		// get the next child
 		$nextNode = $node->nextSibling;
 		while ($nextNode != null) {
 			if ($nextNode instanceof \DOMText) {
-				if (!static::isWhitespace($nextNode->wholeText)) {
+				if (!self::isWhitespace($nextNode->wholeText)) {
 					break;
 				}
 			}
@@ -228,11 +252,12 @@ class Html2Text {
 		return $nextName;
 	}

-	static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false, $options = []) {
+	/** @param array<string, bool | string> $options */
+	private static function iterateOverNode(\DOMNode $node, ?string $prevName, bool $in_pre, bool $is_office_document, array $options): string {
 		if ($node instanceof \DOMText) {
 		  // Replace whitespace characters with a space (equivilant to \s)
 			if ($in_pre) {
-				$text = "\n" . trim(static::renderText($node->wholeText), "\n\r\t ") . "\n";
+				$text = "\n" . trim(self::renderText($node->wholeText), "\n\r\t ") . "\n";

 				// Remove trailing whitespace only
 				$text = preg_replace("/[ \t]*\n/im", "\n", $text);
@@ -240,15 +265,14 @@ class Html2Text {
 				// armor newlines with \r.
 				return str_replace("\n", "\r", $text);

-			} else {
-				$text = static::renderText($node->wholeText);
-				$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text);
-
-				if (!static::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
-					return "\n" . $text;
-				}
-				return $text;
 			}
+			$text = self::renderText($node->wholeText);
+			$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text);
+
+			if (!self::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
+				return "\n" . $text;
+			}
+			return $text;
 		}

 		if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
@@ -257,7 +281,7 @@ class Html2Text {
 		}

 		$name = strtolower($node->nodeName);
-		$nextName = static::nextChildName($node);
+		$nextName = self::nextChildName($node);

 		// start whitespace
 		switch ($name) {
@@ -302,6 +326,7 @@ class Html2Text {
 				// To fix this, for any p element with a className of `MsoNormal` (the standard
 				// classname in any Microsoft export or outlook for a paragraph that behaves
 				// like a line return) we skip the first line returns and set the name to br.
+				// @phpstan-ignore-next-line
 				if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') {
 					$output = "";
 					$name = 'br';
@@ -341,20 +366,20 @@ class Html2Text {
 		if (isset($node->childNodes)) {

 			$n = $node->childNodes->item(0);
-			$previousSiblingNames = array();
+			$previousSiblingNames = [];
 			$previousSiblingName = null;

-			$parts = array();
+			$parts = [];
 			$trailing_whitespace = 0;

 			while ($n != null) {

-				$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);
+				$text = self::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);

 				// Pass current node name to next child, as previousSibling does not appear to get populated
 				if ($n instanceof \DOMDocumentType
 					|| $n instanceof \DOMProcessingInstruction
-					|| ($n instanceof \DOMText && static::isWhitespace($text))) {
+					|| ($n instanceof \DOMText && self::isWhitespace($text))) {
 					// Keep current previousSiblingName, these are invisible
 					$trailing_whitespace++;
 				}
@@ -411,6 +436,7 @@ class Html2Text {

 			case "a":
 				// links are returned in [text](link) format
+				// @phpstan-ignore-next-line
 				$href = $node->getAttribute("href");

 				$output = trim($output);
@@ -420,18 +446,23 @@ class Html2Text {
 					$output = substr($output, 1, strlen($output) - 2);

 					// for linking images, the title of the <a> overrides the title of the <img>
+					// @phpstan-ignore-next-line
 					if ($node->getAttribute("title")) {
+						// @phpstan-ignore-next-line
 						$output = $node->getAttribute("title");
 					}
 				}

 				// if there is no link text, but a title attr
+				// @phpstan-ignore-next-line
 				if (!$output && $node->getAttribute("title")) {
+					// @phpstan-ignore-next-line
 					$output = $node->getAttribute("title");
 				}

 				if ($href == null) {
 					// it doesn't link anywhere
+					// @phpstan-ignore-next-line
 					if ($node->getAttribute("name") != null) {
 						if ($options['drop_links']) {
 							$output = "$output";
@@ -467,9 +498,13 @@ class Html2Text {
 				break;

 			case "img":
+				// @phpstan-ignore-next-line
 				if ($node->getAttribute("title")) {
+					// @phpstan-ignore-next-line
 					$output = "[" . $node->getAttribute("title") . "]";
+				// @phpstan-ignore-next-line
 				} elseif ($node->getAttribute("alt")) {
+					// @phpstan-ignore-next-line
 					$output = "[" . $node->getAttribute("alt") . "]";
 				} else {
 					$output = "";
@@ -482,7 +517,7 @@ class Html2Text {

 			case "blockquote":
 				// process quoted text for whitespace/newlines
-				$output = static::processWhitespaceNewlines($output);
+				$output = self::processWhitespaceNewlines($output);

 				// add leading newline
 				$output = "\n" . $output;