mirror of
https://github.com/Combodo/iTop.git
synced 2026-04-23 10:38:45 +02:00
✅ fix unit tests
This commit is contained in:
@@ -4,11 +4,13 @@ namespace Soundasleep;
|
||||
|
||||
class Html2Text {
|
||||
|
||||
/**
 * Build the option set that convert() merges caller-supplied options over.
 *
 * The keys returned here are also the names convert() checks incoming
 * option keys against.
 *
 * @return array<string, bool | string>
 */
public static function defaultOptions(): array {
    return [
        'ignore_errors' => false,
        'drop_links' => false,
        'char_set' => 'auto',
    ];
}
|
||||
|
||||
/**
|
||||
@@ -22,15 +24,15 @@ class Html2Text {
|
||||
* </ul>
|
||||
*
|
||||
* @param string $html the input HTML
|
||||
* @param boolean $ignore_error Ignore xml parsing errors
|
||||
* @param boolean|array<string, bool | string> $options if boolean, Ignore xml parsing errors, else ['ignore_errors' => false, 'drop_links' => false, 'char_set' => 'auto']
|
||||
* @return string the HTML converted, as best as possible, to text
|
||||
* @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
|
||||
*/
|
||||
public static function convert(string $html, $options = []): string {

    // Old style (< 1.0) callers pass a bare boolean meaning 'ignore_errors'.
    if ($options === false || $options === true) {
        $options = ['ignore_errors' => $options];
    }

    // static:: is kept here so a subclass overriding defaultOptions() is honoured.
    $defaults = static::defaultOptions();
    $options = array_merge($defaults, $options);

    // check all options are valid
    // Strict comparison matters: array_merge() renumbers integer keys, and on
    // PHP < 8 a loose in_array(0, [...strings...]) would accept them.
    $valid_keys = array_keys($defaults);
    foreach ($options as $key => $value) {
        if (!in_array($key, $valid_keys, true)) {
            // List the option *names*; joining the default values would print ",,auto".
            throw new \InvalidArgumentException("Unknown html2text option '$key'. Valid options are " . implode(',', $valid_keys));
        }
    }

    $is_office_document = self::isOfficeDocument($html);

    if ($is_office_document) {
        // remove office namespace
        $html = str_replace(["<o:p>", "</o:p>"], "", $html);
    }

    $html = self::fixNewlines($html);

    // use mb_convert_encoding for legacy versions of php
    // (PHP 8.1 and later are handled through the 'char_set' option passed to getDocument())
    if (PHP_MAJOR_VERSION * 10 + PHP_MINOR_VERSION < 81 && mb_detect_encoding($html, "UTF-8", true)) {
        $html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8");
    }

    $doc = self::getDocument($html, $options);

    $output = self::iterateOverNode($doc, null, false, $is_office_document, $options);

    // process output for whitespace/newlines
    $output = self::processWhitespaceNewlines($output);

    return $output;
}
|
||||
@@ -72,7 +76,7 @@ class Html2Text {
|
||||
* @param string $text text with any number of \r, \r\n and \n combinations
|
||||
* @return string the fixed text
|
||||
*/
|
||||
static function fixNewlines($text) {
|
||||
public static function fixNewlines(string $text): string {
|
||||
// replace \r\n to \n
|
||||
$text = str_replace("\r\n", "\n", $text);
|
||||
// remove \rs
|
||||
@@ -81,18 +85,20 @@ class Html2Text {
|
||||
return $text;
|
||||
}
|
||||
|
||||
/**
 * Strings that renderText() replaces with a regular space.
 *
 * @return array<string>
 */
public static function nbspCodes(): array {
    return [
        // UTF-8 byte sequence for U+00A0 (no-break space)
        "\xc2\xa0",
        // NOTE(review): PHP has no "\uXXXX" escape (only "\u{XXXX}"), so this entry is
        // the literal six characters backslash-u-0-0-a-0 — confirm that is intended.
        "\u00a0",
    ];
}
|
||||
|
||||
/**
 * Strings that renderText() removes from the text altogether.
 *
 * @return array<string>
 */
public static function zwnjCodes(): array {
    return [
        // UTF-8 byte sequence for U+200C (zero width non-joiner)
        "\xe2\x80\x8c",
        // NOTE(review): PHP has no "\uXXXX" escape (only "\u{XXXX}"), so this entry is
        // the literal six characters backslash-u-2-0-0-c — confirm that is intended.
        "\u200c",
    ];
}
|
||||
|
||||
/**
|
||||
@@ -101,7 +107,7 @@ class Html2Text {
|
||||
* @param string $text multiline text any number of leading or trailing spaces or excess lines
|
||||
* @return string the fixed text
|
||||
*/
|
||||
static function processWhitespaceNewlines($text) {
|
||||
public static function processWhitespaceNewlines(string $text): string {
|
||||
|
||||
// remove excess spaces around tabs
|
||||
$text = preg_replace("/ *\t */im", "\t", $text);
|
||||
@@ -115,7 +121,7 @@ class Html2Text {
|
||||
// convert non-breaking spaces to regular spaces to prevent output issues,
|
||||
// do it here so they do NOT get removed with other leading spaces, as they
|
||||
// are sometimes used for indentation
|
||||
$text = static::renderText($text);
|
||||
$text = self::renderText($text);
|
||||
|
||||
// remove trailing whitespace
|
||||
$text = rtrim($text);
|
||||
@@ -124,7 +130,7 @@ class Html2Text {
|
||||
$text = preg_replace("/[ \t]*\n/im", "\n", $text);
|
||||
|
||||
// unarmor pre blocks
|
||||
$text = static::fixNewLines($text);
|
||||
$text = self::fixNewLines($text);
|
||||
|
||||
// remove unnecessary empty lines
|
||||
$text = preg_replace("/\n\n\n*/im", "\n\n", $text);
|
||||
@@ -132,14 +138,25 @@ class Html2Text {
|
||||
return $text;
|
||||
}
|
||||
|
||||
/**
 * Can we guess that this HTML is generated by Microsoft Office?
 *
 * The guess is a plain, case-sensitive substring search for the Office
 * namespace URN anywhere in the input.
 *
 * @param string $html the input HTML
 * @return bool true when "urn:schemas-microsoft-com:office" occurs in $html
 */
public static function isOfficeDocument(string $html): bool {
    return strpos($html, "urn:schemas-microsoft-com:office") !== false;
}
|
||||
|
||||
/**
 * Tell whether the text has no visible content.
 *
 * The text is first passed through renderText(); it counts as whitespace when
 * nothing is left after trimming newlines, carriage returns, tabs and spaces.
 *
 * @param string $text the text to inspect
 * @return bool true when only whitespace remains
 */
public static function isWhitespace(string $text): bool {
    return trim(self::renderText($text), "\n\r\t ") === '';
}
|
||||
|
||||
/**
|
||||
* Parse HTML into a DOMDocument
|
||||
*
|
||||
* @param string $html the input HTML
|
||||
* @param boolean $ignore_error Ignore xml parsing errors
|
||||
* @param array<string, bool | string> $options
|
||||
* @return \DOMDocument the parsed document tree
|
||||
*/
|
||||
static function getDocument($html, $ignore_error = false) {
|
||||
private static function getDocument(string $html, array $options): \DOMDocument {
|
||||
|
||||
$doc = new \DOMDocument();
|
||||
|
||||
@@ -159,16 +176,34 @@ class Html2Text {
|
||||
$html = '<body>' . $html . '</body>';
|
||||
}
|
||||
|
||||
if ($ignore_error) {
|
||||
$header = '';
|
||||
// use char sets for modern versions of php
|
||||
if (PHP_MAJOR_VERSION * 10 + PHP_MINOR_VERSION >= 81) {
|
||||
// use specified char_set, or auto detect if not set
|
||||
$char_set = ! empty($options['char_set']) ? $options['char_set'] : 'auto';
|
||||
if ('auto' === $char_set) {
|
||||
$char_set = mb_detect_encoding($html);
|
||||
} else if (strpos($char_set, ',')) {
|
||||
mb_detect_order($char_set);
|
||||
$char_set = mb_detect_encoding($html);
|
||||
}
|
||||
// turn off error detection for Windows-1252 legacy html
|
||||
if (strpos($char_set, '1252')) {
|
||||
$options['ignore_errors'] = true;
|
||||
}
|
||||
$header = '<?xml version="1.0" encoding="' . $char_set . '">';
|
||||
}
|
||||
|
||||
if (! empty($options['ignore_errors'])) {
|
||||
$doc->strictErrorChecking = false;
|
||||
$doc->recover = true;
|
||||
$doc->xmlStandalone = true;
|
||||
$old_internal_errors = libxml_use_internal_errors(true);
|
||||
$load_result = $doc->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET | LIBXML_PARSEHUGE);
|
||||
$load_result = $doc->loadHTML($header . $html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET | LIBXML_PARSEHUGE);
|
||||
libxml_use_internal_errors($old_internal_errors);
|
||||
}
|
||||
else {
|
||||
$load_result = $doc->loadHTML($html);
|
||||
$load_result = $doc->loadHTML($header . $html);
|
||||
}
|
||||
|
||||
if (!$load_result) {
|
||||
@@ -178,13 +213,6 @@ class Html2Text {
|
||||
return $doc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Can we guess that this HTML is generated by Microsoft Office?
|
||||
*/
|
||||
static function isOfficeDocument($html) {
|
||||
return strpos($html, "urn:schemas-microsoft-com:office") !== false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace any special characters with simple text versions, to prevent output issues:
|
||||
* - Convert non-breaking spaces to regular spaces; and
|
||||
@@ -193,22 +221,18 @@ class Html2Text {
|
||||
* This is to match our goal of rendering documents as they would be rendered
|
||||
* by a browser.
|
||||
*/
|
||||
private static function renderText(string $text): string {
    // every entry of nbspCodes() becomes a single regular space
    $text = str_replace(self::nbspCodes(), " ", $text);
    // every entry of zwnjCodes() is removed without a replacement
    $text = str_replace(self::zwnjCodes(), "", $text);
    return $text;
}
|
||||
|
||||
static function isWhitespace($text) {
|
||||
return strlen(trim(static::renderText($text), "\n\r\t ")) === 0;
|
||||
}
|
||||
|
||||
static function nextChildName($node) {
|
||||
private static function nextChildName(?\DOMNode $node): ?string {
|
||||
// get the next child
|
||||
$nextNode = $node->nextSibling;
|
||||
while ($nextNode != null) {
|
||||
if ($nextNode instanceof \DOMText) {
|
||||
if (!static::isWhitespace($nextNode->wholeText)) {
|
||||
if (!self::isWhitespace($nextNode->wholeText)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -228,11 +252,12 @@ class Html2Text {
|
||||
return $nextName;
|
||||
}
|
||||
|
||||
static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false, $options = []) {
|
||||
/** @param array<string, bool | string> $options */
|
||||
private static function iterateOverNode(\DOMNode $node, ?string $prevName, bool $in_pre, bool $is_office_document, array $options): string {
|
||||
if ($node instanceof \DOMText) {
|
||||
// Replace whitespace characters with a space (equivalent to \s)
|
||||
if ($in_pre) {
|
||||
$text = "\n" . trim(static::renderText($node->wholeText), "\n\r\t ") . "\n";
|
||||
$text = "\n" . trim(self::renderText($node->wholeText), "\n\r\t ") . "\n";
|
||||
|
||||
// Remove trailing whitespace only
|
||||
$text = preg_replace("/[ \t]*\n/im", "\n", $text);
|
||||
@@ -240,15 +265,14 @@ class Html2Text {
|
||||
// armor newlines with \r.
|
||||
return str_replace("\n", "\r", $text);
|
||||
|
||||
} else {
|
||||
$text = static::renderText($node->wholeText);
|
||||
$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text);
|
||||
|
||||
if (!static::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
|
||||
return "\n" . $text;
|
||||
}
|
||||
return $text;
|
||||
}
|
||||
$text = self::renderText($node->wholeText);
|
||||
$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text);
|
||||
|
||||
if (!self::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
|
||||
return "\n" . $text;
|
||||
}
|
||||
return $text;
|
||||
}
|
||||
|
||||
if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
|
||||
@@ -257,7 +281,7 @@ class Html2Text {
|
||||
}
|
||||
|
||||
$name = strtolower($node->nodeName);
|
||||
$nextName = static::nextChildName($node);
|
||||
$nextName = self::nextChildName($node);
|
||||
|
||||
// start whitespace
|
||||
switch ($name) {
|
||||
@@ -302,6 +326,7 @@ class Html2Text {
|
||||
// To fix this, for any p element with a className of `MsoNormal` (the standard
|
||||
// classname in any Microsoft export or outlook for a paragraph that behaves
|
||||
// like a line return) we skip the first line returns and set the name to br.
|
||||
// @phpstan-ignore-next-line
|
||||
if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') {
|
||||
$output = "";
|
||||
$name = 'br';
|
||||
@@ -341,20 +366,20 @@ class Html2Text {
|
||||
if (isset($node->childNodes)) {
|
||||
|
||||
$n = $node->childNodes->item(0);
|
||||
$previousSiblingNames = array();
|
||||
$previousSiblingNames = [];
|
||||
$previousSiblingName = null;
|
||||
|
||||
$parts = array();
|
||||
$parts = [];
|
||||
$trailing_whitespace = 0;
|
||||
|
||||
while ($n != null) {
|
||||
|
||||
$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);
|
||||
$text = self::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);
|
||||
|
||||
// Pass current node name to next child, as previousSibling does not appear to get populated
|
||||
if ($n instanceof \DOMDocumentType
|
||||
|| $n instanceof \DOMProcessingInstruction
|
||||
|| ($n instanceof \DOMText && static::isWhitespace($text))) {
|
||||
|| ($n instanceof \DOMText && self::isWhitespace($text))) {
|
||||
// Keep current previousSiblingName, these are invisible
|
||||
$trailing_whitespace++;
|
||||
}
|
||||
@@ -411,6 +436,7 @@ class Html2Text {
|
||||
|
||||
case "a":
|
||||
// links are returned in [text](link) format
|
||||
// @phpstan-ignore-next-line
|
||||
$href = $node->getAttribute("href");
|
||||
|
||||
$output = trim($output);
|
||||
@@ -420,18 +446,23 @@ class Html2Text {
|
||||
$output = substr($output, 1, strlen($output) - 2);
|
||||
|
||||
// for linking images, the title of the <a> overrides the title of the <img>
|
||||
// @phpstan-ignore-next-line
|
||||
if ($node->getAttribute("title")) {
|
||||
// @phpstan-ignore-next-line
|
||||
$output = $node->getAttribute("title");
|
||||
}
|
||||
}
|
||||
|
||||
// if there is no link text, but a title attr
|
||||
// @phpstan-ignore-next-line
|
||||
if (!$output && $node->getAttribute("title")) {
|
||||
// @phpstan-ignore-next-line
|
||||
$output = $node->getAttribute("title");
|
||||
}
|
||||
|
||||
if ($href == null) {
|
||||
// it doesn't link anywhere
|
||||
// @phpstan-ignore-next-line
|
||||
if ($node->getAttribute("name") != null) {
|
||||
if ($options['drop_links']) {
|
||||
$output = "$output";
|
||||
@@ -467,9 +498,13 @@ class Html2Text {
|
||||
break;
|
||||
|
||||
case "img":
|
||||
// @phpstan-ignore-next-line
|
||||
if ($node->getAttribute("title")) {
|
||||
// @phpstan-ignore-next-line
|
||||
$output = "[" . $node->getAttribute("title") . "]";
|
||||
// @phpstan-ignore-next-line
|
||||
} elseif ($node->getAttribute("alt")) {
|
||||
// @phpstan-ignore-next-line
|
||||
$output = "[" . $node->getAttribute("alt") . "]";
|
||||
} else {
|
||||
$output = "";
|
||||
@@ -482,7 +517,7 @@ class Html2Text {
|
||||
|
||||
case "blockquote":
|
||||
// process quoted text for whitespace/newlines
|
||||
$output = static::processWhitespaceNewlines($output);
|
||||
$output = self::processWhitespaceNewlines($output);
|
||||
|
||||
// add leading newline
|
||||
$output = "\n" . $output;
|
||||
|
||||
Reference in New Issue
Block a user