Support Microsoft encoding of non-breaking space in UTF-8

This commit is contained in:
Eric
2019-11-25 15:34:04 +01:00
parent d367d2e864
commit f90381d412
2 changed files with 168 additions and 1 deletions

View File

@@ -272,6 +272,8 @@ class HTMLDOMSanitizer extends HTMLSanitizer
// Unfortunately, DOMDocument::loadHTML does not take the tag namespaces into account (once loaded there is no way to know if the tag did have a namespace)
// therefore we have to do the transformation upfront
$sHTML = preg_replace('@<o:p>(\s|&nbsp;)*</o:p>@', '<br>', $sHTML);
// Replace raw UTF-8 non-breaking spaces (bytes 0xC2 0xA0) with the &nbsp; entity
$sHTML = preg_replace('~\xc2\xa0~', '&nbsp;', $sHTML);
@$this->oDoc->loadHTML('<?xml encoding="UTF-8"?>'.$sHTML); // For loading HTML chunks where the character set is not specified
@@ -395,4 +397,4 @@ class HTMLDOMSanitizer extends HTMLSanitizer
}
return true;
}
}
}