N°7571 - ⬆️ Bump HTML2Text library version

This commit is contained in:
Eric Espie
2024-06-13 14:45:06 +02:00
parent bfbb046b10
commit 47d65931e6
24 changed files with 2529 additions and 381 deletions

View File

@@ -746,8 +746,6 @@ return array(
'HTMLDOMSanitizer' => $baseDir . '/core/htmlsanitizer.class.inc.php',
'HTMLNullSanitizer' => $baseDir . '/core/htmlsanitizer.class.inc.php',
'HTMLSanitizer' => $baseDir . '/core/htmlsanitizer.class.inc.php',
'Html2Text\\Html2Text' => $baseDir . '/application/Html2Text.php',
'Html2Text\\Html2TextException' => $baseDir . '/application/Html2TextException.php',
'ITopArchiveTar' => $baseDir . '/core/tar-itop.class.inc.php',
'InlineImage' => $baseDir . '/core/inlineimage.class.inc.php',
'InlineImageGC' => $baseDir . '/core/inlineimage.class.inc.php',
@@ -1152,11 +1150,11 @@ return array(
'ModuleHandlerApiInterface' => $baseDir . '/core/modulehandler.class.inc.php',
'MonthlyRotatingLogFileNameBuilder' => $baseDir . '/core/log.class.inc.php',
'MyHelpers' => $baseDir . '/core/MyHelpers.class.inc.php',
'MySQLException' => $baseDir . '/application/exceptions/mysql/MySQLException.php',
'MySQLHasGoneAwayException' => $baseDir . '/application/exceptions/mysql/MySQLHasGoneAwayException.php',
'MySQLNoTransactionException' => $baseDir . '/application/exceptions/mysql/MySQLNoTransactionException.php',
'MySQLQueryHasNoResultException' => $baseDir . '/application/exceptions/mysql/MySQLQueryHasNoResultException.php',
'MySQLTransactionNotClosedException' => $baseDir . '/application/exceptions/mysql/MySQLTransactionNotClosedException.php',
'MySQLException' => $baseDir . '/core/cmdbsource.class.inc.php',
'MySQLHasGoneAwayException' => $baseDir . '/core/cmdbsource.class.inc.php',
'MySQLNoTransactionException' => $baseDir . '/core/cmdbsource.class.inc.php',
'MySQLQueryHasNoResultException' => $baseDir . '/core/cmdbsource.class.inc.php',
'MySQLTransactionNotClosedException' => $baseDir . '/core/cmdbsource.class.inc.php',
'NestedQueryExpression' => $baseDir . '/core/oql/expression.class.inc.php',
'NestedQueryOqlExpression' => $baseDir . '/core/oql/oqlquery.class.inc.php',
'NewObjectMenuNode' => $baseDir . '/application/menunode.class.inc.php',
@@ -1646,6 +1644,8 @@ return array(
'SimpleCryptSodiumEngine' => $baseDir . '/core/simplecrypt.class.inc.php',
'SimpleGraph' => $baseDir . '/core/simplegraph.class.inc.php',
'SimpleGraphException' => $baseDir . '/core/simplegraph.class.inc.php',
'Soundasleep\\Html2Text' => $vendorDir . '/soundasleep/html2text/src/Html2Text.php',
'Soundasleep\\Html2TextException' => $vendorDir . '/soundasleep/html2text/src/Html2TextException.php',
'SpreadsheetBulkExport' => $baseDir . '/core/spreadsheetbulkexport.class.inc.php',
'StimulusChecker' => $baseDir . '/core/userrights.class.inc.php',
'StimulusInternal' => $baseDir . '/core/stimulus.class.inc.php',

View File

@@ -46,6 +46,7 @@ return array(
'Symfony\\Bundle\\FrameworkBundle\\' => array($vendorDir . '/symfony/framework-bundle'),
'Symfony\\Bundle\\DebugBundle\\' => array($vendorDir . '/symfony/debug-bundle'),
'Symfony\\Bridge\\Twig\\' => array($vendorDir . '/symfony/twig-bridge'),
'Soundasleep\\' => array($vendorDir . '/soundasleep/html2text/src'),
'ScssPhp\\ScssPhp\\' => array($vendorDir . '/scssphp/scssphp/src'),
'Sabberworm\\CSS\\' => array($vendorDir . '/sabberworm/php-css-parser/src'),
'Psr\\Log\\' => array($vendorDir . '/psr/log/src'),

View File

@@ -74,6 +74,7 @@ class ComposerStaticInit7f81b4a2a468a061c306af5e447a9a9f
'Symfony\\Bundle\\FrameworkBundle\\' => 31,
'Symfony\\Bundle\\DebugBundle\\' => 27,
'Symfony\\Bridge\\Twig\\' => 20,
'Soundasleep\\' => 12,
'ScssPhp\\ScssPhp\\' => 16,
'Sabberworm\\CSS\\' => 15,
),
@@ -275,6 +276,10 @@ class ComposerStaticInit7f81b4a2a468a061c306af5e447a9a9f
array (
0 => __DIR__ . '/..' . '/symfony/twig-bridge',
),
'Soundasleep\\' =>
array (
0 => __DIR__ . '/..' . '/soundasleep/html2text/src',
),
'ScssPhp\\ScssPhp\\' =>
array (
0 => __DIR__ . '/..' . '/scssphp/scssphp/src',
@@ -1129,8 +1134,6 @@ class ComposerStaticInit7f81b4a2a468a061c306af5e447a9a9f
'HTMLDOMSanitizer' => __DIR__ . '/../..' . '/core/htmlsanitizer.class.inc.php',
'HTMLNullSanitizer' => __DIR__ . '/../..' . '/core/htmlsanitizer.class.inc.php',
'HTMLSanitizer' => __DIR__ . '/../..' . '/core/htmlsanitizer.class.inc.php',
'Html2Text\\Html2Text' => __DIR__ . '/../..' . '/application/Html2Text.php',
'Html2Text\\Html2TextException' => __DIR__ . '/../..' . '/application/Html2TextException.php',
'ITopArchiveTar' => __DIR__ . '/../..' . '/core/tar-itop.class.inc.php',
'InlineImage' => __DIR__ . '/../..' . '/core/inlineimage.class.inc.php',
'InlineImageGC' => __DIR__ . '/../..' . '/core/inlineimage.class.inc.php',
@@ -1535,11 +1538,11 @@ class ComposerStaticInit7f81b4a2a468a061c306af5e447a9a9f
'ModuleHandlerApiInterface' => __DIR__ . '/../..' . '/core/modulehandler.class.inc.php',
'MonthlyRotatingLogFileNameBuilder' => __DIR__ . '/../..' . '/core/log.class.inc.php',
'MyHelpers' => __DIR__ . '/../..' . '/core/MyHelpers.class.inc.php',
'MySQLException' => __DIR__ . '/../..' . '/application/exceptions/mysql/MySQLException.php',
'MySQLHasGoneAwayException' => __DIR__ . '/../..' . '/application/exceptions/mysql/MySQLHasGoneAwayException.php',
'MySQLNoTransactionException' => __DIR__ . '/../..' . '/application/exceptions/mysql/MySQLNoTransactionException.php',
'MySQLQueryHasNoResultException' => __DIR__ . '/../..' . '/application/exceptions/mysql/MySQLQueryHasNoResultException.php',
'MySQLTransactionNotClosedException' => __DIR__ . '/../..' . '/application/exceptions/mysql/MySQLTransactionNotClosedException.php',
'MySQLException' => __DIR__ . '/../..' . '/core/cmdbsource.class.inc.php',
'MySQLHasGoneAwayException' => __DIR__ . '/../..' . '/core/cmdbsource.class.inc.php',
'MySQLNoTransactionException' => __DIR__ . '/../..' . '/core/cmdbsource.class.inc.php',
'MySQLQueryHasNoResultException' => __DIR__ . '/../..' . '/core/cmdbsource.class.inc.php',
'MySQLTransactionNotClosedException' => __DIR__ . '/../..' . '/core/cmdbsource.class.inc.php',
'NestedQueryExpression' => __DIR__ . '/../..' . '/core/oql/expression.class.inc.php',
'NestedQueryOqlExpression' => __DIR__ . '/../..' . '/core/oql/oqlquery.class.inc.php',
'NewObjectMenuNode' => __DIR__ . '/../..' . '/application/menunode.class.inc.php',
@@ -2029,6 +2032,8 @@ class ComposerStaticInit7f81b4a2a468a061c306af5e447a9a9f
'SimpleCryptSodiumEngine' => __DIR__ . '/../..' . '/core/simplecrypt.class.inc.php',
'SimpleGraph' => __DIR__ . '/../..' . '/core/simplegraph.class.inc.php',
'SimpleGraphException' => __DIR__ . '/../..' . '/core/simplegraph.class.inc.php',
'Soundasleep\\Html2Text' => __DIR__ . '/..' . '/soundasleep/html2text/src/Html2Text.php',
'Soundasleep\\Html2TextException' => __DIR__ . '/..' . '/soundasleep/html2text/src/Html2TextException.php',
'SpreadsheetBulkExport' => __DIR__ . '/../..' . '/core/spreadsheetbulkexport.class.inc.php',
'StimulusChecker' => __DIR__ . '/../..' . '/core/userrights.class.inc.php',
'StimulusInternal' => __DIR__ . '/../..' . '/core/stimulus.class.inc.php',

View File

@@ -2128,6 +2128,64 @@
},
"install-path": "../scssphp/scssphp"
},
{
"name": "soundasleep/html2text",
"version": "1.1.0",
"version_normalized": "1.1.0.0",
"source": {
"type": "git",
"url": "https://github.com/soundasleep/html2text.git",
"reference": "3243a7107878a61685d2eccf99918d6479e039fc"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/soundasleep/html2text/zipball/3243a7107878a61685d2eccf99918d6479e039fc",
"reference": "3243a7107878a61685d2eccf99918d6479e039fc",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-libxml": "*",
"php": ">=7.0"
},
"require-dev": {
"phpunit/phpunit": "~7.0",
"soundasleep/component-tests": "~0.2"
},
"time": "2019-02-15T01:44:54+00:00",
"type": "library",
"installation-source": "dist",
"autoload": {
"psr-4": {
"Soundasleep\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Jevon Wright",
"homepage": "https://jevon.org",
"role": "Developer"
}
],
"description": "A PHP script to convert HTML into a plain text format",
"homepage": "https://github.com/soundasleep/html2text",
"keywords": [
"email",
"html",
"php",
"text"
],
"support": {
"email": "support@jevon.org",
"issues": "https://github.com/soundasleep/html2text/issues",
"source": "https://github.com/soundasleep/html2text/tree/master"
},
"install-path": "../soundasleep/html2text"
},
{
"name": "symfony/cache",
"version": "v6.4.2",

View File

@@ -3,7 +3,7 @@
'name' => 'combodo/itop',
'pretty_version' => 'dev-develop',
'version' => 'dev-develop',
'reference' => '0036c70fbd34e0bfbdfdbaf26d497d8e1ed7bd04',
'reference' => 'bfbb046b10aa2fc864d5fbc794df31e488542df4',
'type' => 'project',
'install_path' => __DIR__ . '/../../',
'aliases' => array(),
@@ -22,7 +22,7 @@
'combodo/itop' => array(
'pretty_version' => 'dev-develop',
'version' => 'dev-develop',
'reference' => '0036c70fbd34e0bfbdfdbaf26d497d8e1ed7bd04',
'reference' => 'bfbb046b10aa2fc864d5fbc794df31e488542df4',
'type' => 'project',
'install_path' => __DIR__ . '/../../',
'aliases' => array(),
@@ -359,6 +359,15 @@
'aliases' => array(),
'dev_requirement' => false,
),
'soundasleep/html2text' => array(
'pretty_version' => '1.1.0',
'version' => '1.1.0.0',
'reference' => '3243a7107878a61685d2eccf99918d6479e039fc',
'type' => 'library',
'install_path' => __DIR__ . '/../soundasleep/html2text',
'aliases' => array(),
'dev_requirement' => false,
),
'symfony/cache' => array(
'pretty_version' => 'v6.4.2',
'version' => '6.4.2.0',

View File

@@ -0,0 +1,23 @@
# EditorConfig is awesome: http://EditorConfig.org
# top-most EditorConfig file
root = true
# Unix-style newlines with a newline ending every file
[*]
end_of_line = lf
charset = utf-8
insert_final_newline = true
trim_trailing_whitespace = true
indent_style = tab
indent_size = 4
[*.md]
indent_style = space
indent_size = 2
# don't add newlines to test files
[tests/*]
indent_style = tab
trim_trailing_whitespace = false
insert_final_newline = false

5
lib/soundasleep/html2text/.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
tests/*.output
*.sublime-project
*.sublime-workspace
vendor/
**/*.DS_Store

View File

@@ -0,0 +1,10 @@
language: php
php:
- 7.3
group: stable
before_install:
- composer self-update
install:
- composer install
script:
- ./vendor/bin/phpunit

View File

@@ -0,0 +1,37 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [1.1.0] - 2019-02-15
### Added
- Zero-width non-joiners are now stripped to prevent output issues, similar to non-breaking whitespace
### Fixed
- Fix namespace in composer [#67](https://github.com/soundasleep/html2text/pull/67)
## [1.0.0] - 2019-02-14
### Added
- Added `drop_links` option to render links without the target href [#65](https://github.com/soundasleep/html2text/pull/65)
### Changed
- **Important:** Changed namespace from `\Html2Text\Html2Text` to `\Soundasleep\Html2Text` [#45](https://github.com/soundasleep/html2text/issues/45)
- Treat non-breaking spaces consistently: never include them in output text [#64](https://github.com/soundasleep/html2text/pull/64)
- Second argument to `convert()` is now an array, rather than boolean [#65](https://github.com/soundasleep/html2text/pull/65)
- Optimise/improve newline & whitespace handling [#47](https://github.com/soundasleep/html2text/pull/47)
- Upgrade PHP support to PHP 7.3+
- Upgrade PHPUnit to 7.x
- Re-release project under MIT license [#58](https://github.com/soundasleep/html2text/issues/58)
## [0.5.0] - 2017-04-20
### Added
- Add ignore_error optional argument [#63](https://github.com/soundasleep/html2text/pull/63)
- Blockquote support [#50](https://github.com/soundasleep/html2text/pull/50)
[Unreleased]: https://github.com/soundasleep/html2text/compare/1.1.0...HEAD
[1.1.0]: https://github.com/soundasleep/html2text/compare/1.0.0...1.1.0
[1.0.0]: https://github.com/soundasleep/html2text/compare/0.5.0...1.0.0
[0.5.0]: https://github.com/soundasleep/html2text/compare/0.3.4...0.5.0

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 Jevon Wright
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,101 @@
html2text [![Build Status](https://travis-ci.org/soundasleep/html2text.svg?branch=master)](https://travis-ci.org/soundasleep/html2text) [![Total Downloads](https://poser.pugx.org/soundasleep/html2text/downloads.png)](https://packagist.org/packages/soundasleep/html2text)
=========
html2text is a very simple script that uses DOM methods to convert HTML into a format similar to what would be
rendered by a browser - perfect for places where you need a quick text representation. For example:
```html
<html>
<title>Ignored Title</title>
<body>
<h1>Hello, World!</h1>
<p>This is some e-mail content.
Even though it has whitespace and newlines, the e-mail converter
will handle it correctly.
<p>Even mismatched tags.</p>
<div>A div</div>
<div>Another div</div>
<div>A div<div>within a div</div></div>
<a href="http://foo.com">A link</a>
</body>
</html>
```
Will be converted into:
```text
Hello, World!
This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.
Even mismatched tags.
A div
Another div
A div
within a div
[A link](http://foo.com)
```
See the [original blog post](http://journals.jevon.org/users/jevon-phd/entry/19818) or the related [StackOverflow answer](http://stackoverflow.com/a/2564472/39531).
## Installing
You can use [Composer](http://getcomposer.org/) to add the [package](https://packagist.org/packages/soundasleep/html2text) to your project:
```json
{
"require": {
"soundasleep/html2text": "~1.1"
}
}
```
And then use it quite simply:
```php
$text = \Soundasleep\Html2Text::convert($html);
```
You can also include the supplied `html2text.php` and use `$text = convert_html_to_text($html);` instead.
### Options
| Option | Default | Description |
|--------|---------|-------------|
| **ignore_errors** | `false` | Set to `true` to ignore any XML parsing errors. |
| **drop_links** | `false` | Set to `true` to not render links as `[My Link](http://foo.com)`, but rather just `My Link`. |
Pass along options as a second argument to `convert`, for example:
```php
$options = array(
'ignore_errors' => true,
// other options go here
);
$text = \Soundasleep\Html2Text::convert($html, $options);
```
## Tests
Some very basic tests are provided in the `tests/` directory. Run them with `composer install && vendor/bin/phpunit`.
## Troubleshooting
### Class 'DOMDocument' not found
You need to [install the PHP XML extension](https://github.com/soundasleep/html2text/issues/55) for your PHP version. e.g. `apt-get install php7.1-xml`
## License
`html2text` is [licensed under MIT](LICENSE.md), making it suitable for both Eclipse and GPL projects.
## Other versions
Also see [html2text_ruby](https://github.com/soundasleep/html2text_ruby), a Ruby implementation.

View File

@@ -0,0 +1,32 @@
{
"name": "soundasleep/html2text",
"description": "A PHP script to convert HTML into a plain text format",
"type": "library",
"keywords": [ "php", "html", "text", "email" ],
"homepage": "https://github.com/soundasleep/html2text",
"license": "MIT",
"authors": [
{
"name": "Jevon Wright",
"homepage": "https://jevon.org",
"role": "Developer"
}
],
"autoload": {
"psr-4": {
"Soundasleep\\": "src"
}
},
"support": {
"email": "support@jevon.org"
},
"require": {
"php": ">=7.0",
"ext-dom": "*",
"ext-libxml": "*"
},
"require-dev": {
"phpunit/phpunit": "~7.0",
"soundasleep/component-tests": "~0.2"
}
}

1586
lib/soundasleep/html2text/composer.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,21 @@
<?php
/**
* This file allows you to convert through the command line.
* Usage:
* php -f convert.php [input file]
*/
// Exactly one argument is expected: the path of the HTML file to convert.
if (count($argv) < 2) {
	throw new \InvalidArgumentException("Expected: php -f convert.php [input file]");
}

$input_path = $argv[1];

if (!file_exists($input_path)) {
	throw new \InvalidArgumentException("'" . $input_path . "' does not exist");
}

// file_get_contents() signals failure by returning false rather than throwing
// (e.g. the path is a directory or is not readable). Without this check the
// boolean would be handed to the converter and the script would silently
// print nothing.
$input = file_get_contents($input_path);
if ($input === false) {
	throw new \RuntimeException("'" . $input_path . "' could not be read");
}

// The library classes are loaded by hand so the script works without an autoloader.
require_once(__DIR__ . "/src/Html2Text.php");
require_once(__DIR__ . "/src/Html2TextException.php");

// Write the plain-text rendering to standard output.
echo \Soundasleep\Html2Text::convert($input);

View File

@@ -0,0 +1,16 @@
<?php
/**
* This file is available if you still want to use functions rather than
* autoloading classes.
*/
require_once(__DIR__ . "/src/Html2Text.php");
require_once(__DIR__ . "/src/Html2TextException.php");
/**
 * Procedural wrapper around \Soundasleep\Html2Text::convert().
 *
 * @param string $html the input HTML
 * @param mixed $ignore_error forwarded unchanged as the second argument of convert()
 * @return string whatever convert() returns for the given input
 */
function convert_html_to_text($html, $ignore_error = false) {
	$text = \Soundasleep\Html2Text::convert($html, $ignore_error);

	return $text;
}
/**
 * Procedural wrapper around \Soundasleep\Html2Text::fixNewlines().
 *
 * @param string $text the text to pass through
 * @return string whatever fixNewlines() returns for the given text
 */
function fix_newlines($text) {
	$fixed = \Soundasleep\Html2Text::fixNewlines($text);

	return $fixed;
}

View File

@@ -0,0 +1,8 @@
<phpunit stopOnFailure="true" stopOnError="true" beStrictAboutTestsThatDoNotTestAnything="false">
<testsuites>
<testsuite name="Tests">
<!-- loads all *Test.php -->
<directory>tests</directory>
</testsuite>
</testsuites>
</phpunit>

View File

@@ -0,0 +1,505 @@
<?php
namespace Soundasleep;
class Html2Text {

	/**
	 * The options understood by convert(), with their default values.
	 *
	 * @return array option name => default value
	 */
	public static function defaultOptions() {
		return array(
			'ignore_errors' => false,
			'drop_links' => false,
		);
	}

	/**
	 * Tries to convert the given HTML into a plain text format - best suited for
	 * e-mail display, etc.
	 *
	 * <p>In particular, it tries to maintain the following features:
	 * <ul>
	 *   <li>Links are maintained, with the 'href' copied over
	 *   <li>Information in the &lt;head&gt; is lost
	 * </ul>
	 *
	 * @param string $html the input HTML
	 * @param array|boolean $options map of option name => value (see defaultOptions());
	 *        a bare boolean is still accepted and is treated as the 'ignore_errors' flag
	 * @return string the HTML converted, as best as possible, to text
	 * @throws \InvalidArgumentException if $options contains a key that is not a known option
	 * @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
	 */
	public static function convert($html, $options = array()) {
		if (is_bool($options)) {
			// Using old style (< 1.0) of passing in options
			$options = array('ignore_errors' => $options);
		}

		$defaults = static::defaultOptions();
		$options = array_merge($defaults, $options);

		// check all options are valid; a key lookup is used rather than a loose
		// in_array(), which on PHP 7 treats an integer key such as 0 as equal
		// to any non-numeric option name
		foreach ($options as $key => $value) {
			if (!array_key_exists($key, $defaults)) {
				throw new \InvalidArgumentException("Unknown html2text option '$key'");
			}
		}

		$is_office_document = static::isOfficeDocument($html);

		if ($is_office_document) {
			// remove office namespace
			$html = str_replace(array("<o:p>", "</o:p>"), "", $html);
		}

		$html = static::fixNewlines($html);
		if (mb_detect_encoding($html, "UTF-8", true)) {
			// Turn every non-ASCII character into a numeric character reference
			// before parsing. This replaces the "HTML-ENTITIES" pseudo-encoding
			// of mb_convert_encoding(), which is deprecated as of PHP 8.2.
			$html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), "UTF-8");
		}

		$doc = static::getDocument($html, $options['ignore_errors']);

		$output = static::iterateOverNode($doc, null, false, $is_office_document, $options);

		// process output for whitespace/newlines
		return static::processWhitespaceNewlines($output);
	}

	/**
	 * Unify newlines; in particular, \r\n becomes \n, and
	 * then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
	 * all become \ns.
	 *
	 * @param string $text text with any number of \r, \r\n and \n combinations
	 * @return string the fixed text
	 */
	public static function fixNewlines($text) {
		// replace \r\n to \n
		$text = str_replace("\r\n", "\n", $text);
		// any \r that is left stands alone; turn it into \n as well
		$text = str_replace("\r", "\n", $text);

		return $text;
	}

	/**
	 * Byte sequences that renderText() replaces with a regular space.
	 *
	 * NOTE(review): in a PHP double-quoted string "\u00a0" (without braces) is
	 * not an escape sequence, so the second entry is the literal six characters
	 * backslash-u-0-0-a-0 rather than U+00A0. Kept as-is to preserve behaviour.
	 *
	 * @return array list of strings
	 */
	public static function nbspCodes() {
		return array(
			"\xc2\xa0",
			"\u00a0",
		);
	}

	/**
	 * Byte sequences that renderText() removes entirely.
	 *
	 * NOTE(review): as with nbspCodes(), "\u200c" is a literal six-character
	 * string here, not U+200C. Kept as-is to preserve behaviour.
	 *
	 * @return array list of strings
	 */
	public static function zwnjCodes() {
		return array(
			"\xe2\x80\x8c",
			"\u200c",
		);
	}

	/**
	 * Remove leading or trailing spaces and excess empty lines from provided multiline text
	 *
	 * @param string $text multiline text any number of leading or trailing spaces or excess lines
	 * @return string the fixed text
	 */
	public static function processWhitespaceNewlines($text) {
		// remove excess spaces around tabs
		$text = preg_replace("/ *\t */im", "\t", $text);

		// remove leading whitespace
		$text = ltrim($text);

		// remove leading spaces on each line
		$text = preg_replace("/\n[ \t]*/im", "\n", $text);

		// convert non-breaking spaces to regular spaces to prevent output issues,
		// do it here so they do NOT get removed with other leading spaces, as they
		// are sometimes used for indentation
		$text = static::renderText($text);

		// remove trailing whitespace
		$text = rtrim($text);

		// remove trailing spaces on each line
		$text = preg_replace("/[ \t]*\n/im", "\n", $text);

		// unarmor pre blocks: iterateOverNode() swaps \n for \r inside <pre> so the
		// per-line expressions above leave that text alone; swap them back now
		$text = static::fixNewlines($text);

		// remove unnecessary empty lines
		$text = preg_replace("/\n\n\n*/im", "\n\n", $text);

		return $text;
	}

	/**
	 * Parse HTML into a DOMDocument
	 *
	 * @param string $html the input HTML
	 * @param boolean $ignore_error Ignore xml parsing errors
	 * @return \DOMDocument the parsed document tree
	 * @throws Html2TextException if \DOMDocument::loadHTML() reports failure
	 */
	public static function getDocument($html, $ignore_error = false) {

		$doc = new \DOMDocument();

		$html = trim($html);

		// Compare against the empty string explicitly: a plain truthiness test
		// would also discard the perfectly valid input "0".
		if ($html === '') {
			// DOMDocument doesn't support empty value and throws an error
			// Return empty document instead
			return $doc;
		}

		if ($html[0] !== '<') {
			// If HTML does not begin with a tag, we put a body tag around it.
			// If we do not do this, PHP will insert a paragraph tag around
			// the first block of text for some reason which can mess up
			// the newlines. See pre.html test for an example.
			$html = '<body>' . $html . '</body>';
		}

		if ($ignore_error) {
			$doc->strictErrorChecking = false;
			$doc->recover = true;
			$doc->xmlStandalone = true;
			// collect libxml errors internally while loading, then restore the caller's setting
			$old_internal_errors = libxml_use_internal_errors(true);
			$load_result = $doc->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET | LIBXML_PARSEHUGE);
			libxml_use_internal_errors($old_internal_errors);
		}
		else {
			$load_result = $doc->loadHTML($html);
		}

		if (!$load_result) {
			throw new Html2TextException("Could not load HTML - badly formed?", $html);
		}

		return $doc;
	}

	/**
	 * Can we guess that this HTML is generated by Microsoft Office?
	 *
	 * @param string $html the input HTML
	 * @return boolean true if the Office schema URN appears anywhere in the input
	 */
	public static function isOfficeDocument($html) {
		return strpos($html, "urn:schemas-microsoft-com:office") !== false;
	}

	/**
	 * Replace any special characters with simple text versions, to prevent output issues:
	 * - Convert non-breaking spaces to regular spaces; and
	 * - Convert zero-width non-joiners to '' (nothing).
	 *
	 * This is to match our goal of rendering documents as they would be rendered
	 * by a browser.
	 *
	 * @param string $text the text to clean up
	 * @return string the text with the sequences from nbspCodes() and zwnjCodes() replaced
	 */
	public static function renderText($text) {
		$text = str_replace(static::nbspCodes(), " ", $text);
		$text = str_replace(static::zwnjCodes(), "", $text);
		return $text;
	}

	/**
	 * Is the given text empty once rendered and stripped of newlines, tabs and spaces?
	 *
	 * @param string $text the text to inspect
	 * @return boolean
	 */
	public static function isWhitespace($text) {
		return strlen(trim(static::renderText($text), "\n\r\t ")) === 0;
	}

	/**
	 * Find the lower-cased node name of the next sibling that is either an
	 * element or a text node that is not just whitespace.
	 *
	 * @param \DOMNode $node the node whose following siblings are inspected
	 * @return string|null the sibling's node name, or null if there is no such sibling
	 */
	public static function nextChildName($node) {
		// walk forward until something visible is found
		$nextNode = $node->nextSibling;
		while ($nextNode !== null) {
			if ($nextNode instanceof \DOMElement) {
				break;
			}
			if ($nextNode instanceof \DOMText && !static::isWhitespace($nextNode->wholeText)) {
				break;
			}
			$nextNode = $nextNode->nextSibling;
		}

		if ($nextNode instanceof \DOMElement || $nextNode instanceof \DOMText) {
			return strtolower($nextNode->nodeName);
		}

		return null;
	}

	/**
	 * Recursively render a DOM node, and everything below it, as text.
	 *
	 * Note that child nodes are removed from $node as they are processed, so
	 * the document passed in is emptied by this call.
	 *
	 * @param \DOMNode $node the node to render
	 * @param string|null $prevName lower-cased node name of the previous visible sibling, if any
	 * @param boolean $in_pre true when an ancestor of this node is a <pre> element
	 * @param boolean $is_office_document result of isOfficeDocument() for the source HTML
	 * @param array $options conversion options; only 'drop_links' is read here and a
	 *        missing key is treated as false. The default exists so that no required
	 *        parameter follows optional ones (deprecated as of PHP 8.0).
	 * @return string the text for this node; newlines inside <pre> are returned as \r
	 *         and are restored by processWhitespaceNewlines()
	 */
	public static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false, $options = array()) {
		if ($node instanceof \DOMText) {
			// Replace whitespace characters with a space (equivalent to \s)
			if ($in_pre) {
				$text = "\n" . trim(static::renderText($node->wholeText), "\n\r\t ") . "\n";

				// Remove trailing whitespace only
				$text = preg_replace("/[ \t]*\n/im", "\n", $text);

				// armor newlines with \r.
				return str_replace("\n", "\r", $text);
			}

			$text = static::renderText($node->wholeText);
			$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text);

			if (!static::isWhitespace($text) && ($prevName === 'p' || $prevName === 'div')) {
				return "\n" . $text;
			}
			return $text;
		}

		if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
			// ignore
			return "";
		}

		$name = strtolower($node->nodeName);
		$nextName = static::nextChildName($node);

		// start whitespace
		switch ($name) {
			case "hr":
				$prefix = '';
				if ($prevName !== null) {
					$prefix = "\n";
				}
				return $prefix . "---------------------------------------------------------------\n";

			case "style":
			case "head":
			case "title":
			case "meta":
			case "script":
				// ignore these tags
				return "";

			case "h1":
			case "h2":
			case "h3":
			case "h4":
			case "h5":
			case "h6":
			case "ol":
			case "ul":
			case "pre":
				// add two newlines
				$output = "\n\n";
				break;

			case "td":
			case "th":
				// add tab char to separate table fields
				$output = "\t";
				break;

			case "p":
				// Microsoft exchange emails often include HTML which, when passed through
				// html2text, results in lots of double line returns everywhere.
				//
				// To fix this, for any p element with a className of `MsoNormal` (the standard
				// classname in any Microsoft export or outlook for a paragraph that behaves
				// like a line return) we skip the first line returns and set the name to br.
				if ($is_office_document && $node->getAttribute('class') === 'MsoNormal') {
					$output = "";
					$name = 'br';
					break;
				}

				// add two lines
				$output = "\n\n";
				break;

			case "tr":
				// add one line
				$output = "\n";
				break;

			case "div":
				$output = "";
				if ($prevName !== null) {
					// add one line
					$output .= "\n";
				}
				break;

			case "li":
				$output = "- ";
				break;

			default:
				// print out contents of unknown tags
				$output = "";
				break;
		}

		// debug
		//$output .= "[$name,$nextName]";

		if (isset($node->childNodes)) {

			$n = $node->childNodes->item(0);
			$previousSiblingNames = array();
			$previousSiblingName = null;

			$parts = array();
			$trailing_whitespace = 0;

			while ($n !== null) {

				$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name === 'pre', $is_office_document, $options);

				// Pass current node name to next child, as previousSibling does not appear to get populated
				if ($n instanceof \DOMDocumentType
					|| $n instanceof \DOMProcessingInstruction
					|| ($n instanceof \DOMText && static::isWhitespace($text))) {
					// Keep current previousSiblingName, these are invisible
					$trailing_whitespace++;
				}
				else {
					$previousSiblingName = strtolower($n->nodeName);
					$previousSiblingNames[] = $previousSiblingName;
					$trailing_whitespace = 0;
				}

				// consume the child so that item(0) yields the following one
				$node->removeChild($n);
				$n = $node->childNodes->item(0);

				$parts[] = $text;
			}

			// Remove trailing whitespace, important for the br check below
			while ($trailing_whitespace-- > 0) {
				array_pop($parts);
			}

			// suppress last br tag inside a node list if follows text
			$last_name = array_pop($previousSiblingNames);
			if ($last_name === 'br') {
				$last_name = array_pop($previousSiblingNames);
				if ($last_name === '#text') {
					array_pop($parts);
				}
			}

			$output .= implode('', $parts);
		}

		// end whitespace
		switch ($name) {
			case "h1":
			case "h2":
			case "h3":
			case "h4":
			case "h5":
			case "h6":
			case "pre":
			case "p":
				// add two lines
				$output .= "\n\n";
				break;

			case "br":
				// add one line
				$output .= "\n";
				break;

			case "div":
				break;

			case "a":
				// links are returned in [text](link) format
				$href = $node->getAttribute("href");
				$drop_links = !empty($options['drop_links']);

				$output = trim($output);

				// remove double [[ ]] s from linking images
				if (substr($output, 0, 1) === "[" && substr($output, -1) === "]") {
					$output = substr($output, 1, strlen($output) - 2);

					// for linking images, the title of the <a> overrides the title of the <img>
					if ($node->getAttribute("title")) {
						$output = $node->getAttribute("title");
					}
				}

				// if there is no link text, but a title attr
				// (explicit '' test so that the link text "0" is not mistaken for "no text")
				if ($output === '' && $node->getAttribute("title")) {
					$output = $node->getAttribute("title");
				}

				if ($href === '') {
					// it doesn't link anywhere
					if ($node->getAttribute("name") !== '') {
						if (!$drop_links) {
							$output = "[$output]";
						}
					}
				} else {
					if ($href === $output || $href === "mailto:$output" || $href === "http://$output" || $href === "https://$output") {
						// link to the same address: just use link
					} else {
						// replace it
						if ($output !== '') {
							if (!$drop_links) {
								$output = "[$output]($href)";
							}
						} else {
							// empty string
							$output = $href;
						}
					}
				}

				// does the next node require additional whitespace?
				switch ($nextName) {
					case "h1": case "h2": case "h3": case "h4": case "h5": case "h6":
						$output .= "\n";
						break;
				}
				break;

			case "img":
				if ($node->getAttribute("title")) {
					$output = "[" . $node->getAttribute("title") . "]";
				} elseif ($node->getAttribute("alt")) {
					$output = "[" . $node->getAttribute("alt") . "]";
				} else {
					$output = "";
				}
				break;

			case "li":
				$output .= "\n";
				break;

			case "blockquote":
				// process quoted text for whitespace/newlines
				$output = static::processWhitespaceNewlines($output);

				// add leading newline
				$output = "\n" . $output;

				// prepend '> ' at the beginning of all lines
				$output = preg_replace("/\n/im", "\n> ", $output);

				// replace leading '> >' with '>>'
				$output = preg_replace("/\n> >/im", "\n>>", $output);

				// add another leading newline and trailing newlines
				$output = "\n" . $output . "\n\n";
				break;

			default:
				// do nothing
		}

		return $output;
	}
}

View File

@@ -0,0 +1,14 @@
<?php
namespace Soundasleep;
/**
 * Exception carrying an extra piece of context alongside the usual message.
 */
class Html2TextException extends \Exception {

	/** @var mixed additional detail supplied by whoever raised the exception */
	public $more_info;

	/**
	 * @param string $message the exception message, handed to \Exception
	 * @param mixed $more_info stored untouched in the public $more_info property
	 */
	public function __construct($message = "", $more_info = "") {
		$this->more_info = $more_info;
		parent::__construct($message);
	}
}