3 namespace andreskrey\Readability
;
5 use andreskrey\Readability\Nodes\DOM\DOMDocument
;
6 use andreskrey\Readability\Nodes\DOM\DOMElement
;
7 use andreskrey\Readability\Nodes\DOM\DOMNode
;
8 use andreskrey\Readability\Nodes\DOM\DOMText
;
9 use andreskrey\Readability\Nodes\NodeUtility
;
10 use Psr\Log\LoggerInterface
;
18 * Main DOMDocument where all the magic happens.
25 * Title of the article.
29 protected $title = null;
32 * Final DOMDocument with the fully parsed HTML.
34 * @var DOMDocument|null
36 protected $content = null;
39 * Excerpt of the article.
43 protected $excerpt = null;
46 * Main image of the article.
50 protected $image = null;
53 * Author of the article. Extracted from the byline tags and other social media properties.
57 protected $author = null;
60 * Direction of the text.
64 protected $direction = null;
67 * Configuration object.
71 private $configuration;
76 * @var LoggerInterface
81 * Collection of attempted text extractions.
85 private $attempts = [];
90 private $defaultTagsToScore = [
105 private $alterToDIVExceptions = [
/**
 * Readability constructor.
 *
 * Keeps the configuration object and takes the logger it provides, which is
 * used for every message emitted while parsing.
 *
 * @param Configuration $configuration
 */
public function __construct(Configuration $configuration)
{
    $this->logger = $configuration->getLogger();
    $this->configuration = $configuration;
}
/**
 * Main parse function.
 *
 * Loads the HTML, reads metadata and the main image, then scores the document.
 * When the extracted text is shorter than the configured word threshold, the
 * HTML is reloaded and scoring is retried with StripUnlikelyCandidates,
 * WeightClasses and CleanConditionally switched off, one per retry (the flags
 * are changed on the configuration object itself). If every retry falls short,
 * the longest attempt is used.
 *
 * @param string $html
 *
 * @throws ParseException when the body is missing/empty or no attempt produced any text
 *
 * @return bool|null true once the article content has been stored
 */
public function parse($html)
{
    $this->logger->info('*** Starting parse process...');

    $this->dom = $this->loadHTML($html);

    // Checking for minimum HTML to work with.
    if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
        $this->logger->emergency('No body tag present or body tag empty');

        throw new ParseException('Invalid or incomplete HTML.');
    }

    $this->getMetadata();

    $this->getMainImage();

    while (true) {
        $root = $root->firstChild;

        $elementsToScore = $this->getNodes($root);
        $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));

        $result = $this->rateNodes($elementsToScore);

        /*
         * Now that we've gone through the full algorithm, check to see if
         * we got any meaningful content. If we didn't, we may need to re-run
         * grabArticle with different flags set. This gives us a higher likelihood of
         * finding the content, and the sieve approach gives us a higher likelihood of
         * finding the -right- content.
         */

        // rateNodes() may hand back a non-object; only measure the text of a real result.
        $length = $result
            ? mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent))
            : 0;

        $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getWordThreshold()));

        $parseSuccessful = true;

        if ($result && $length < $this->configuration->getWordThreshold()) {
            // Start over from a pristine DOM: the previous pass modified the tree.
            $this->dom = $this->loadHTML($html);
            $root = $this->dom->getElementsByTagName('body')->item(0);
            $parseSuccessful = false;

            if ($this->configuration->getStripUnlikelyCandidates()) {
                $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
                $this->configuration->setStripUnlikelyCandidates(false);
                $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
            } elseif ($this->configuration->getWeightClasses()) {
                $this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
                $this->configuration->setWeightClasses(false);
                $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
            } elseif ($this->configuration->getCleanConditionally()) {
                $this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
                $this->configuration->setCleanConditionally(false);
                $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
            } else {
                $this->logger->debug('[Parsing] Threshold not met, searching across attempts for some content.');
                $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];

                // No luck after removing flags, just return the longest text we found during the different loops.
                // The comparator must return an integer (not a bool) to order longest-first reliably.
                usort($this->attempts, function ($a, $b) {
                    return $b['textLength'] - $a['textLength'];
                });

                // But first check if we actually have something
                if (!$this->attempts[0]['textLength']) {
                    $this->logger->emergency('[Parsing] Could not parse text, giving up :(');

                    throw new ParseException('Could not parse text.');
                }

                $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');

                $result = $this->attempts[0]['articleContent'];
                $parseSuccessful = true;
                break;
            }
        } else {
            break;
        }
    }

    if ($parseSuccessful) {
        $result = $this->postProcessContent($result);

        // If we haven't found an excerpt in the article's metadata, use the article's
        // first paragraph as the excerpt. This can be used for displaying a preview of
        // the article's content.
        if (!$this->getExcerpt()) {
            $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
            $paragraphs = $result->getElementsByTagName('p');
            if ($paragraphs->length > 0) {
                $this->setExcerpt(trim($paragraphs->item(0)->textContent));
            }
        }

        $this->setContent($result);

        $this->logger->info('*** Parse successful :)');

        return true;
    }
}
/**
 * Creates a DOM Document object and loads the provided HTML on it.
 *
 * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
 * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
 * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
 * objects and ruining the backup.
 *
 * @param string $html
 *
 * @return DOMDocument
 */
private function loadHTML($html)
{
    $this->logger->debug('[Loading] Loading HTML...');

    // To avoid throwing a gazillion of errors on malformed HTMLs
    libxml_use_internal_errors(true);

    $dom = new DOMDocument('1.0', 'utf-8');

    if (!$this->configuration->getSubstituteEntities()) {
        // Keep the original HTML entities
        $dom->substituteEntities = false;
    }

    if ($this->configuration->getNormalizeEntities()) {
        $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
        // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
        $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
    }

    if ($this->configuration->getSummonCthulhu()) {
        $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
        // Case-insensitive so <SCRIPT> blocks are stripped as well.
        $stripped = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/i', '', $html);
        // preg_replace() returns null on a PCRE failure (e.g. backtrack limit);
        // keep the untouched HTML rather than loading an empty document.
        if ($stripped !== null) {
            $html = $stripped;
        }
    }

    // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
    $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
    $dom->encoding = 'UTF-8';

    // Errors are only buffered, never read here; drop them so the buffer does
    // not grow with every (re)load.
    libxml_clear_errors();

    $this->removeScripts($dom);

    $this->prepDocument($dom);

    $this->logger->debug('[Loading] Loaded HTML successfully.');

    return $dom;
}
/**
 * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties.
 *
 * Reads every <meta> tag: an "author" name/property sets the author, while
 * description/title/image values (plain, twitter: or og:) are collected and
 * then used for the excerpt, the title fallback and the main image.
 */
private function getMetadata()
{
    $this->logger->debug('[Metadata] Retrieving metadata...');

    $values = [];

    // Match "description", or Twitter's "twitter:description" (Cards)
    // in name attribute.
    $namePattern = '/^\s*((twitter)\s*:\s*)?(description|title|image)\s*$/i';

    // Match Facebook's Open Graph title & description properties.
    $propertyPattern = '/^\s*og\s*:\s*(description|title|image)\s*$/i';

    foreach ($this->dom->getElementsByTagName('meta') as $meta) {
        /* @var DOMNode $meta */
        $elementName = $meta->getAttribute('name');
        $elementProperty = $meta->getAttribute('property');

        if (in_array('author', [$elementName, $elementProperty])) {
            $this->logger->info(sprintf('[Metadata] Found author: \'%s\'', $meta->getAttribute('content')));
            $this->setAuthor($meta->getAttribute('content'));
            continue;
        }

        // The name attribute takes precedence over the property attribute.
        if (preg_match($namePattern, $elementName)) {
            $name = $elementName;
        } elseif (preg_match($propertyPattern, $elementProperty)) {
            $name = $elementProperty;
        } else {
            $name = null;
        }

        if (!$name) {
            continue;
        }

        $content = $meta->getAttribute('content');
        if (!$content) {
            continue;
        }

        // Convert to lowercase and remove any whitespace
        // so we can match below.
        $name = preg_replace('/\s/', '', strtolower($name));
        $values[$name] = trim($content);
    }

    // Excerpt: plain description first, then Open Graph, then Twitter cards.
    if (isset($values['description'])) {
        $this->logger->info(sprintf('[Metadata] Found excerpt in \'description\' tag: \'%s\'', $values['description']));
        $this->setExcerpt($values['description']);
    } elseif (isset($values['og:description'])) {
        // Use facebook open graph description.
        $this->logger->info(sprintf('[Metadata] Found excerpt in \'og:description\' tag: \'%s\'', $values['og:description']));
        $this->setExcerpt($values['og:description']);
    } elseif (isset($values['twitter:description'])) {
        // Use twitter cards description.
        $this->logger->info(sprintf('[Metadata] Found excerpt in \'twitter:description\' tag: \'%s\'', $values['twitter:description']));
        $this->setExcerpt($values['twitter:description']);
    }

    $this->setTitle($this->getArticleTitle());

    // Only fall back to social-media titles when no title could be derived.
    if (!$this->getTitle()) {
        if (isset($values['og:title'])) {
            // Use facebook open graph title.
            $this->logger->info(sprintf('[Metadata] Found title in \'og:title\' tag: \'%s\'', $values['og:title']));
            $this->setTitle($values['og:title']);
        } elseif (isset($values['twitter:title'])) {
            // Use twitter cards title.
            $this->logger->info(sprintf('[Metadata] Found title in \'twitter:title\' tag: \'%s\'', $values['twitter:title']));
            $this->setTitle($values['twitter:title']);
        }
    }

    // Main image: Open Graph wins over Twitter cards.
    if (isset($values['og:image'])) {
        $this->logger->info(sprintf('[Metadata] Found main image in \'og:image\' tag: \'%s\'', $values['og:image']));
        $this->setImage($values['og:image']);
    } elseif (isset($values['twitter:image'])) {
        $this->logger->info(sprintf('[Metadata] Found main image in \'twitter:image\' tag: \'%s\'', $values['twitter:image']));
        $this->setImage($values['twitter:image']);
    }
}
/**
 * Returns all the images of the parsed article.
 *
 * The main image (when set) comes first, followed by the src of every <img>
 * in the parsed document. Empty values and duplicates are removed and the
 * result is re-indexed so it is always a sequential list.
 *
 * @return array list of image URLs
 */
public function getImages()
{
    $result = [];
    if ($this->getImage()) {
        $result[] = $this->getImage();
    }

    if (null == $this->getDOMDocument()) {
        return $result;
    }

    foreach ($this->getDOMDocument()->getElementsByTagName('img') as $img) {
        if ($src = $img->getAttribute('src')) {
            $result[] = $src;
        }
    }

    if ($this->configuration->getFixRelativeURLs()) {
        // Indexed assignment instead of a by-reference loop, so no reference
        // to the last element is left dangling.
        foreach ($result as $index => $imgSrc) {
            $result[$index] = $this->toAbsoluteURI($imgSrc);
        }
    }

    // array_filter()/array_unique() preserve keys; re-index to avoid gaps.
    return array_values(array_unique(array_filter($result)));
}
/**
 * Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't
 * find a correct image.
 *
 * Falls back to the href of the first <link rel="img_src"> or
 * <link rel="image_src"> tag. The resulting URL is made absolute when
 * FixRelativeURLs is enabled and stored as-is otherwise.
 */
public function getMainImage()
{
    $imgUrl = false;

    if ($this->getImage() !== null) {
        $imgUrl = $this->getImage();
    }

    if (!$imgUrl) {
        foreach ($this->dom->getElementsByTagName('link') as $link) {
            /** @var \DOMElement $link */
            /*
             * Check for the rel attribute, then check if the rel attribute is either img_src or image_src, and
             * finally check for the existence of the href attribute, which should hold the image url.
             */
            if ($link->hasAttribute('rel') && ($link->getAttribute('rel') === 'img_src' || $link->getAttribute('rel') === 'image_src') && $link->hasAttribute('href')) {
                $imgUrl = $link->getAttribute('href');
                break;
            }
        }
    }

    if (empty($imgUrl)) {
        return;
    }

    // Store the image even when URL fixing is disabled; otherwise an image
    // discovered through a <link> tag would be silently thrown away.
    if ($this->configuration->getFixRelativeURLs()) {
        $imgUrl = $this->toAbsoluteURI($imgUrl);
    }

    $this->setImage($imgUrl);
}
/**
 * Returns the title of the html. Prioritizes the title from the metadata against the title tag.
 *
 * The raw title is then cleaned up: a trailing (or, if that leaves too little,
 * leading) part split off by a separator is removed, a "prefix: title" form is
 * reduced unless an H1/H2 carries the exact same text, and very long or very
 * short titles are replaced by the only H1 of the document when there is one.
 * If the cleaned title ends up too short, the raw title is used instead.
 *
 * @return string|null null when neither a stored title nor a <title> tag exists
 */
private function getArticleTitle()
{
    $originalTitle = null;

    if ($this->getTitle()) {
        $originalTitle = $this->getTitle();
    } else {
        $this->logger->debug('[Metadata] Could not find title in metadata, searching for the title tag...');
        $titleTag = $this->dom->getElementsByTagName('title');
        if ($titleTag->length > 0) {
            $this->logger->info(sprintf('[Metadata] Using title tag as article title: \'%s\'', $titleTag->item(0)->nodeValue));
            $originalTitle = $titleTag->item(0)->nodeValue;
        }
    }

    if ($originalTitle === null) {
        return null;
    }

    $curTitle = $originalTitle;
    $titleHadHierarchicalSeparators = false;

    /*
     * If there's a separator in the title, first remove the final part.
     *
     * The patterns carry the "u" modifier because "»" is a multibyte character:
     * without it the character class matches single bytes and the truncated
     * title can end in half a character.
     */
    if (preg_match('/ [\|\-\\\\\/>»] /iu', $curTitle)) {
        $titleHadHierarchicalSeparators = (bool)preg_match('/ [\\\\\/>»] /u', $curTitle);
        $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/iu', '$1', $originalTitle);

        $this->logger->info(sprintf('[Metadata] Found hierarchical separators in title, new title is: \'%s\'', $curTitle));

        // If the resulting title is too short (fewer than 3 words), remove
        // the first part instead:
        if (count(preg_split('/\s+/', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/iu', '$1', $originalTitle);
            $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
        }
    } elseif (strpos($curTitle, ': ') !== false) {
        // Check if we have an heading containing this exact string, so we
        // could assume it's the full title.
        $match = false;
        for ($i = 1; $i <= 2 && !$match; $i++) {
            foreach ($this->dom->getElementsByTagName('h' . $i) as $hTag) {
                // Trim texts to avoid having false negatives when the title is surrounded by spaces or tabs
                if (trim($hTag->nodeValue) === trim($curTitle)) {
                    $match = true;
                    break;
                }
            }
        }

        // If we don't, let's extract the title out of the original title string.
        if (!$match) {
            $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);

            $this->logger->info(sprintf('[Metadata] Title has a colon in the middle, new title is: \'%s\'', $curTitle));

            // If the title is now too short, try the first colon instead:
            if (count(preg_split('/\s+/', $curTitle)) < 3) {
                $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
                $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
            } elseif (count(preg_split('/\s+/', substr($originalTitle, 0, strpos($originalTitle, ':')))) > 5) {
                // But if we have too many words before the colon there's something weird
                // with the titles and the H tags so let's just use the original title instead.
                // The prefix is taken from the original title: $curTitle no longer
                // contains a colon at this point, so testing it could never match.
                $curTitle = $originalTitle;
            }
        }
    } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
        $hOnes = $this->dom->getElementsByTagName('h1');

        if ($hOnes->length === 1) {
            $curTitle = $hOnes->item(0)->nodeValue;
            $this->logger->info(sprintf('[Metadata] Using title from an H1 node: \'%s\'', $curTitle));
        }
    }

    $curTitle = trim($curTitle);

    /*
     * If we now have 4 words or fewer as our title, and either no
     * 'hierarchical' separators (\, /, > or ») were found in the original
     * title or we decreased the number of words by more than 1 word, use
     * the original title.
     */
    $curTitleWordCount = count(preg_split('/\s+/', $curTitle));
    $originalTitleWordCount = count(preg_split('/\s+/', (string)preg_replace('/[\|\-\\\\\/>»]+/u', '', $originalTitle))) - 1;

    if ($curTitleWordCount <= 4 &&
        (!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) {
        $curTitle = $originalTitle;

        $this->logger->info(sprintf('[Metadata] Cleaned title is too short, using the original title: \'%s\'', $curTitle));
    }

    return $curTitle;
}
/**
 * Convert URI to an absolute URI.
 *
 * Resolution is based on the path information of the configured original URL.
 * Absolute URIs and pure fragment references are returned untouched.
 *
 * @param $uri string URI to convert
 *
 * @return string
 */
private function toAbsoluteURI($uri)
{
    $pathInfo = $this->getPathInfo($this->configuration->getOriginalURL());
    list($pathBase, $scheme, $prePath) = $pathInfo;

    // Already absolute (starts with "scheme:"): nothing to do.
    $isAbsolute = preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/', $uri);
    if ($isAbsolute) {
        return $uri;
    }

    // "//host/path": only the scheme is missing.
    if (strpos($uri, '//') === 0) {
        return $scheme . '://' . substr($uri, 2);
    }

    // "/path": rooted at the host.
    if (strpos($uri, '/') === 0) {
        return $prePath . $uri;
    }

    // "./path": relative to the current directory.
    if (substr($uri, 0, 2) === './') {
        return $pathBase . substr($uri, 2);
    }

    // "#fragment": leave hash references alone.
    if (strpos($uri, '#') === 0) {
        return $uri;
    }

    // Standard relative URI; add entire path. pathBase already includes a
    // trailing "/".
    return $pathBase . $uri;
}
/**
 * Returns full path info of an URL.
 *
 * The directory of the URL is used as base path, optionally combined with the
 * base URI reported by the loaded document. An explicit port in the URL is
 * kept in both the base path and the pre-path.
 *
 * @param string $url
 *
 * @return array [$pathBase, $scheme, $prePath]
 */
public function getPathInfo($url)
{
    $port = parse_url($url, PHP_URL_PORT);
    $origin = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . ($port ? ':' . $port : '');

    // dirname() returns "/" (or "\" on Windows) for root-level paths; strip it so
    // that appending "/" below never produces a double slash.
    $directory = rtrim(dirname((string)parse_url($url, PHP_URL_PATH)), '/\\');

    // Check for base URLs
    if ($this->dom->baseURI !== null) {
        if (substr($this->dom->baseURI, 0, 1) === '/') {
            // URLs starting with '/' override completely the URL defined in the link
            $pathBase = $origin . $this->dom->baseURI;
        } else {
            // Otherwise just prepend the base to the actual path
            $pathBase = $origin . $directory . '/' . rtrim($this->dom->baseURI, '/') . '/';
        }
    } else {
        $pathBase = $origin . $directory . '/';
    }

    $scheme = parse_url($pathBase, PHP_URL_SCHEME);
    $basePort = parse_url($pathBase, PHP_URL_PORT);
    $prePath = $scheme . '://' . parse_url($pathBase, PHP_URL_HOST) . ($basePort ? ':' . $basePort : '');

    return [$pathBase, $scheme, $prePath];
}
/**
 * Gets nodes from the root element.
 *
 * Walks the tree starting at $node. Comment nodes, bylines, unlikely
 * candidates (when enabled) and empty containers are removed on the way, DIVs
 * used as paragraphs are normalised, and every node whose tag is listed in
 * $defaultTagsToScore is collected.
 *
 * @param $node DOMNode|DOMText
 *
 * @return array nodes to be scored
 */
private function getNodes($node)
{
    $this->logger->info('[Get Nodes] Retrieving nodes...');

    $stripUnlikelyCandidates = $this->configuration->getStripUnlikelyCandidates();

    $elementsToScore = [];

    // Tags that are dropped when they hold no content (e.g. text, image, video, or iframe).
    $removableWhenEmpty = ['div', 'section', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'];

    // Short excerpt of a node for log messages. Character-based so that a
    // multibyte character is never cut in half.
    $preview = static function ($target) {
        return mb_substr($target->nodeValue, 0, 128, 'UTF-8');
    };

    /*
     * First, node prepping. Trash nodes that look cruddy (like ones with the
     * class name "comment", etc), and turn divs into P tags where they have been
     * used inappropriately (as in, where they contain no other block level elements.)
     */

    while ($node) {
        $matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');

        // Remove DOMComments nodes as we don't need them and mess up children counting
        if ($node->nodeType === XML_COMMENT_NODE) {
            $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', $preview($node)));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Check to see if this node is a byline, and remove it if it is.
        if ($this->checkByline($node, $matchString)) {
            $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', $preview($node)));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Remove unlikely candidates
        if ($stripUnlikelyCandidates) {
            if (
                preg_match(NodeUtility::$regexps['unlikelyCandidates'], $matchString) &&
                !preg_match(NodeUtility::$regexps['okMaybeItsACandidate'], $matchString) &&
                $node->nodeName !== 'body' &&
                $node->nodeName !== 'a'
            ) {
                $this->logger->debug(sprintf('[Get Nodes] Removing unlikely candidate. Node content was: \'%s\'', $preview($node)));
                $node = NodeUtility::removeAndGetNext($node);
                continue;
            }
        }

        // Remove DIV, SECTION, HEADER, heading and P nodes without any content.
        if (in_array($node->nodeName, $removableWhenEmpty, true) && $node->isElementWithoutContent()) {
            $this->logger->debug(sprintf('[Get Nodes] Removing empty \'%s\' node.', $node->nodeName));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        if (in_array(strtolower($node->nodeName), $this->defaultTagsToScore)) {
            $this->logger->debug(sprintf('[Get Nodes] Adding node to score list, node content is: \'%s\'', $preview($node)));
            $elementsToScore[] = $node;
        }

        // Turn all divs that don't have children block level elements into p's
        if ($node->nodeName === 'div') {
            /*
             * Sites like http://mobile.slate.com encloses each paragraph with a DIV
             * element. DIVs with only a P element inside and no text content can be
             * safely converted into plain P elements to avoid confusing the scoring
             * algorithm with DIVs with are, in practice, paragraphs.
             */
            if ($node->hasSinglePNode()) {
                $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', $preview($node)));
                $pNode = $node->getChildren(true)[0];
                $node->parentNode->replaceChild($pNode, $node);
                $node = $pNode;
                $elementsToScore[] = $node;
            } elseif (!$node->hasSingleChildBlockElement()) {
                // This branch runs when the check above is false, so the message says "without".
                $this->logger->debug(sprintf('[Get Nodes] Found DIV without a single child block element, converting to a P node. Node content is: \'%s\'', $preview($node)));
                $node = NodeUtility::setNodeTag($node, 'p');
                $elementsToScore[] = $node;
            } else {
                // EXPERIMENTAL
                foreach ($node->getChildren() as $child) {
                    /** @var $child DOMNode */
                    if ($child->nodeType === XML_TEXT_NODE && mb_strlen(trim($child->getTextContent())) > 0) {
                        $this->logger->debug(sprintf('[Get Nodes] Found DIV a text node inside, converting to a P node. Node content is: \'%s\'', $preview($node)));
                        $newNode = $node->createNode($child, 'p');
                        $child->parentNode->replaceChild($newNode, $child);
                    }
                }
            }
        }

        $node = NodeUtility::getNextNode($node);
    }

    return $elementsToScore;
}
/**
 * Checks if the node is a byline.
 *
 * A node qualifies when it is marked with rel="author" or its class/id string
 * matches the byline pattern, and its text passes isValidByline(). On a
 * match the trimmed text is stored as the author. Nothing is checked when
 * byline detection is disabled or an author is already known.
 *
 * @param DOMNode $node
 * @param string $matchString class and id of the node, joined by a space
 *
 * @return bool
 */
private function checkByline($node, $matchString)
{
    if (!$this->configuration->getArticleByLine()) {
        return false;
    }

    /*
     * Check if the byline is already set
     */
    if ($this->getAuthor()) {
        return false;
    }

    $rel = $node->getAttribute('rel');

    // The validity check applies to both ways of spotting a byline. Without the
    // parentheses "&&" binds tighter than "||" and any rel="author" node was
    // accepted regardless of its text.
    $looksLikeByline = $rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString);

    if ($looksLikeByline && $this->isValidByline($node->getTextContent())) {
        $this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $node->getTextContent()));
        $this->setAuthor(trim($node->getTextContent()));

        return true;
    }

    return false;
}
/**
 * Checks the validity of a byLine. Based on string length.
 *
 * A byline is valid when, once surrounding whitespace is removed, it is not
 * empty and shorter than 100 characters.
 *
 * @param string $text
 *
 * @return bool
 */
private function isValidByline($text)
{
    if (!is_string($text)) {
        return false;
    }

    // Both bounds are measured on the trimmed text, so padding whitespace
    // cannot push a short byline over the limit.
    $length = mb_strlen(trim($text));

    return $length > 0 && $length < 100;
}
/**
 * Removes all the scripts of the html.
 *
 * Deletes every <script> and <noscript> element from the given document.
 *
 * @param DOMDocument $dom
 */
private function removeScripts(DOMDocument $dom)
{
    foreach (['script', 'noscript'] as $tagName) {
        // Query again after each removal and always take the first match,
        // until none is left.
        while (true) {
            $matches = $dom->getElementsByTagName($tagName);
            if (!$matches) {
                break;
            }

            $first = $matches->item(0);
            if (!$first) {
                break;
            }

            $first->parentNode->removeChild($matches->item(0));
        }
    }
}
/**
 * Prepares the document for parsing.
 *
 * Collapses chains of <br> elements into <p> elements that wrap the content
 * following them, and turns every <font> tag into a <span>.
 *
 * @param DOMDocument $dom
 */
private function prepDocument(DOMDocument $dom)
{
    $this->logger->info('[PrepDocument] Preparing document for parsing...');

    /*
     * DOMNodeList must be converted to an array before looping over it.
     * This is done to avoid node shifting when removing nodes.
     *
     * Reverse traversing cannot be done here because we need to find brs that are right next to other brs.
     * (If we go the other way around we need to search for previous nodes forcing the creation of new functions
     * that will be used only here)
     */
    $lineBreaks = iterator_to_array($dom->getElementsByTagName('br'));

    foreach ($lineBreaks as $lineBreak) {
        // Whether 2 or more <br> elements have been found and replaced with a <p> block.
        $chainRemoved = false;

        /*
         * If we find a <br> chain, remove the <br>s until we hit another element
         * or non-whitespace. This leaves behind the first <br> in the chain
         * (which will be replaced with a <p> later).
         */
        $cursor = NodeUtility::nextElement($lineBreak->nextSibling);
        while ($cursor && $cursor->nodeName === 'br') {
            $this->logger->debug('[PrepDocument] Removing chain of BR nodes...');

            $chainRemoved = true;
            $afterRemoved = $cursor->nextSibling;
            $cursor->parentNode->removeChild($cursor);
            $cursor = NodeUtility::nextElement($afterRemoved);
        }

        if (!$chainRemoved) {
            continue;
        }

        /*
         * A <br> chain was removed: replace the remaining <br> with a <p> and move
         * all following siblings into it until another <br> chain is reached.
         */
        $paragraph = $dom->createElement('p');
        $lineBreak->parentNode->replaceChild($paragraph, $lineBreak);

        $cursor = $paragraph->nextSibling;
        while ($cursor) {
            // If we've hit another <br><br>, we're done adding children to this <p>.
            if ($cursor->nodeName === 'br') {
                $upcoming = NodeUtility::nextElement($cursor);
                if ($upcoming && $upcoming->nodeName === 'br') {
                    break;
                }
            }

            $this->logger->debug('[PrepDocument] Replacing BR with a P node...');

            // Otherwise, make this node a child of the new <p>. The following
            // sibling is remembered first because moving the node changes it.
            $following = $cursor->nextSibling;
            $paragraph->appendChild($cursor);
            $cursor = $following;
        }
    }

    // Replace font tags with span, walking the list from its end to its start.
    $fontTags = $dom->getElementsByTagName('font');
    $total = $fontTags->length;
    for ($position = $total - 1; $position >= 0; $position--) {
        $this->logger->debug('[PrepDocument] Converting font tag into a span tag.');
        NodeUtility::setNodeTag($fontTags->item($position), 'span', true);
    }
}
861 * Assign scores to each node. Returns full article parsed or false on error.
863 * @param array $nodes
865 * @return DOMDocument|bool
867 private function rateNodes($nodes)
869 $this->logger
->info('[Rating] Rating nodes...');
873 /** @var DOMElement $node */
874 foreach ($nodes as $node) {
875 if (is_null($node->parentNode
)) {
879 // Discard nodes with less than 25 characters, without blank space
880 if (mb_strlen($node->getTextContent(true)) < 25) {
884 $ancestors = $node->getNodeAncestors();
886 // Exclude nodes with no ancestor
887 if (count($ancestors) === 0) {
891 // Start with a point for the paragraph itself as a base.
894 // Add points for any commas within this paragraph.
895 $contentScore +
= count(explode(',', $node->getTextContent(true)));
897 // For every 100 characters in this paragraph, add another point. Up to 3 points.
898 $contentScore +
= min(floor(mb_strlen($node->getTextContent(true)) / 100), 3);
900 $this->logger
->debug(sprintf('[Rating] Node score %s, content: \'%s\'', $contentScore, substr($node->nodeValue
, 0, 128)));
902 /** @var $ancestor DOMElement */
903 foreach ($ancestors as $level => $ancestor) {
904 $this->logger
->debug('[Rating] Found ancestor, initializing and adding it as a candidate...');
905 if (!$ancestor->isInitialized()) {
906 $ancestor->initializeNode($this->configuration
->getWeightClasses());
907 $candidates[] = $ancestor;
911 * Node score divider:
912 * - parent: 1 (no division)
914 * - great grandparent+: ancestor level * 3
919 } elseif ($level === 1) {
922 $scoreDivider = $level * 3;
925 $currentScore = $ancestor->contentScore
;
926 $ancestor->contentScore
= $currentScore +
($contentScore / $scoreDivider);
928 $this->logger
->debug(sprintf('[Rating] Ancestor score %s, value: \'%s\'', $ancestor->contentScore
, substr($ancestor->nodeValue
, 0, 128)));
933 * After we've calculated scores, loop through all of the possible
934 * candidate nodes we found and find the one with the highest score.
938 foreach ($candidates as $candidate) {
941 * Scale the final candidates score based on link density. Good content
942 * should have a relatively small link density (5% or less) and be mostly
943 * unaffected by this operation.
946 $candidate->contentScore
= $candidate->contentScore
* (1 - $candidate->getLinkDensity());
948 for ($i = 0; $i < $this->configuration
->getMaxTopCandidates(); $i++
) {
949 $aTopCandidate = isset($topCandidates[$i]) ?
$topCandidates[$i] : null;
951 if (!$aTopCandidate ||
$candidate->contentScore
> $aTopCandidate->contentScore
) {
952 array_splice($topCandidates, $i, 0, [$candidate]);
953 if (count($topCandidates) > $this->configuration
->getMaxTopCandidates()) {
954 array_pop($topCandidates);
961 $topCandidate = isset($topCandidates[0]) ?
$topCandidates[0] : null;
962 $parentOfTopCandidate = null;
965 * If we still have no top candidate, just use the body as a last resort.
966 * We also have to copy the body node so it is something we can modify.
969 if ($topCandidate === null ||
$topCandidate->nodeName
=== 'body') {
970 $this->logger
->info('[Rating] No top candidate found or top candidate is the body tag. Moving all child nodes to a new DIV node.');
972 // Move all of the page's children into topCandidate
973 $topCandidate = new DOMDocument('1.0', 'utf-8');
974 $topCandidate->encoding
= 'UTF-8';
975 $topCandidate->appendChild($topCandidate->createElement('div', ''));
976 $kids = $this->dom
->getElementsByTagName('body')->item(0)->childNodes
;
978 // Cannot be foreached, don't ask me why.
979 for ($i = 0; $i < $kids->length
; $i++
) {
980 $import = $topCandidate->importNode($kids->item($i), true);
981 $topCandidate->firstChild
->appendChild($import);
984 // Candidate must be created using firstChild to grab the DOMElement instead of the DOMDocument.
985 $topCandidate = $topCandidate->firstChild
;
986 } elseif ($topCandidate) {
987 $this->logger
->info(sprintf('[Rating] Found top candidate, score: %s', $topCandidate->contentScore
));
988 // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
989 // and whose scores are quite closed with current `topCandidate` node.
990 $alternativeCandidateAncestors = [];
991 for ($i = 1; $i < count($topCandidates); $i++
) {
992 if ($topCandidates[$i]->contentScore
/ $topCandidate->contentScore
>= 0.75) {
993 array_push($alternativeCandidateAncestors, $topCandidates[$i]->getNodeAncestors(false));
997 $MINIMUM_TOPCANDIDATES = 3;
998 if (count($alternativeCandidateAncestors) >= $MINIMUM_TOPCANDIDATES) {
999 $parentOfTopCandidate = $topCandidate->parentNode
;
1000 while ($parentOfTopCandidate->nodeName
!== 'body') {
1001 $listsContainingThisAncestor = 0;
1002 for ($ancestorIndex = 0; $ancestorIndex < count($alternativeCandidateAncestors) && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++
) {
1003 $listsContainingThisAncestor +
= (int)in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex]);
1005 if ($listsContainingThisAncestor >= $MINIMUM_TOPCANDIDATES) {
1006 $topCandidate = $parentOfTopCandidate;
1009 $parentOfTopCandidate = $parentOfTopCandidate->parentNode
;
1014 * Because of our bonus system, parents of candidates might have scores
1015 * themselves. They get half of the node. There won't be nodes with higher
1016 * scores than our topCandidate, but if we see the score going *up* in the first
1017 * few steps up the tree, that's a decent sign that there might be more content
1018 * lurking in other places that we want to unify in. The sibling stuff
1019 * below does some of that - but only if we've looked high enough up the DOM
1023 $parentOfTopCandidate = $topCandidate->parentNode
;
1024 $lastScore = $topCandidate->contentScore
;
1026 // The scores shouldn't get too low.
1027 $scoreThreshold = $lastScore / 3;
1029 /* @var DOMElement $parentOfTopCandidate */
1030 // Check if we are actually dealing with a DOMNode and not a DOMDocument node or higher
1031 while ($parentOfTopCandidate->nodeName
!== 'body' && $parentOfTopCandidate->nodeType
=== XML_ELEMENT_NODE
) {
1032 $parentScore = $parentOfTopCandidate->contentScore
;
1033 if ($parentScore < $scoreThreshold) {
1037 if ($parentScore > $lastScore) {
1038 // Alright! We found a better parent to use.
1039 $topCandidate = $parentOfTopCandidate;
1040 $this->logger
->info('[Rating] Found a better top candidate.');
1043 $lastScore = $parentOfTopCandidate->contentScore
;
1044 $parentOfTopCandidate = $parentOfTopCandidate->parentNode
;
1047 // If the top candidate is the only child, use parent instead. This will help sibling
1048 // joining logic when adjacent content is actually located in parent's sibling node.
1049 $parentOfTopCandidate = $topCandidate->parentNode
;
1050 while ($parentOfTopCandidate->nodeName
!== 'body' && count($parentOfTopCandidate->getChildren(true)) === 1) {
1051 $topCandidate = $parentOfTopCandidate;
1052 $parentOfTopCandidate = $topCandidate->parentNode
;
1057 * Now that we have the top candidate, look through its siblings for content
1058 * that might also be related. Things like preambles, content split by ads
1059 * that we removed, etc.
1062 $this->logger
->info('[Rating] Creating final article content document...');
1064 $articleContent = new DOMDocument('1.0', 'utf-8');
1065 $articleContent->createElement('div');
1067 $siblingScoreThreshold = max(10, $topCandidate->contentScore
* 0.2);
1068 // Keep potential top candidate's parent node to try to get text direction of it later.
1069 $parentOfTopCandidate = $topCandidate->parentNode
;
1070 $siblings = $parentOfTopCandidate->getChildren();
1072 $hasContent = false;
1074 $this->logger
->info('[Rating] Adding top candidate siblings...');
1076 /** @var DOMElement $sibling */
1077 foreach ($siblings as $sibling) {
1080 if ($sibling === $topCandidate) {
1081 $this->logger
->debug('[Rating] Sibling is equal to the top candidate, adding to the final article...');
1087 // Give a bonus if sibling nodes and top candidates have the example same classname
1088 if ($sibling->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
1089 $contentBonus +
= $topCandidate->contentScore
* 0.2;
1091 if ($sibling->contentScore +
$contentBonus >= $siblingScoreThreshold) {
1093 } elseif ($sibling->nodeName
=== 'p') {
1094 $linkDensity = $sibling->getLinkDensity();
1095 $nodeContent = $sibling->getTextContent(true);
1097 if (mb_strlen($nodeContent) > 80 && $linkDensity < 0.25) {
1099 } elseif ($nodeContent && mb_strlen($nodeContent) < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) {
1106 $this->logger
->debug(sprintf('[Rating] Appending sibling to final article, content is: \'%s\'', substr($sibling->nodeValue
, 0, 128)));
1110 if (!in_array(strtolower($sibling->nodeName
), $this->alterToDIVExceptions
)) {
1112 * We have a node that isn't a common block level element, like a form or td tag.
1113 * Turn it into a div so it doesn't get filtered out later by accident.
1116 $sibling = NodeUtility
::setNodeTag($sibling, 'div');
1119 $import = $articleContent->importNode($sibling, true);
1120 $articleContent->appendChild($import);
1123 * No node shifting needs to be check because when calling getChildren, an array is made with the
1124 * children of the parent node, instead of using the DOMElement childNodes function, which, when used
1125 * along with appendChild, would shift the nodes position and the current foreach will behave in
1126 * unpredictable ways.
1131 $articleContent = $this->prepArticle($articleContent);
1134 // Find out text direction from ancestors of final top candidate.
1135 $ancestors = array_merge([$parentOfTopCandidate, $topCandidate], $parentOfTopCandidate->getNodeAncestors());
1136 foreach ($ancestors as $ancestor) {
1137 $articleDir = $ancestor->getAttribute('dir');
1139 $this->setDirection($articleDir);
1140 $this->logger
->debug(sprintf('[Rating] Found article direction: %s', $articleDir));
1145 return $articleContent;
/**
 * Runs the clean-up pipeline over the extracted article document.
 *
 * The steps run in a fixed order: presentational attributes and <style>
 * nodes are stripped, data tables are flagged, junk tags are removed,
 * a lone <h2> repeating the title is dropped, and finally the conditional
 * cleaners and the empty-paragraph / stray-<br> passes run.
 *
 * @param DOMDocument $article Document to clean; it is modified in place.
 *
 * @return DOMDocument The same document instance that was passed in.
 */
public function prepArticle(DOMDocument $article)
{
    $this->logger->info('[PrepArticle] Preparing final article...');

    $this->_cleanStyles($article);
    $this->_clean($article, 'style');

    // Flag data tables up front so the conditional cleaners below do not
    // discard content that lives inside them.
    $this->_markDataTables($article);

    // First round of junk removal.
    foreach (['form', 'fieldset'] as $conditionalTag) {
        $this->_cleanConditionally($article, $conditionalTag);
    }
    foreach (['object', 'embed', 'h1', 'footer', 'link'] as $junkTag) {
        $this->_clean($article, $junkTag);
    }

    // Strip descendants whose class/id mention "share". The top-level nodes
    // themselves are only used as starting points and are never removed.
    foreach ($article->childNodes as $topLevelNode) {
        $this->_cleanMatchedNodes($topLevelNode, '/share/i');
    }

    /*
     * A single h2 whose text is roughly the article title is almost certainly
     * the headline rather than a subheading. The title is exposed separately,
     * so the duplicate is removed.
     */
    $h2Nodes = $article->getElementsByTagName('h2');
    if ($h2Nodes->length === 1) {
        $headingText = $h2Nodes->item(0)->textContent;
        $title = $this->getTitle();
        $titleLength = mb_strlen($title);
        $lengthSimilarRate = (mb_strlen($headingText) - $titleLength) / max($titleLength, 1);

        if (abs($lengthSimilarRate) < 0.5) {
            // Look for the shorter string inside the longer one.
            $titlesMatch = $lengthSimilarRate > 0
                ? strpos($headingText, $title) !== false
                : strpos($title, $headingText) !== false;

            if ($titlesMatch) {
                $this->logger->info('[PrepArticle] Found title repeated in an H2 node, removing...');
                $this->_clean($article, 'h2');
            }
        }
    }

    foreach (['iframe', 'input', 'textarea', 'select', 'button'] as $junkTag) {
        $this->_clean($article, $junkTag);
    }
    $this->_cleanHeaders($article);

    // These run last because the removals above change what they measure.
    foreach (['table', 'ul', 'div'] as $conditionalTag) {
        $this->_cleanConditionally($article, $conditionalTag);
    }

    $this->_cleanExtraParagraphs($article);

    // Iterate over a snapshot: removing nodes from the live list would disturb the loop.
    foreach (iterator_to_array($article->getElementsByTagName('br')) as $lineBreak) {
        $following = $lineBreak->nextSibling;
        if ($following && $following->nodeName === 'p') {
            $this->logger->debug('[PrepArticle] Removing br node next to a p node.');
            $lineBreak->parentNode->removeChild($lineBreak);
        }
    }

    return $article;
}
/**
 * Look for 'data' (as opposed to 'layout') tables and flag each <table>
 * accordingly through setReadabilityDataTable(). The checks are similar to
 * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920.
 *
 * The rules are evaluated in order and the first one that applies wins:
 *  - role="presentation"                          => layout
 *  - datatable="0"                                => layout
 *  - non-empty summary attribute                  => data
 *  - a <caption> with child nodes                 => data
 *  - any col/colgroup/tfoot/thead/th descendant   => data
 *  - a nested <table>                             => layout
 *  - at least 10 rows or more than 4 columns      => data
 *  - otherwise data only when rows * columns > 10
 *
 * @param DOMDocument $article
 *
 * @return void
 */
public function _markDataTables(DOMDocument $article)
{
    $tables = $article->getElementsByTagName('table');
    foreach ($tables as $table) {
        /** @var DOMElement $table */
        $role = $table->getAttribute('role');
        if ($role === 'presentation') {
            $table->setReadabilityDataTable(false);
            continue;
        }

        // Strict comparison: with `==` PHP compares numeric strings by value,
        // so "0.0", "00" or " 0" would also have been read as datatable="0".
        $datatable = $table->getAttribute('datatable');
        if ($datatable === '0') {
            $table->setReadabilityDataTable(false);
            continue;
        }

        $summary = $table->getAttribute('summary');
        if ($summary) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        $caption = $table->getElementsByTagName('caption');
        if ($caption->length > 0 && $caption->item(0)->childNodes->length > 0) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        // If the table has a descendant with any of these tags, consider a data table:
        foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $dataTableDescendants) {
            if ($table->getElementsByTagName($dataTableDescendants)->length > 0) {
                $table->setReadabilityDataTable(true);
                // Move on to the next table, not just the next tag name.
                continue 2;
            }
        }

        // Nested tables indicate a layout table:
        if ($table->getElementsByTagName('table')->length > 0) {
            $table->setReadabilityDataTable(false);
            continue;
        }

        $sizeInfo = $table->getRowAndColumnCount();
        if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        // Now just go by size entirely:
        $table->setReadabilityDataTable($sizeInfo['rows'] * $sizeInfo['columns'] > 10);
    }
}
/**
 * Removes the style attribute and deprecated presentational attributes from
 * the given node and, recursively, from everything below it.
 *
 * An <svg> element stops the recursion: neither it nor its subtree is touched.
 * On table, th, td, hr and pre elements the width and height attributes are
 * removed as well.
 *
 * @param $node DOMDocument|DOMNode
 *
 * @return void
 */
public function _cleanStyles($node)
{
    $hasTagName = property_exists($node, 'tagName');

    if ($hasTagName && $node->tagName === 'svg') {
        return;
    }

    // Nodes without a removeAttribute method have nothing to strip,
    // but their children are still visited below.
    if (method_exists($node, 'removeAttribute')) {
        // `style` plus the deprecated presentational attributes.
        $presentationalAttributes = [
            'align', 'background', 'bgcolor', 'border', 'cellpadding', 'cellspacing',
            'frame', 'hspace', 'rules', 'style', 'valign', 'vspace',
        ];
        foreach ($presentationalAttributes as $attributeName) {
            $node->removeAttribute($attributeName);
        }

        // Elements on which width/height are deprecated sizing attributes.
        $sizedElements = ['table', 'th', 'td', 'hr', 'pre'];
        if ($hasTagName && in_array($node->tagName, $sizedElements)) {
            $node->removeAttribute('width');
            $node->removeAttribute('height');
        }
    }

    for ($child = $node->firstChild; $child !== null; $child = $child->nextSibling) {
        $this->_cleanStyles($child);
    }
}
/**
 * Removes nodes whose "class id" string matches the given regular expression.
 *
 * The walk starts at NodeUtility::getNextNode($node) and stops at the node
 * returned by NodeUtility::getNextNode($node, true), so $node itself is never
 * tested or removed.
 * NOTE(review): the second argument presumably means "skip the children",
 * which would limit the walk to $node's subtree. Confirm against NodeUtility.
 *
 * @param $node DOMElement Node to clean
 * @param $regex string Pattern tested against "<class> <id>" of each visited node.
 *
 * @return void
 */
public function _cleanMatchedNodes($node, $regex)
{
    $stopNode = NodeUtility::getNextNode($node, true);
    $current = NodeUtility::getNextNode($node);

    while ($current && $current !== $stopNode) {
        $class = $current->getAttribute('class');
        $id = $current->getAttribute('id');

        if (!preg_match($regex, sprintf('%s %s', $class, $id))) {
            $current = NodeUtility::getNextNode($current);
            continue;
        }

        $this->logger->debug(sprintf('Removing matched node with regex: \'%s\', node class was: \'%s\', id: \'%s\'', $regex, $class, $id));
        // Removal and advancing are a single step so the walk can continue afterwards.
        $current = NodeUtility::removeAndGetNext($current);
    }
}
/**
 * Removes paragraphs that hold no media and no text besides whitespace.
 *
 * A paragraph is kept when it contains at least one img, embed, object or
 * iframe element, or when any text remains after stripping what the
 * 'onlyWhitespace' pattern matches.
 *
 * @param DOMDocument $article
 *
 * @return void
 */
public function _cleanExtraParagraphs(DOMDocument $article)
{
    $paragraphs = $article->getElementsByTagName('p');
    $length = $paragraphs->length;

    // Walk the live list backwards so removals do not shift the items still to visit.
    for ($i = 0; $i < $length; $i++) {
        $paragraph = $paragraphs->item($length - 1 - $i);

        $imgCount = $paragraph->getElementsByTagName('img')->length;
        $embedCount = $paragraph->getElementsByTagName('embed')->length;
        $objectCount = $paragraph->getElementsByTagName('object')->length;
        // At this point, nasty iframes have been removed, only remain embedded video ones.
        $iframeCount = $paragraph->getElementsByTagName('iframe')->length;
        $totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;

        if ($totalCount !== 0) {
            continue;
        }

        // Compare against the empty string rather than testing truthiness:
        // "0" is falsy in PHP, so a paragraph whose whole text is "0" used to
        // be deleted as if it were empty. The cast keeps the previous outcome
        // for a null result (regex failure), which still counts as empty.
        $remainingText = (string) preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent);

        if ($remainingText === '') {
            $this->logger->debug(sprintf('[PrepArticle] Removing extra paragraph. Text content was: \'%s\'', substr($paragraph->textContent, 0, 128)));
            $paragraph->parentNode->removeChild($paragraph);
        }
    }
}
/**
 * Removes elements of the given tag name when they look like boilerplate
 * rather than article content.
 *
 * Does nothing when the configuration's getCleanConditionally() is off.
 * An element is removed when its class weight is negative, or when its text
 * has fewer than 10 commas and any of the heuristics below applies.
 *
 * @param DOMDocument $article
 * @param string $tag Tag name of the elements to inspect.
 *
 * @return void
 */
public function _cleanConditionally(DOMDocument $article, $tag)
{
    if (!$this->configuration->getCleanConditionally()) {
        return;
    }

    $isList = in_array($tag, ['ul', 'ol']);

    /*
     * Visit the matches from last to first so nodes can be removed while
     * walking without disturbing the entries still to be visited.
     */
    $candidates = $article->getElementsByTagName($tag);

    for ($index = $candidates->length - 1; $index >= 0; $index--) {
        /** @var $node DOMElement */
        $node = $candidates->item($index);

        // Leave the node alone when it sits inside a table and is flagged as a data table.
        if ($node->hasAncestorTag($node, 'table', -1) && $node->isReadabilityDataTable()) {
            continue;
        }

        // The class weight only counts when the configuration enables it.
        $weight = $this->configuration->getWeightClasses() ? $node->getClassWeight() : 0;

        if ($weight < 0) {
            $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\' with 0 or less weight', $tag));

            NodeUtility::removeNode($node);
            continue;
        }

        // Ten or more commas: treated as real content, keep it.
        if (substr_count($node->getTextContent(), ',') >= 10) {
            continue;
        }

        /*
         * Few commas: compare the amount of paragraph content with the
         * amount of other, more suspicious content.
         */
        $p = $node->getElementsByTagName('p')->length;
        $img = $node->getElementsByTagName('img')->length;
        // The -100 offset means list items only win once they exceed paragraphs by more than 100.
        $li = $node->getElementsByTagName('li')->length - 100;
        $input = $node->getElementsByTagName('input')->length;

        // Number of <embed> elements whose markup matches the 'videos' pattern.
        $embedCount = 0;
        foreach ($node->getElementsByTagName('embed') as $embedNode) {
            if (preg_match(NodeUtility::$regexps['videos'], $embedNode->C14N())) {
                $embedCount++;
            }
        }

        $linkDensity = $node->getLinkDensity();
        $contentLength = mb_strlen($node->getTextContent(true));

        $haveToRemove =
            // Image-heavy relative to paragraphs, and not inside a figure.
            ($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag($node, 'figure')) ||
            // Far more list items than paragraphs (lists themselves are exempt).
            (!$isList && $li > $p) ||
            // More inputs than a third of the paragraph count.
            ($input > floor($p / 3)) ||
            // Hardly any text, with either no image or more than two.
            (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag($node, 'figure')) ||
            // Low weight and link-heavy.
            (!$isList && $weight < 25 && $linkDensity > 0.2) ||
            // Decent weight but mostly links.
            ($weight >= 25 && $linkDensity > 0.5) ||
            // One counted embed with little text, or several counted embeds.
            (($embedCount === 1 && $contentLength < 75) || $embedCount > 1);

        if ($haveToRemove) {
            $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\'.', $tag));

            NodeUtility::removeNode($node);
        }
    }
}
/**
 * Removes every element with the given tag name from the document.
 *
 * For object, embed and iframe an element is kept when either its attribute
 * values or its serialised markup match the 'videos' pattern, so embedded
 * videos are kept.
 *
 * @param $article DOMDocument
 * @param $tag string tag to clean
 *
 * @return void
 */
public function _clean(DOMDocument $article, $tag)
{
    $isEmbed = in_array($tag, ['object', 'embed', 'iframe']);

    $matches = $article->getElementsByTagName($tag);

    // Go from the last match to the first so removals never shift pending items.
    for ($index = $matches->length - 1; $index >= 0; $index--) {
        $item = $matches->item($index);

        if ($isEmbed) {
            // Join all attribute values into one haystack for a single regex test.
            $values = [];
            foreach ($item->attributes as $attribute) {
                $values[] = $attribute->nodeValue;
            }
            $joinedValues = implode('|', $values);

            // Check the attributes first, then the element's own markup.
            if (preg_match(NodeUtility::$regexps['videos'], $joinedValues)
                || preg_match(NodeUtility::$regexps['videos'], $item->C14N())
            ) {
                continue;
            }
        }

        $this->logger->debug(sprintf('[PrepArticle] Removing node \'%s\'.', $item->tagName));

        NodeUtility::removeNode($item);
    }
}
/**
 * Removes spurious h1 and h2 headers from the document.
 *
 * A header is removed when its class weight is negative. The weight is only
 * computed when the configuration's getWeightClasses() is on; otherwise it
 * stays 0 and nothing is removed.
 *
 * @param DOMDocument $article
 *
 * @return void
 */
public function _cleanHeaders(DOMDocument $article)
{
    for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) {
        // getElementsByTagName returns a live list. Removing a node while
        // iterating it shifts the remaining items and can skip the header
        // after a removed one, so iterate over a snapshot instead.
        $headers = iterator_to_array($article->getElementsByTagName('h' . $headerIndex));

        /** @var $header DOMElement */
        foreach ($headers as $header) {
            $weight = 0;
            if ($this->configuration->getWeightClasses()) {
                $weight = $header->getClassWeight();
            }

            if ($weight < 0) {
                $this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128)));

                NodeUtility::removeNode($header);
            }
        }
    }
}
/**
 * Strips the class attribute from the given node and, recursively, from all
 * of its descendants.
 *
 * Readability.js filters out the classes its own algorithm adds before
 * cleaning. This port adds no classes, so no such filter is needed here.
 *
 * @param DOMDocument|DOMNode $node
 *
 * @return void
 */
public function _cleanClasses($node)
{
    $hasClass = $node->getAttribute('class') !== '';
    if ($hasClass) {
        $node->removeAttribute('class');
    }

    $child = $node->firstChild;
    while ($child !== null) {
        $this->_cleanClasses($child);
        $child = $child->nextSibling;
    }
}
/**
 * Final pass over the cleaned article.
 *
 * When the configuration's getFixRelativeURLs() is on, every link with a
 * non-empty href is either replaced by its text (javascript: URIs) or has
 * its href passed through toAbsoluteURI(). Every image gets its src set
 * from the first non-empty of src, data-src, data-original, data-orig and
 * data-url, also passed through toAbsoluteURI(). Class attributes are
 * always removed afterwards.
 *
 * @param DOMDocument $article Document to process; it is modified in place.
 *
 * @return DOMDocument The same document instance that was passed in.
 */
public function postProcessContent(DOMDocument $article)
{
    $this->logger->info('[PostProcess] PostProcessing content...');

    // Readability cannot open relative uris so we convert them to absolute uris.
    if ($this->configuration->getFixRelativeURLs()) {
        // Snapshot the list: links may be replaced inside the loop.
        foreach (iterator_to_array($article->getElementsByTagName('a')) as $link) {
            /** @var DOMElement $link */
            $href = $link->getAttribute('href');
            if ($href) {
                // Replace links with javascript: URIs with text content, since
                // they won't work after scripts have been removed from the page.
                // URI schemes are case-insensitive and may follow leading
                // whitespace, so "JavaScript:" and " javascript:" count too.
                if (stripos(ltrim($href), 'javascript:') === 0) {
                    $this->logger->debug(sprintf('[PostProcess] Removing \'javascript:\' link. Content is: \'%s\'', substr($link->textContent, 0, 128)));

                    $text = $article->createTextNode($link->textContent);
                    $link->parentNode->replaceChild($text, $link);
                } else {
                    $this->logger->debug(sprintf('[PostProcess] Converting link to absolute URI: \'%s\'', substr($href, 0, 128)));

                    $link->setAttribute('href', $this->toAbsoluteURI($href));
                }
            }
        }

        foreach ($article->getElementsByTagName('img') as $img) {
            /** @var DOMElement $img */
            /*
             * Extract all possible sources of img url and select the first one on the list.
             */
            $url = [
                $img->getAttribute('src'),
                $img->getAttribute('data-src'),
                $img->getAttribute('data-original'),
                $img->getAttribute('data-orig'),
                $img->getAttribute('data-url')
            ];

            // array_filter drops the empty candidates; reset() then yields the
            // first remaining one, or false when none is left.
            $src = array_filter($url);
            $src = reset($src);
            if ($src) {
                $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($src, 0, 128)));

                $img->setAttribute('src', $this->toAbsoluteURI($src));
            }
        }
    }

    $this->_cleanClasses($article);

    return $article;
}
/**
 * Renders the result as an HTML string: the title wrapped in an <h1>,
 * followed by the article content.
 *
 * @return string
 */
public function __toString()
{
    $title = $this->getTitle();
    $content = $this->getContent();

    return sprintf('<h1>%s</h1>%s', $title, $content);
}
/**
 * Returns the title of the article.
 *
 * @return string|null The stored title; the property defaults to null.
 */
public function getTitle()
{
    return $this->title;
}
/**
 * Stores the title of the article.
 *
 * @param string $title
 *
 * @return void
 */
protected function setTitle($title)
{
    $this->title = $title;
}
/**
 * Returns the parsed article serialised through DOMDocument::C14N().
 *
 * @return string|null The serialised content, or null when no DOMDocument has been stored.
 */
public function getContent()
{
    if (!($this->content instanceof DOMDocument)) {
        return null;
    }

    return $this->content->C14N();
}
/**
 * Returns the stored article document itself, without serialising it.
 *
 * @return DOMDocument|null The stored document; the property defaults to null.
 */
public function getDOMDocument()
{
    return $this->content;
}
/**
 * Stores the final article document.
 *
 * @param DOMDocument $content
 *
 * @return void
 */
protected function setContent(DOMDocument $content)
{
    $this->content = $content;
}
/**
 * Returns the excerpt of the article.
 *
 * @return null|string The stored excerpt; the property defaults to null.
 */
public function getExcerpt()
{
    return $this->excerpt;
}
/**
 * Stores the excerpt of the article.
 *
 * @param null|string $excerpt
 *
 * @return void
 */
public function setExcerpt($excerpt)
{
    $this->excerpt = $excerpt;
}
/**
 * Returns the main image of the article.
 *
 * @return string|null The stored image; the property defaults to null.
 */
public function getImage()
{
    return $this->image;
}
/**
 * Stores the main image of the article.
 *
 * @param string $image
 *
 * @return void
 */
protected function setImage($image)
{
    $this->image = $image;
}
/**
 * Returns the author of the article.
 *
 * @return string|null The stored author; the property defaults to null.
 */
public function getAuthor()
{
    return $this->author;
}
/**
 * Stores the author of the article.
 *
 * @param string $author
 *
 * @return void
 */
protected function setAuthor($author)
{
    $this->author = $author;
}
/**
 * Returns the text direction of the article.
 *
 * @return null|string The stored direction; the property defaults to null.
 */
public function getDirection()
{
    return $this->direction;
}
/**
 * Stores the text direction of the article.
 *
 * @param null|string $direction
 *
 * @return void
 */
public function setDirection($direction)
{
    $this->direction = $direction;
}