vendor/andreskrey/Readability/Readability.php

   1 <?php
   2
   3 namespace andreskrey\Readability;
   4
   5 use andreskrey\Readability\Nodes\DOM\DOMDocument;
   6 use andreskrey\Readability\Nodes\DOM\DOMElement;
   7 use andreskrey\Readability\Nodes\DOM\DOMNode;
   8 use andreskrey\Readability\Nodes\DOM\DOMText;
   9 use andreskrey\Readability\Nodes\NodeUtility;
  10 use Psr\Log\LoggerInterface;
  11
  12 /**
  13  * Class Readability.
  14  */
  15 class Readability
  16 {
  17     /**
  18      * Main DOMDocument where all the magic happens.
  19      *
  20      * @var DOMDocument
  21      */
  22     protected $dom;
  23
  24     /**
  25      * Title of the article.
  26      *
  27      * @var string|null
  28      */
  29     protected $title = null;
  30
  31     /**
  32      * Final DOMDocument with the fully parsed HTML.
  33      *
  34      * @var DOMDocument|null
  35      */
  36     protected $content = null;
  37
  38     /**
  39      * Excerpt of the article.
  40      *
  41      * @var string|null
  42      */
  43     protected $excerpt = null;
  44
  45     /**
  46      * Main image of the article.
  47      *
  48      * @var string|null
  49      */
  50     protected $image = null;
  51
  52     /**
  53      * Author of the article. Extracted from the byline tags and other social media properties.
  54      *
  55      * @var string|null
  56      */
  57     protected $author = null;
  58
  59     /**
  60      * Direction of the text.
  61      *
  62      * @var string|null
  63      */
  64     protected $direction = null;
  65
  66     /**
  67      * Configuration object.
  68      *
  69      * @var Configuration
  70      */
  71     private $configuration;
  72
  73     /**
  74      * Logger object.
  75      *
  76      * @var LoggerInterface
  77      */
  78     private $logger;
  79
  80     /**
  81      * Collection of attempted text extractions.
  82      *
  83      * @var array
  84      */
  85     private $attempts = [];
  86
    /**
     * Tag names that getNodes() collects as elements to score: any node whose
     * lower-cased tag name appears in this list is added to the scoring list.
     *
     * @var array
     */
    private $defaultTagsToScore = [
        'section',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'p',
        'td',
        'pre',
    ];

    /**
     * List of tag names (div, article, section and p).
     *
     * NOTE(review): this property is not referenced in the visible part of the
     * class; judging by its name it presumably lists tags that are left alone
     * when nodes are altered to DIVs - confirm against its actual usage.
     *
     * @var array
     */
    private $alterToDIVExceptions = [
        'div',
        'article',
        'section',
        'p',
    ];
 111
 112     /**
 113      * Readability constructor.
 114      *
 115      * @param Configuration $configuration
 116      */
 117     public function __construct(Configuration $configuration)
 118     {
 119         $this->configuration = $configuration;
 120         $this->logger = $this->configuration->getLogger();
 121     }
 122
 123     /**
 124      * Main parse function.
 125      *
 126      * @param $html
 127      *
 128      * @throws ParseException
 129      *
 130      * @return array|bool
 131      */
 132     public function parse($html)
 133     {
 134         $this->logger->info('*** Starting parse process...');
 135
 136         $this->dom = $this->loadHTML($html);
 137
 138         // Checking for minimum HTML to work with.
 139         if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
 140             $this->logger->emergency('No body tag present or body tag empty');
 141
 142             throw new ParseException('Invalid or incomplete HTML.');
 143         }
 144
 145         $this->getMetadata();
 146
 147         $this->getMainImage();
 148
 149         while (true) {
 150             $root = $root->firstChild;
 151
 152             $elementsToScore = $this->getNodes($root);
 153             $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));
 154
 155             $result = $this->rateNodes($elementsToScore);
 156
 157             /*
 158              * Now that we've gone through the full algorithm, check to see if
 159              * we got any meaningful content. If we didn't, we may need to re-run
 160              * grabArticle with different flags set. This gives us a higher likelihood of
 161              * finding the content, and the sieve approach gives us a higher likelihood of
 162              * finding the -right- content.
 163              */
 164
 165             $length = mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent));
 166
 167             $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getWordThreshold()));
 168
 169             $parseSuccessful = true;
 170
 171             if ($result && $length < $this->configuration->getWordThreshold()) {
 172                 $this->dom = $this->loadHTML($html);
 173                 $root = $this->dom->getElementsByTagName('body')->item(0);
 174                 $parseSuccessful = false;
 175
 176                 if ($this->configuration->getStripUnlikelyCandidates()) {
 177                     $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
 178                     $this->configuration->setStripUnlikelyCandidates(false);
 179                     $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
 180                 } elseif ($this->configuration->getWeightClasses()) {
 181                     $this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
 182                     $this->configuration->setWeightClasses(false);
 183                     $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
 184                 } elseif ($this->configuration->getCleanConditionally()) {
 185                     $this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
 186                     $this->configuration->setCleanConditionally(false);
 187                     $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
 188                 } else {
 189                     $this->logger->debug('[Parsing] Threshold not met, searching across attempts for some content.');
 190                     $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
 191
 192                     // No luck after removing flags, just return the longest text we found during the different loops
 193                     usort($this->attempts, function ($a, $b) {
 194                         return $a['textLength'] < $b['textLength'];
 195                     });
 196
 197                     // But first check if we actually have something
 198                     if (!$this->attempts[0]['textLength']) {
 199                         $this->logger->emergency('[Parsing] Could not parse text, giving up :(');
 200
 201                         throw new ParseException('Could not parse text.');
 202                     }
 203
 204                     $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');
 205
 206                     $result = $this->attempts[0]['articleContent'];
 207                     $parseSuccessful = true;
 208                     break;
 209                 }
 210             } else {
 211                 break;
 212             }
 213         }
 214
 215         if ($parseSuccessful) {
 216             $result = $this->postProcessContent($result);
 217
 218             // If we haven't found an excerpt in the article's metadata, use the article's
 219             // first paragraph as the excerpt. This can be used for displaying a preview of
 220             // the article's content.
 221             if (!$this->getExcerpt()) {
 222                 $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
 223                 $paragraphs = $result->getElementsByTagName('p');
 224                 if ($paragraphs->length > 0) {
 225                     $this->setExcerpt(trim($paragraphs->item(0)->textContent));
 226                 }
 227             }
 228
 229             $this->setContent($result);
 230
 231             $this->logger->info('*** Parse successful :)');
 232
 233             return true;
 234         }
 235     }
 236
 237     /**
 238      * Creates a DOM Document object and loads the provided HTML on it.
 239      *
 240      * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
 241      * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
 242      * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
 243      * objects and ruining the backup.
 244      *
 245      * @param string $html
 246      *
 247      * @return DOMDocument
 248      */
 249     private function loadHTML($html)
 250     {
 251         $this->logger->debug('[Loading] Loading HTML...');
 252
 253         // To avoid throwing a gazillion of errors on malformed HTMLs
 254         libxml_use_internal_errors(true);
 255
 256         $dom = new DOMDocument('1.0', 'utf-8');
 257
 258         if (!$this->configuration->getSubstituteEntities()) {
 259             // Keep the original HTML entities
 260             $dom->substituteEntities = false;
 261         }
 262
 263         if ($this->configuration->getNormalizeEntities()) {
 264             $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
 265             // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
 266             $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
 267         }
 268
 269         if ($this->configuration->getSummonCthulhu()) {
 270             $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
 271             $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html);
 272         }
 273
 274         // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
 275         $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
 276         $dom->encoding = 'UTF-8';
 277
 278         $this->removeScripts($dom);
 279
 280         $this->prepDocument($dom);
 281
 282         $this->logger->debug('[Loading] Loaded HTML successfully.');
 283
 284         return $dom;
 285     }
 286
 287     /**
 288      * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties.
 289      */
 290     private function getMetadata()
 291     {
 292         $this->logger->debug('[Metadata] Retrieving metadata...');
 293
 294         $values = [];
 295         // Match "description", or Twitter's "twitter:description" (Cards)
 296         // in name attribute.
 297         $namePattern = '/^\s*((twitter)\s*:\s*)?(description|title|image)\s*$/i';
 298
 299         // Match Facebook's Open Graph title & description properties.
 300         $propertyPattern = '/^\s*og\s*:\s*(description|title|image)\s*$/i';
 301
 302         foreach ($this->dom->getElementsByTagName('meta') as $meta) {
 303             /* @var DOMNode $meta */
 304             $elementName = $meta->getAttribute('name');
 305             $elementProperty = $meta->getAttribute('property');
 306
 307             if (in_array('author', [$elementName, $elementProperty])) {
 308                 $this->logger->info(sprintf('[Metadata] Found author: \'%s\'', $meta->getAttribute('content')));
 309                 $this->setAuthor($meta->getAttribute('content'));
 310                 continue;
 311             }
 312
 313             $name = null;
 314             if (preg_match($namePattern, $elementName)) {
 315                 $name = $elementName;
 316             } elseif (preg_match($propertyPattern, $elementProperty)) {
 317                 $name = $elementProperty;
 318             }
 319
 320             if ($name) {
 321                 $content = $meta->getAttribute('content');
 322                 if ($content) {
 323                     // Convert to lowercase and remove any whitespace
 324                     // so we can match below.
 325                     $name = preg_replace('/\s/', '', strtolower($name));
 326                     $values[$name] = trim($content);
 327                 }
 328             }
 329         }
 330         if (array_key_exists('description', $values)) {
 331             $this->logger->info(sprintf('[Metadata] Found excerpt in \'description\' tag: \'%s\'', $values['description']));
 332             $this->setExcerpt($values['description']);
 333         } elseif (array_key_exists('og:description', $values)) {
 334             // Use facebook open graph description.
 335             $this->logger->info(sprintf('[Metadata] Found excerpt in \'og:description\' tag: \'%s\'', $values['og:description']));
 336             $this->setExcerpt($values['og:description']);
 337         } elseif (array_key_exists('twitter:description', $values)) {
 338             // Use twitter cards description.
 339             $this->logger->info(sprintf('[Metadata] Found excerpt in \'twitter:description\' tag: \'%s\'', $values['twitter:description']));
 340             $this->setExcerpt($values['twitter:description']);
 341         }
 342
 343         $this->setTitle($this->getArticleTitle());
 344
 345         if (!$this->getTitle()) {
 346             if (array_key_exists('og:title', $values)) {
 347                 // Use facebook open graph title.
 348                 $this->logger->info(sprintf('[Metadata] Found title in \'og:title\' tag: \'%s\'', $values['og:title']));
 349                 $this->setTitle($values['og:title']);
 350             } elseif (array_key_exists('twitter:title', $values)) {
 351                 // Use twitter cards title.
 352                 $this->logger->info(sprintf('[Metadata] Found title in \'twitter:title\' tag: \'%s\'', $values['twitter:title']));
 353                 $this->setTitle($values['twitter:title']);
 354             }
 355         }
 356
 357         if (array_key_exists('og:image', $values) || array_key_exists('twitter:image', $values)) {
 358             if (array_key_exists('og:image', $values)) {
 359                 $this->logger->info(sprintf('[Metadata] Found main image in \'og:image\' tag: \'%s\'', $values['og:image']));
 360                 $this->setImage($values['og:image']);
 361             } else {
 362                 $this->logger->info(sprintf('[Metadata] Found main image in \'twitter:image\' tag: \'%s\'', $values['twitter:image']));
 363                 $this->setImage($values['twitter:image']);
 364             }
 365         }
 366     }
 367
 368     /**
 369      * Returns all the images of the parsed article.
 370      *
 371      * @return array
 372      */
 373     public function getImages()
 374     {
 375         $result = [];
 376         if ($this->getImage()) {
 377             $result[] = $this->getImage();
 378         }
 379
 380         if (null == $this->getDOMDocument()) {
 381             return $result;
 382         }
 383
 384         foreach ($this->getDOMDocument()->getElementsByTagName('img') as $img) {
 385             if ($src = $img->getAttribute('src')) {
 386                 $result[] = $src;
 387             }
 388         }
 389
 390         if ($this->configuration->getFixRelativeURLs()) {
 391             foreach ($result as &$imgSrc) {
 392                 $imgSrc = $this->toAbsoluteURI($imgSrc);
 393             }
 394         }
 395
 396         $result = array_unique(array_filter($result));
 397
 398         return $result;
 399     }
 400
 401     /**
 402      * Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't
 403      * find a correct image.
 404      */
 405     public function getMainImage()
 406     {
 407         $imgUrl = false;
 408
 409         if ($this->getImage() !== null) {
 410             $imgUrl = $this->getImage();
 411         }
 412
 413         if (!$imgUrl) {
 414             foreach ($this->dom->getElementsByTagName('link') as $link) {
 415                 /** @var \DOMElement $link */
 416                 /*
 417                  * Check for the rel attribute, then check if the rel attribute is either img_src or image_src, and
 418                  * finally check for the existence of the href attribute, which should hold the image url.
 419                  */
 420                 if ($link->hasAttribute('rel') && ($link->getAttribute('rel') === 'img_src' || $link->getAttribute('rel') === 'image_src') && $link->hasAttribute('href')) {
 421                     $imgUrl = $link->getAttribute('href');
 422                     break;
 423                 }
 424             }
 425         }
 426
 427         if (!empty($imgUrl) && $this->configuration->getFixRelativeURLs()) {
 428             $this->setImage($this->toAbsoluteURI($imgUrl));
 429         }
 430     }
 431
 432     /**
 433      * Returns the title of the html. Prioritizes the title from the metadata against the title tag.
 434      *
 435      * @return string|null
 436      */
 437     private function getArticleTitle()
 438     {
 439         $originalTitle = null;
 440
 441         if ($this->getTitle()) {
 442             $originalTitle = $this->getTitle();
 443         } else {
 444             $this->logger->debug('[Metadata] Could not find title in metadata, searching for the title tag...');
 445             $titleTag = $this->dom->getElementsByTagName('title');
 446             if ($titleTag->length > 0) {
 447                 $this->logger->info(sprintf('[Metadata] Using title tag as article title: \'%s\'', $titleTag->item(0)->nodeValue));
 448                 $originalTitle = $titleTag->item(0)->nodeValue;
 449             }
 450         }
 451
 452         if ($originalTitle === null) {
 453             return null;
 454         }
 455
 456         $curTitle = $originalTitle;
 457         $titleHadHierarchicalSeparators = false;
 458
 459         /*
 460          * If there's a separator in the title, first remove the final part
 461          *
 462          * Sanity warning: if you eval this match in PHPStorm's "Evaluate expression" box, it will return false
 463          * I can assure you it works properly if you let the code run.
 464          */
 465         if (preg_match('/ [\|\-\\\\\/>»] /i', $curTitle)) {
 466             $titleHadHierarchicalSeparators = (bool)preg_match('/ [\\\\\/>»] /', $curTitle);
 467             $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/i', '$1', $originalTitle);
 468
 469             $this->logger->info(sprintf('[Metadata] Found hierarchical separators in title, new title is: \'%s\'', $curTitle));
 470
 471             // If the resulting title is too short (3 words or fewer), remove
 472             // the first part instead:
 473             if (count(preg_split('/\s+/', $curTitle)) < 3) {
 474                 $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/i', '$1', $originalTitle);
 475                 $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
 476             }
 477         } elseif (strpos($curTitle, ': ') !== false) {
 478             // Check if we have an heading containing this exact string, so we
 479             // could assume it's the full title.
 480             $match = false;
 481             for ($i = 1; $i <= 2; $i++) {
 482                 foreach ($this->dom->getElementsByTagName('h' . $i) as $hTag) {
 483                     // Trim texts to avoid having false negatives when the title is surrounded by spaces or tabs
 484                     if (trim($hTag->nodeValue) === trim($curTitle)) {
 485                         $match = true;
 486                     }
 487                 }
 488             }
 489
 490             // If we don't, let's extract the title out of the original title string.
 491             if (!$match) {
 492                 $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);
 493
 494                 $this->logger->info(sprintf('[Metadata] Title has a colon in the middle, new title is: \'%s\'', $curTitle));
 495
 496                 // If the title is now too short, try the first colon instead:
 497                 if (count(preg_split('/\s+/', $curTitle)) < 3) {
 498                     $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
 499                     $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
 500                 } elseif (count(preg_split('/\s+/', substr($curTitle, 0, strpos($curTitle, ':')))) > 5) {
 501                     // But if we have too many words before the colon there's something weird
 502                     // with the titles and the H tags so let's just use the original title instead
 503                     $curTitle = $originalTitle;
 504                 }
 505             }
 506         } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
 507             $hOnes = $this->dom->getElementsByTagName('h1');
 508
 509             if ($hOnes->length === 1) {
 510                 $curTitle = $hOnes->item(0)->nodeValue;
 511                 $this->logger->info(sprintf('[Metadata] Using title from an H1 node: \'%s\'', $curTitle));
 512             }
 513         }
 514
 515         $curTitle = trim($curTitle);
 516
 517         /*
 518          * If we now have 4 words or fewer as our title, and either no
 519          * 'hierarchical' separators (\, /, > or ») were found in the original
 520          * title or we decreased the number of words by more than 1 word, use
 521          * the original title.
 522          */
 523         $curTitleWordCount = count(preg_split('/\s+/', $curTitle));
 524         $originalTitleWordCount = count(preg_split('/\s+/', preg_replace('/[\|\-\\\\\/>»]+/', '', $originalTitle))) - 1;
 525
 526         if ($curTitleWordCount <= 4 &&
 527             (!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) {
 528             $curTitle = $originalTitle;
 529
 530             $this->logger->info(sprintf('Using title from an H1 node: \'%s\'', $curTitle));
 531         }
 532
 533         return $curTitle;
 534     }
 535
 536     /**
 537      * Convert URI to an absolute URI.
 538      *
 539      * @param $uri string URI to convert
 540      *
 541      * @return string
 542      */
 543     private function toAbsoluteURI($uri)
 544     {
 545         list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL());
 546
 547         // If this is already an absolute URI, return it.
 548         if (preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/', $uri)) {
 549             return $uri;
 550         }
 551
 552         // Scheme-rooted relative URI.
 553         if (substr($uri, 0, 2) === '//') {
 554             return $scheme . '://' . substr($uri, 2);
 555         }
 556
 557         // Prepath-rooted relative URI.
 558         if (substr($uri, 0, 1) === '/') {
 559             return $prePath . $uri;
 560         }
 561
 562         // Dotslash relative URI.
 563         if (strpos($uri, './') === 0) {
 564             return $pathBase . substr($uri, 2);
 565         }
 566         // Ignore hash URIs:
 567         if (substr($uri, 0, 1) === '#') {
 568             return $uri;
 569         }
 570
 571         // Standard relative URI; add entire path. pathBase already includes a
 572         // trailing "/".
 573         return $pathBase . $uri;
 574     }
 575
 576     /**
 577      * Returns full path info of an URL.
 578      *
 579      * @param  string $url
 580      *
 581      * @return array [$pathBase, $scheme, $prePath]
 582      */
 583     public function getPathInfo($url)
 584     {
 585         // Check for base URLs
 586         if ($this->dom->baseURI !== null) {
 587             if (substr($this->dom->baseURI, 0, 1) === '/') {
 588                 // URLs starting with '/' override completely the URL defined in the link
 589                 $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . $this->dom->baseURI;
 590             } else {
 591                 // Otherwise just prepend the base to the actual path
 592                 $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . dirname(parse_url($url, PHP_URL_PATH)) . '/' . rtrim($this->dom->baseURI, '/') . '/';
 593             }
 594         } else {
 595             $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . dirname(parse_url($url, PHP_URL_PATH)) . '/';
 596         }
 597
 598         $scheme = parse_url($pathBase, PHP_URL_SCHEME);
 599         $prePath = $scheme . '://' . parse_url($pathBase, PHP_URL_HOST);
 600
 601         return [$pathBase, $scheme, $prePath];
 602     }
 603
 604     /**
 605      * Gets nodes from the root element.
 606      *
 607      * @param $node DOMNode|DOMText
 608      *
 609      * @return array
 610      */
 611     private function getNodes($node)
 612     {
 613         $this->logger->info('[Get Nodes] Retrieving nodes...');
 614
 615         $stripUnlikelyCandidates = $this->configuration->getStripUnlikelyCandidates();
 616
 617         $elementsToScore = [];
 618
 619         /*
 620          * First, node prepping. Trash nodes that look cruddy (like ones with the
 621          * class name "comment", etc), and turn divs into P tags where they have been
 622          * used inappropriately (as in, where they contain no other block level elements.)
 623          */
 624
 625         while ($node) {
 626             $matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');
 627
 628             // Remove DOMComments nodes as we don't need them and mess up children counting
 629             if ($node->nodeType === XML_COMMENT_NODE) {
 630                 $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
 631                 $node = NodeUtility::removeAndGetNext($node);
 632                 continue;
 633             }
 634
 635             // Check to see if this node is a byline, and remove it if it is.
 636             if ($this->checkByline($node, $matchString)) {
 637                 $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
 638                 $node = NodeUtility::removeAndGetNext($node);
 639                 continue;
 640             }
 641
 642             // Remove unlikely candidates
 643             if ($stripUnlikelyCandidates) {
 644                 if (
 645                     preg_match(NodeUtility::$regexps['unlikelyCandidates'], $matchString) &&
 646                     !preg_match(NodeUtility::$regexps['okMaybeItsACandidate'], $matchString) &&
 647                     $node->nodeName !== 'body' &&
 648                     $node->nodeName !== 'a'
 649                 ) {
 650                     $this->logger->debug(sprintf('[Get Nodes] Removing unlikely candidate. Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
 651                     $node = NodeUtility::removeAndGetNext($node);
 652                     continue;
 653                 }
 654             }
 655
 656             // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
 657             if (($node->nodeName === 'div' || $node->nodeName === 'section' || $node->nodeName === 'header' ||
 658                     $node->nodeName === 'h1' || $node->nodeName === 'h2' || $node->nodeName === 'h3' ||
 659                     $node->nodeName === 'h4' || $node->nodeName === 'h5' || $node->nodeName === 'h6' ||
 660                     $node->nodeName === 'p') &&
 661                 $node->isElementWithoutContent()) {
 662                 $this->logger->debug(sprintf('[Get Nodes] Removing empty \'%s\' node.', $node->nodeName));
 663                 $node = NodeUtility::removeAndGetNext($node);
 664                 continue;
 665             }
 666
 667             if (in_array(strtolower($node->nodeName), $this->defaultTagsToScore)) {
 668                 $this->logger->debug(sprintf('[Get Nodes] Adding node to score list, node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
 669                 $elementsToScore[] = $node;
 670             }
 671
 672             // Turn all divs that don't have children block level elements into p's
 673             if ($node->nodeName === 'div') {
 674                 /*
 675                  * Sites like http://mobile.slate.com encloses each paragraph with a DIV
 676                  * element. DIVs with only a P element inside and no text content can be
 677                  * safely converted into plain P elements to avoid confusing the scoring
 678                  * algorithm with DIVs with are, in practice, paragraphs.
 679                  */
 680                 if ($node->hasSinglePNode()) {
 681                     $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
 682                     $pNode = $node->getChildren(true)[0];
 683                     $node->parentNode->replaceChild($pNode, $node);
 684                     $node = $pNode;
 685                     $elementsToScore[] = $node;
 686                 } elseif (!$node->hasSingleChildBlockElement()) {
 687                     $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single child block element, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
 688                     $node = NodeUtility::setNodeTag($node, 'p');
 689                     $elementsToScore[] = $node;
 690                 } else {
 691                     // EXPERIMENTAL
 692                     foreach ($node->getChildren() as $child) {
 693                         /** @var $child DOMNode */
 694                         if ($child->nodeType === XML_TEXT_NODE && mb_strlen(trim($child->getTextContent())) > 0) {
 695                             $this->logger->debug(sprintf('[Get Nodes] Found DIV a text node inside, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
 696                             $newNode = $node->createNode($child, 'p');
 697                             $child->parentNode->replaceChild($newNode, $child);
 698                         }
 699                     }
 700                 }
 701             }
 702
 703             $node = NodeUtility::getNextNode($node);
 704         }
 705
 706         return $elementsToScore;
 707     }
 708
 709     /**
 710      * Checks if the node is a byline.
 711      *
 712      * @param DOMNode $node
 713      * @param string $matchString
 714      *
 715      * @return bool
 716      */
 717     private function checkByline($node, $matchString)
 718     {
 719         if (!$this->configuration->getArticleByLine()) {
 720             return false;
 721         }
 722
 723         /*
 724          * Check if the byline is already set
 725          */
 726         if ($this->getAuthor()) {
 727             return false;
 728         }
 729
 730         $rel = $node->getAttribute('rel');
 731
 732         if ($rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent())) {
 733             $this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $node->getTextContent()));
 734             $this->setAuthor(trim($node->getTextContent()));
 735
 736             return true;
 737         }
 738
 739         return false;
 740     }
 741
 742     /**
 743      * Checks the validity of a byLine. Based on string length.
 744      *
 745      * @param string $text
 746      *
 747      * @return bool
 748      */
 749     private function isValidByline($text)
 750     {
 751         if (gettype($text) == 'string') {
 752             $byline = trim($text);
 753
 754             return (mb_strlen($byline) > 0) && (mb_strlen($text) < 100);
 755         }
 756
 757         return false;
 758     }
 759
 760     /**
 761      * Removes all the scripts of the html.
 762      *
 763      * @param DOMDocument $dom
 764      */
 765     private function removeScripts(DOMDocument $dom)
 766     {
 767         $toRemove = ['script', 'noscript'];
 768
 769         foreach ($toRemove as $tag) {
 770             while ($script = $dom->getElementsByTagName($tag)) {
 771                 if ($script->item(0)) {
 772                     $script->item(0)->parentNode->removeChild($script->item(0));
 773                 } else {
 774                     break;
 775                 }
 776             }
 777         }
 778     }
 779
 780     /**
 781      * Prepares the document for parsing.
 782      *
 783      * @param DOMDocument $dom
 784      */
 785     private function prepDocument(DOMDocument $dom)
 786     {
 787         $this->logger->info('[PrepDocument] Preparing document for parsing...');
 788
 789         /*
 790          * DOMNodeList must be converted to an array before looping over it.
 791          * This is done to avoid node shifting when removing nodes.
 792          *
 793          * Reverse traversing cannot be done here because we need to find brs that are right next to other brs.
 794          * (If we go the other way around we need to search for previous nodes forcing the creation of new functions
 795          * that will be used only here)
 796          */
 797         foreach (iterator_to_array($dom->getElementsByTagName('br')) as $br) {
 798             $next = $br->nextSibling;
 799
 800             /*
 801              * Whether 2 or more <br> elements have been found and replaced with a
 802              * <p> block.
 803              */
 804             $replaced = false;
 805
 806             /*
 807              * If we find a <br> chain, remove the <br>s until we hit another element
 808              * or non-whitespace. This leaves behind the first <br> in the chain
 809              * (which will be replaced with a <p> later).
 810              */
 811             while (($next = NodeUtility::nextElement($next)) && ($next->nodeName === 'br')) {
 812                 $this->logger->debug('[PrepDocument] Removing chain of BR nodes...');
 813
 814                 $replaced = true;
 815                 $brSibling = $next->nextSibling;
 816                 $next->parentNode->removeChild($next);
 817                 $next = $brSibling;
 818             }
 819
 820             /*
 821              * If we removed a <br> chain, replace the remaining <br> with a <p>. Add
 822              * all sibling nodes as children of the <p> until we hit another <br>
 823              * chain.
 824              */
 825
 826             if ($replaced) {
 827                 $p = $dom->createElement('p');
 828                 $br->parentNode->replaceChild($p, $br);
 829
 830                 $next = $p->nextSibling;
 831                 while ($next) {
 832                     // If we've hit another <br><br>, we're done adding children to this <p>.
 833                     if ($next->nodeName === 'br') {
 834                         $nextElem = NodeUtility::nextElement($next);
 835                         if ($nextElem && $nextElem->nodeName === 'br') {
 836                             break;
 837                         }
 838                     }
 839
 840                     $this->logger->debug('[PrepDocument] Replacing BR with a P node...');
 841
 842                     // Otherwise, make this node a child of the new <p>.
 843                     $sibling = $next->nextSibling;
 844                     $p->appendChild($next);
 845                     $next = $sibling;
 846                 }
 847             }
 848         }
 849
 850         // Replace font tags with span
 851         $fonts = $dom->getElementsByTagName('font');
 852         $length = $fonts->length;
 853         for ($i = 0; $i < $length; $i++) {
 854             $this->logger->debug('[PrepDocument] Converting font tag into a span tag.');
 855             $font = $fonts->item($length - 1 - $i);
 856             NodeUtility::setNodeTag($font, 'span', true);
 857         }
 858     }
 859
 860     /**
 861      * Assign scores to each node. Returns full article parsed or false on error.
 862      *
 863      * @param array $nodes
 864      *
 865      * @return DOMDocument|bool
 866      */
 867     private function rateNodes($nodes)
 868     {
 869         $this->logger->info('[Rating] Rating nodes...');
 870
 871         $candidates = [];
 872
 873         /** @var DOMElement $node */
 874         foreach ($nodes as $node) {
 875             if (is_null($node->parentNode)) {
 876                 continue;
 877             }
 878
 879             // Discard nodes with less than 25 characters, without blank space
 880             if (mb_strlen($node->getTextContent(true)) < 25) {
 881                 continue;
 882             }
 883
 884             $ancestors = $node->getNodeAncestors();
 885
 886             // Exclude nodes with no ancestor
 887             if (count($ancestors) === 0) {
 888                 continue;
 889             }
 890
 891             // Start with a point for the paragraph itself as a base.
 892             $contentScore = 1;
 893
 894             // Add points for any commas within this paragraph.
 895             $contentScore += count(explode(',', $node->getTextContent(true)));
 896
 897             // For every 100 characters in this paragraph, add another point. Up to 3 points.
 898             $contentScore += min(floor(mb_strlen($node->getTextContent(true)) / 100), 3);
 899
 900             $this->logger->debug(sprintf('[Rating] Node score %s, content: \'%s\'', $contentScore, substr($node->nodeValue, 0, 128)));
 901
 902             /** @var $ancestor DOMElement */
 903             foreach ($ancestors as $level => $ancestor) {
 904                 $this->logger->debug('[Rating] Found ancestor, initializing and adding it as a candidate...');
 905                 if (!$ancestor->isInitialized()) {
 906                     $ancestor->initializeNode($this->configuration->getWeightClasses());
 907                     $candidates[] = $ancestor;
 908                 }
 909
 910                 /*
 911                  * Node score divider:
 912                  *  - parent:             1 (no division)
 913                  *  - grandparent:        2
 914                  *  - great grandparent+: ancestor level * 3
 915                  */
 916
 917                 if ($level === 0) {
 918                     $scoreDivider = 1;
 919                 } elseif ($level === 1) {
 920                     $scoreDivider = 2;
 921                 } else {
 922                     $scoreDivider = $level * 3;
 923                 }
 924
 925                 $currentScore = $ancestor->contentScore;
 926                 $ancestor->contentScore = $currentScore + ($contentScore / $scoreDivider);
 927
 928                 $this->logger->debug(sprintf('[Rating] Ancestor score %s, value: \'%s\'', $ancestor->contentScore, substr($ancestor->nodeValue, 0, 128)));
 929             }
 930         }
 931
 932         /*
 933          * After we've calculated scores, loop through all of the possible
 934          * candidate nodes we found and find the one with the highest score.
 935          */
 936
 937         $topCandidates = [];
 938         foreach ($candidates as $candidate) {
 939
 940             /*
 941              * Scale the final candidates score based on link density. Good content
 942              * should have a relatively small link density (5% or less) and be mostly
 943              * unaffected by this operation.
 944              */
 945
 946             $candidate->contentScore = $candidate->contentScore * (1 - $candidate->getLinkDensity());
 947
 948             for ($i = 0; $i < $this->configuration->getMaxTopCandidates(); $i++) {
 949                 $aTopCandidate = isset($topCandidates[$i]) ? $topCandidates[$i] : null;
 950
 951                 if (!$aTopCandidate || $candidate->contentScore > $aTopCandidate->contentScore) {
 952                     array_splice($topCandidates, $i, 0, [$candidate]);
 953                     if (count($topCandidates) > $this->configuration->getMaxTopCandidates()) {
 954                         array_pop($topCandidates);
 955                     }
 956                     break;
 957                 }
 958             }
 959         }
 960
 961         $topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null;
 962         $parentOfTopCandidate = null;
 963
 964         /*
 965          * If we still have no top candidate, just use the body as a last resort.
 966          * We also have to copy the body node so it is something we can modify.
 967          */
 968
 969         if ($topCandidate === null || $topCandidate->nodeName === 'body') {
 970             $this->logger->info('[Rating] No top candidate found or top candidate is the body tag. Moving all child nodes to a new DIV node.');
 971
 972             // Move all of the page's children into topCandidate
 973             $topCandidate = new DOMDocument('1.0', 'utf-8');
 974             $topCandidate->encoding = 'UTF-8';
 975             $topCandidate->appendChild($topCandidate->createElement('div', ''));
 976             $kids = $this->dom->getElementsByTagName('body')->item(0)->childNodes;
 977
 978             // Cannot be foreached, don't ask me why.
 979             for ($i = 0; $i < $kids->length; $i++) {
 980                 $import = $topCandidate->importNode($kids->item($i), true);
 981                 $topCandidate->firstChild->appendChild($import);
 982             }
 983
 984             // Candidate must be created using firstChild to grab the DOMElement instead of the DOMDocument.
 985             $topCandidate = $topCandidate->firstChild;
 986         } elseif ($topCandidate) {
 987             $this->logger->info(sprintf('[Rating] Found top candidate, score: %s', $topCandidate->contentScore));
 988             // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
 989             // and whose scores are quite closed with current `topCandidate` node.
 990             $alternativeCandidateAncestors = [];
 991             for ($i = 1; $i < count($topCandidates); $i++) {
 992                 if ($topCandidates[$i]->contentScore / $topCandidate->contentScore >= 0.75) {
 993                     array_push($alternativeCandidateAncestors, $topCandidates[$i]->getNodeAncestors(false));
 994                 }
 995             }
 996
 997             $MINIMUM_TOPCANDIDATES = 3;
 998             if (count($alternativeCandidateAncestors) >= $MINIMUM_TOPCANDIDATES) {
 999                 $parentOfTopCandidate = $topCandidate->parentNode;
1000                 while ($parentOfTopCandidate->nodeName !== 'body') {
1001                     $listsContainingThisAncestor = 0;
1002                     for ($ancestorIndex = 0; $ancestorIndex < count($alternativeCandidateAncestors) && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++) {
1003                         $listsContainingThisAncestor += (int)in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex]);
1004                     }
1005                     if ($listsContainingThisAncestor >= $MINIMUM_TOPCANDIDATES) {
1006                         $topCandidate = $parentOfTopCandidate;
1007                         break;
1008                     }
1009                     $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
1010                 }
1011             }
1012
1013             /*
1014              * Because of our bonus system, parents of candidates might have scores
1015              * themselves. They get half of the node. There won't be nodes with higher
1016              * scores than our topCandidate, but if we see the score going *up* in the first
1017              * few steps up the tree, that's a decent sign that there might be more content
1018              * lurking in other places that we want to unify in. The sibling stuff
1019              * below does some of that - but only if we've looked high enough up the DOM
1020              * tree.
1021              */
1022
1023             $parentOfTopCandidate = $topCandidate->parentNode;
1024             $lastScore = $topCandidate->contentScore;
1025
1026             // The scores shouldn't get too low.
1027             $scoreThreshold = $lastScore / 3;
1028
1029             /* @var DOMElement $parentOfTopCandidate */
1030             // Check if we are actually dealing with a DOMNode and not a DOMDocument node or higher
1031             while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
1032                 $parentScore = $parentOfTopCandidate->contentScore;
1033                 if ($parentScore < $scoreThreshold) {
1034                     break;
1035                 }
1036
1037                 if ($parentScore > $lastScore) {
1038                     // Alright! We found a better parent to use.
1039                     $topCandidate = $parentOfTopCandidate;
1040                     $this->logger->info('[Rating] Found a better top candidate.');
1041                     break;
1042                 }
1043                 $lastScore = $parentOfTopCandidate->contentScore;
1044                 $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
1045             }
1046
1047             // If the top candidate is the only child, use parent instead. This will help sibling
1048             // joining logic when adjacent content is actually located in parent's sibling node.
1049             $parentOfTopCandidate = $topCandidate->parentNode;
1050             while ($parentOfTopCandidate->nodeName !== 'body' && count($parentOfTopCandidate->getChildren(true)) === 1) {
1051                 $topCandidate = $parentOfTopCandidate;
1052                 $parentOfTopCandidate = $topCandidate->parentNode;
1053             }
1054         }
1055
1056         /*
1057          * Now that we have the top candidate, look through its siblings for content
1058          * that might also be related. Things like preambles, content split by ads
1059          * that we removed, etc.
1060          */
1061
1062         $this->logger->info('[Rating] Creating final article content document...');
1063
1064         $articleContent = new DOMDocument('1.0', 'utf-8');
1065         $articleContent->createElement('div');
1066
1067         $siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2);
1068         // Keep potential top candidate's parent node to try to get text direction of it later.
1069         $parentOfTopCandidate = $topCandidate->parentNode;
1070         $siblings = $parentOfTopCandidate->getChildren();
1071
1072         $hasContent = false;
1073
1074         $this->logger->info('[Rating] Adding top candidate siblings...');
1075
1076         /** @var DOMElement $sibling */
1077         foreach ($siblings as $sibling) {
1078             $append = false;
1079
1080             if ($sibling === $topCandidate) {
1081                 $this->logger->debug('[Rating] Sibling is equal to the top candidate, adding to the final article...');
1082
1083                 $append = true;
1084             } else {
1085                 $contentBonus = 0;
1086
1087                 // Give a bonus if sibling nodes and top candidates have the example same classname
1088                 if ($sibling->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
1089                     $contentBonus += $topCandidate->contentScore * 0.2;
1090                 }
1091                 if ($sibling->contentScore + $contentBonus >= $siblingScoreThreshold) {
1092                     $append = true;
1093                 } elseif ($sibling->nodeName === 'p') {
1094                     $linkDensity = $sibling->getLinkDensity();
1095                     $nodeContent = $sibling->getTextContent(true);
1096
1097                     if (mb_strlen($nodeContent) > 80 && $linkDensity < 0.25) {
1098                         $append = true;
1099                     } elseif ($nodeContent && mb_strlen($nodeContent) < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) {
1100                         $append = true;
1101                     }
1102                 }
1103             }
1104
1105             if ($append) {
1106                 $this->logger->debug(sprintf('[Rating] Appending sibling to final article, content is: \'%s\'', substr($sibling->nodeValue, 0, 128)));
1107
1108                 $hasContent = true;
1109
1110                 if (!in_array(strtolower($sibling->nodeName), $this->alterToDIVExceptions)) {
1111                     /*
1112                      * We have a node that isn't a common block level element, like a form or td tag.
1113                      * Turn it into a div so it doesn't get filtered out later by accident.
1114                      */
1115
1116                     $sibling = NodeUtility::setNodeTag($sibling, 'div');
1117                 }
1118
1119                 $import = $articleContent->importNode($sibling, true);
1120                 $articleContent->appendChild($import);
1121
1122                 /*
1123                  * No node shifting needs to be check because when calling getChildren, an array is made with the
1124                  * children of the parent node, instead of using the DOMElement childNodes function, which, when used
1125                  * along with appendChild, would shift the nodes position and the current foreach will behave in
1126                  * unpredictable ways.
1127                  */
1128             }
1129         }
1130
1131         $articleContent = $this->prepArticle($articleContent);
1132
1133         if ($hasContent) {
1134             // Find out text direction from ancestors of final top candidate.
1135             $ancestors = array_merge([$parentOfTopCandidate, $topCandidate], $parentOfTopCandidate->getNodeAncestors());
1136             foreach ($ancestors as $ancestor) {
1137                 $articleDir = $ancestor->getAttribute('dir');
1138                 if ($articleDir) {
1139                     $this->setDirection($articleDir);
1140                     $this->logger->debug(sprintf('[Rating] Found article direction: %s', $articleDir));
1141                     break;
1142                 }
1143             }
1144
1145             return $articleContent;
1146         } else {
1147             return false;
1148         }
1149     }
1150
1151     /**
1152      * Cleans up the final article.
1153      *
1154      * @param DOMDocument $article
1155      *
1156      * @return DOMDocument
1157      */
1158     public function prepArticle(DOMDocument $article)
1159     {
1160         $this->logger->info('[PrepArticle] Preparing final article...');
1161
1162         $this->_cleanStyles($article);
1163         $this->_clean($article, 'style');
1164
1165         // Check for data tables before we continue, to avoid removing items in
1166         // those tables, which will often be isolated even though they're
1167         // visually linked to other content-ful elements (text, images, etc.).
1168         $this->_markDataTables($article);
1169
1170         // Clean out junk from the article content
1171         $this->_cleanConditionally($article, 'form');
1172         $this->_cleanConditionally($article, 'fieldset');
1173         $this->_clean($article, 'object');
1174         $this->_clean($article, 'embed');
1175         $this->_clean($article, 'h1');
1176         $this->_clean($article, 'footer');
1177         $this->_clean($article, 'link');
1178
1179         // Clean out elements have "share" in their id/class combinations from final top candidates,
1180         // which means we don't remove the top candidates even they have "share".
1181         foreach ($article->childNodes as $child) {
1182             $this->_cleanMatchedNodes($child, '/share/i');
1183         }
1184
1185         /*
1186          * If there is only one h2 and its text content substantially equals article title,
1187          * they are probably using it as a header and not a subheader,
1188          * so remove it since we already extract the title separately.
1189          */
1190         $h2 = $article->getElementsByTagName('h2');
1191         if ($h2->length === 1) {
1192             $lengthSimilarRate = (mb_strlen($h2->item(0)->textContent) - mb_strlen($this->getTitle())) / max(mb_strlen($this->getTitle()), 1);
1193
1194             if (abs($lengthSimilarRate) < 0.5) {
1195                 if ($lengthSimilarRate > 0) {
1196                     $titlesMatch = strpos($h2->item(0)->textContent, $this->getTitle()) !== false;
1197                 } else {
1198                     $titlesMatch = strpos($this->getTitle(), $h2->item(0)->textContent) !== false;
1199                 }
1200                 if ($titlesMatch) {
1201                     $this->logger->info('[PrepArticle] Found title repeated in an H2 node, removing...');
1202                     $this->_clean($article, 'h2');
1203                 }
1204             }
1205         }
1206
1207         $this->_clean($article, 'iframe');
1208         $this->_clean($article, 'input');
1209         $this->_clean($article, 'textarea');
1210         $this->_clean($article, 'select');
1211         $this->_clean($article, 'button');
1212         $this->_cleanHeaders($article);
1213
1214         // Do these last as the previous stuff may have removed junk
1215         // that will affect these
1216         $this->_cleanConditionally($article, 'table');
1217         $this->_cleanConditionally($article, 'ul');
1218         $this->_cleanConditionally($article, 'div');
1219
1220         $this->_cleanExtraParagraphs($article);
1221
1222         foreach (iterator_to_array($article->getElementsByTagName('br')) as $br) {
1223             $next = $br->nextSibling;
1224             if ($next && $next->nodeName === 'p') {
1225                 $this->logger->debug('[PrepArticle] Removing br node next to a p node.');
1226                 $br->parentNode->removeChild($br);
1227             }
1228         }
1229
1230         return $article;
1231     }
1232
1233     /**
1234      * Look for 'data' (as opposed to 'layout') tables, for which we use
1235      * similar checks as
1236      * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920.
1237      *
1238      * @param DOMDocument $article
1239      *
1240      * @return void
1241      */
1242     public function _markDataTables(DOMDocument $article)
1243     {
1244         $tables = $article->getElementsByTagName('table');
1245         foreach ($tables as $table) {
1246             /** @var DOMElement $table */
1247             $role = $table->getAttribute('role');
1248             if ($role === 'presentation') {
1249                 $table->setReadabilityDataTable(false);
1250                 continue;
1251             }
1252             $datatable = $table->getAttribute('datatable');
1253             if ($datatable == '0') {
1254                 $table->setReadabilityDataTable(false);
1255                 continue;
1256             }
1257             $summary = $table->getAttribute('summary');
1258             if ($summary) {
1259                 $table->setReadabilityDataTable(true);
1260                 continue;
1261             }
1262
1263             $caption = $table->getElementsByTagName('caption');
1264             if ($caption->length > 0 && $caption->item(0)->childNodes->length > 0) {
1265                 $table->setReadabilityDataTable(true);
1266                 continue;
1267             }
1268
1269             // If the table has a descendant with any of these tags, consider a data table:
1270             foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $dataTableDescendants) {
1271                 if ($table->getElementsByTagName($dataTableDescendants)->length > 0) {
1272                     $table->setReadabilityDataTable(true);
1273                     continue 2;
1274                 }
1275             }
1276
1277             // Nested tables indicate a layout table:
1278             if ($table->getElementsByTagName('table')->length > 0) {
1279                 $table->setReadabilityDataTable(false);
1280                 continue;
1281             }
1282
1283             $sizeInfo = $table->getRowAndColumnCount();
1284             if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) {
1285                 $table->setReadabilityDataTable(true);
1286                 continue;
1287             }
1288             // Now just go by size entirely:
1289             $table->setReadabilityDataTable($sizeInfo['rows'] * $sizeInfo['columns'] > 10);
1290         }
1291     }
1292
1293     /**
1294      * Remove the style attribute on every e and under.
1295      *
1296      * @param $node DOMDocument|DOMNode
1297      **/
1298     public function _cleanStyles($node)
1299     {
1300         if (property_exists($node, 'tagName') && $node->tagName === 'svg') {
1301             return;
1302         }
1303
1304         // Do not bother if there's no method to remove an attribute
1305         if (method_exists($node, 'removeAttribute')) {
1306             $presentational_attributes = ['align', 'background', 'bgcolor', 'border', 'cellpadding', 'cellspacing', 'frame', 'hspace', 'rules', 'style', 'valign', 'vspace'];
1307             // Remove `style` and deprecated presentational attributes
1308             foreach ($presentational_attributes as $presentational_attribute) {
1309                 $node->removeAttribute($presentational_attribute);
1310             }
1311
1312             $deprecated_size_attribute_elems = ['table', 'th', 'td', 'hr', 'pre'];
1313             if (property_exists($node, 'tagName') && in_array($node->tagName, $deprecated_size_attribute_elems)) {
1314                 $node->removeAttribute('width');
1315                 $node->removeAttribute('height');
1316             }
1317         }
1318
1319         $cur = $node->firstChild;
1320         while ($cur !== null) {
1321             $this->_cleanStyles($cur);
1322             $cur = $cur->nextSibling;
1323         }
1324     }
1325
1326     /**
1327      * Clean out elements whose id/class combinations match specific string.
1328      *
1329      * @param $node DOMElement Node to clean
1330      * @param $regex string Match id/class combination.
1331      *
1332      * @return void
1333      **/
1334     public function _cleanMatchedNodes($node, $regex)
1335     {
1336         $endOfSearchMarkerNode = NodeUtility::getNextNode($node, true);
1337         $next = NodeUtility::getNextNode($node);
1338         while ($next && $next !== $endOfSearchMarkerNode) {
1339             if (preg_match($regex, sprintf('%s %s', $next->getAttribute('class'), $next->getAttribute('id')))) {
1340                 $this->logger->debug(sprintf('Removing matched node with regex: \'%s\', node class was: \'%s\', id: \'%s\'', $regex, $next->getAttribute('class'), $next->getAttribute('id')));
1341                 $next = NodeUtility::removeAndGetNext($next);
1342             } else {
1343                 $next = NodeUtility::getNextNode($next);
1344             }
1345         }
1346     }
1347
1348     /**
1349      * @param DOMDocument $article
1350      *
1351      * @return void
1352      */
1353     public function _cleanExtraParagraphs(DOMDocument $article)
1354     {
1355         $paragraphs = $article->getElementsByTagName('p');
1356         $length = $paragraphs->length;
1357
1358         for ($i = 0; $i < $length; $i++) {
1359             $paragraph = $paragraphs->item($length - 1 - $i);
1360
1361             $imgCount = $paragraph->getElementsByTagName('img')->length;
1362             $embedCount = $paragraph->getElementsByTagName('embed')->length;
1363             $objectCount = $paragraph->getElementsByTagName('object')->length;
1364             // At this point, nasty iframes have been removed, only remain embedded video ones.
1365             $iframeCount = $paragraph->getElementsByTagName('iframe')->length;
1366             $totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;
1367
1368             if ($totalCount === 0 && !preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent)) {
1369                 $this->logger->debug(sprintf('[PrepArticle] Removing extra paragraph. Text content was: \'%s\'', substr($paragraph->textContent, 0, 128)));
1370                 $paragraph->parentNode->removeChild($paragraph);
1371             }
1372         }
1373     }
1374
1375     /**
1376      * @param DOMDocument $article
1377      *
1378      * @return void
1379      */
1380     public function _cleanConditionally(DOMDocument $article, $tag)
1381     {
1382         if (!$this->configuration->getCleanConditionally()) {
1383             return;
1384         }
1385
1386         $isList = in_array($tag, ['ul', 'ol']);
1387
1388         /*
1389          * Gather counts for other typical elements embedded within.
1390          * Traverse backwards so we can remove nodes at the same time
1391          * without effecting the traversal.
1392          */
1393
1394         $DOMNodeList = $article->getElementsByTagName($tag);
1395         $length = $DOMNodeList->length;
1396         for ($i = 0; $i < $length; $i++) {
1397             /** @var $node DOMElement */
1398             $node = $DOMNodeList->item($length - 1 - $i);
1399
1400             // First check if we're in a data table, in which case don't remove us.
1401             if ($node->hasAncestorTag($node, 'table', -1) && $node->isReadabilityDataTable()) {
1402                 continue;
1403             }
1404
1405             $weight = 0;
1406             if ($this->configuration->getWeightClasses()) {
1407                 $weight = $node->getClassWeight();
1408             }
1409
1410             if ($weight < 0) {
1411                 $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\' with 0 or less weight', $tag));
1412
1413                 NodeUtility::removeNode($node);
1414                 continue;
1415             }
1416
1417             if (substr_count($node->getTextContent(), ',') < 10) {
1418                 /*
1419                  * If there are not very many commas, and the number of
1420                  * non-paragraph elements is more than paragraphs or other
1421                  * ominous signs, remove the element.
1422                  */
1423
1424                 $p = $node->getElementsByTagName('p')->length;
1425                 $img = $node->getElementsByTagName('img')->length;
1426                 $li = $node->getElementsByTagName('li')->length - 100;
1427                 $input = $node->getElementsByTagName('input')->length;
1428
1429                 $embedCount = 0;
1430                 $embeds = $node->getElementsByTagName('embed');
1431
1432                 foreach ($embeds as $embedNode) {
1433                     if (preg_match(NodeUtility::$regexps['videos'], $embedNode->C14N())) {
1434                         $embedCount++;
1435                     }
1436                 }
1437
1438                 $linkDensity = $node->getLinkDensity();
1439                 $contentLength = mb_strlen($node->getTextContent(true));
1440
1441                 $haveToRemove =
1442                     ($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag($node, 'figure')) ||
1443                     (!$isList && $li > $p) ||
1444                     ($input > floor($p / 3)) ||
1445                     (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag($node, 'figure')) ||
1446                     (!$isList && $weight < 25 && $linkDensity > 0.2) ||
1447                     ($weight >= 25 && $linkDensity > 0.5) ||
1448                     (($embedCount === 1 && $contentLength < 75) || $embedCount > 1);
1449
1450                 if ($haveToRemove) {
1451                     $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\'.', $tag));
1452
1453                     NodeUtility::removeNode($node);
1454                 }
1455             }
1456         }
1457     }
1458
1459     /**
1460      * Clean a node of all elements of type "tag".
1461      * (Unless it's a youtube/vimeo video. People love movies.).
1462      *
1463      * @param $article DOMDocument
1464      * @param $tag string tag to clean
1465      *
1466      * @return void
1467      **/
1468     public function _clean(DOMDocument $article, $tag)
1469     {
1470         $isEmbed = in_array($tag, ['object', 'embed', 'iframe']);
1471
1472         $DOMNodeList = $article->getElementsByTagName($tag);
1473         $length = $DOMNodeList->length;
1474         for ($i = 0; $i < $length; $i++) {
1475             $item = $DOMNodeList->item($length - 1 - $i);
1476
1477             // Allow youtube and vimeo videos through as people usually want to see those.
1478             if ($isEmbed) {
1479                 $attributeValues = [];
1480                 foreach ($item->attributes as $name => $value) {
1481                     $attributeValues[] = $value->nodeValue;
1482                 }
1483                 $attributeValues = implode('|', $attributeValues);
1484
1485                 // First, check the elements attributes to see if any of them contain youtube or vimeo
1486                 if (preg_match(NodeUtility::$regexps['videos'], $attributeValues)) {
1487                     continue;
1488                 }
1489
1490                 // Then check the elements inside this element for the same.
1491                 if (preg_match(NodeUtility::$regexps['videos'], $item->C14N())) {
1492                     continue;
1493                 }
1494             }
1495             $this->logger->debug(sprintf('[PrepArticle] Removing node \'%s\'.', $item->tagName));
1496
1497             NodeUtility::removeNode($item);
1498         }
1499     }
1500
1501     /**
1502      * Clean out spurious headers from an Element. Checks things like classnames and link density.
1503      *
1504      * @param DOMDocument $article
1505      *
1506      * @return void
1507      **/
1508     public function _cleanHeaders(DOMDocument $article)
1509     {
1510         for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) {
1511             $headers = $article->getElementsByTagName('h' . $headerIndex);
1512             /** @var $header DOMElement */
1513             foreach ($headers as $header) {
1514                 $weight = 0;
1515                 if ($this->configuration->getWeightClasses()) {
1516                     $weight = $header->getClassWeight();
1517                 }
1518
1519                 if ($weight < 0) {
1520                     $this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128)));
1521
1522                     NodeUtility::removeNode($header);
1523                 }
1524             }
1525         }
1526     }
1527
1528     /**
1529      * Removes the class="" attribute from every element in the given
1530      * subtree.
1531      *
1532      * Readability.js has a special filter to avoid cleaning the classes that the algorithm adds. We don't add classes
1533      * here so no need to filter those.
1534      *
1535      * @param DOMDocument|DOMNode $node
1536      *
1537      * @return void
1538      **/
1539     public function _cleanClasses($node)
1540     {
1541         if ($node->getAttribute('class') !== '') {
1542             $node->removeAttribute('class');
1543         }
1544
1545         for ($node = $node->firstChild; $node !== null; $node = $node->nextSibling) {
1546             $this->_cleanClasses($node);
1547         }
1548     }
1549
1550     /**
1551      * @param DOMDocument $article
1552      *
1553      * @return DOMDocument
1554      */
1555     public function postProcessContent(DOMDocument $article)
1556     {
1557         $this->logger->info('[PostProcess] PostProcessing content...');
1558
1559         // Readability cannot open relative uris so we convert them to absolute uris.
1560         if ($this->configuration->getFixRelativeURLs()) {
1561             foreach (iterator_to_array($article->getElementsByTagName('a')) as $link) {
1562                 /** @var DOMElement $link */
1563                 $href = $link->getAttribute('href');
1564                 if ($href) {
1565                     // Replace links with javascript: URIs with text content, since
1566                     // they won't work after scripts have been removed from the page.
1567                     if (strpos($href, 'javascript:') === 0) {
1568                         $this->logger->debug(sprintf('[PostProcess] Removing \'javascript:\' link. Content is: \'%s\'', substr($link->textContent, 0, 128)));
1569
1570                         $text = $article->createTextNode($link->textContent);
1571                         $link->parentNode->replaceChild($text, $link);
1572                     } else {
1573                         $this->logger->debug(sprintf('[PostProcess] Converting link to absolute URI: \'%s\'', substr($href, 0, 128)));
1574
1575                         $link->setAttribute('href', $this->toAbsoluteURI($href));
1576                     }
1577                 }
1578             }
1579
1580             foreach ($article->getElementsByTagName('img') as $img) {
1581                 /** @var DOMElement $img */
1582                 /*
1583                  * Extract all possible sources of img url and select the first one on the list.
1584                  */
1585                 $url = [
1586                     $img->getAttribute('src'),
1587                     $img->getAttribute('data-src'),
1588                     $img->getAttribute('data-original'),
1589                     $img->getAttribute('data-orig'),
1590                     $img->getAttribute('data-url')
1591                 ];
1592
1593                 $src = array_filter($url);
1594                 $src = reset($src);
1595                 if ($src) {
1596                     $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($src, 0, 128)));
1597
1598                     $img->setAttribute('src', $this->toAbsoluteURI($src));
1599                 }
1600             }
1601         }
1602
1603         $this->_cleanClasses($article);
1604
1605         return $article;
1606     }
1607
1608     /**
1609      * @return null|string
1610      */
1611     public function __toString()
1612     {
1613         return sprintf('<h1>%s</h1>%s', $this->getTitle(), $this->getContent());
1614     }
1615
1616     /**
1617      * @return string|null
1618      */
1619     public function getTitle()
1620     {
1621         return $this->title;
1622     }
1623
1624     /**
1625      * @param string $title
1626      */
1627     protected function setTitle($title)
1628     {
1629         $this->title = $title;
1630     }
1631
1632     /**
1633      * @return string|null
1634      */
1635     public function getContent()
1636     {
1637         return ($this->content instanceof DOMDocument) ? $this->content->C14N() : null;
1638     }
1639
1640     /**
1641      * @return DOMDocument|null
1642      */
1643     public function getDOMDocument()
1644     {
1645         return $this->content;
1646     }
1647
1648     /**
1649      * @param DOMDocument $content
1650      */
1651     protected function setContent(DOMDocument $content)
1652     {
1653         $this->content = $content;
1654     }
1655
1656     /**
1657      * @return null|string
1658      */
1659     public function getExcerpt()
1660     {
1661         return $this->excerpt;
1662     }
1663
1664     /**
1665      * @param null|string $excerpt
1666      */
1667     public function setExcerpt($excerpt)
1668     {
1669         $this->excerpt = $excerpt;
1670     }
1671
1672     /**
1673      * @return string|null
1674      */
1675     public function getImage()
1676     {
1677         return $this->image;
1678     }
1679
1680     /**
1681      * @param string $image
1682      */
1683     protected function setImage($image)
1684     {
1685         $this->image = $image;
1686     }
1687
1688     /**
1689      * @return string|null
1690      */
1691     public function getAuthor()
1692     {
1693         return $this->author;
1694     }
1695
1696     /**
1697      * @param string $author
1698      */
1699     protected function setAuthor($author)
1700     {
1701         $this->author = $author;
1702     }
1703
1704     /**
1705      * @return null|string
1706      */
1707     public function getDirection()
1708     {
1709         return $this->direction;
1710     }
1711
1712     /**
1713      * @param null|string $direction
1714      */
1715     public function setDirection($direction)
1716     {
1717         $this->direction = $direction;
1718     }
1719 }