]> git.wh0rd.org - tt-rss.git/blob - vendor/andreskrey/Readability/Readability.php
update autoloader to consider namespaces for third party libraries: placed and loaded...
[tt-rss.git] / vendor / andreskrey / Readability / Readability.php
1 <?php
2
3 namespace andreskrey\Readability;
4
5 use andreskrey\Readability\Nodes\DOM\DOMDocument;
6 use andreskrey\Readability\Nodes\DOM\DOMElement;
7 use andreskrey\Readability\Nodes\DOM\DOMNode;
8 use andreskrey\Readability\Nodes\DOM\DOMText;
9 use andreskrey\Readability\Nodes\NodeUtility;
10 use Psr\Log\LoggerInterface;
11
12 /**
13 * Class Readability.
14 */
15 class Readability
16 {
17 /**
18 * Main DOMDocument where all the magic happens.
19 *
20 * @var DOMDocument
21 */
22 protected $dom;
23
24 /**
25 * Title of the article.
26 *
27 * @var string|null
28 */
29 protected $title = null;
30
31 /**
32 * Final DOMDocument with the fully parsed HTML.
33 *
34 * @var DOMDocument|null
35 */
36 protected $content = null;
37
38 /**
39 * Excerpt of the article.
40 *
41 * @var string|null
42 */
43 protected $excerpt = null;
44
45 /**
46 * Main image of the article.
47 *
48 * @var string|null
49 */
50 protected $image = null;
51
52 /**
53 * Author of the article. Extracted from the byline tags and other social media properties.
54 *
55 * @var string|null
56 */
57 protected $author = null;
58
59 /**
60 * Direction of the text.
61 *
62 * @var string|null
63 */
64 protected $direction = null;
65
66 /**
67 * Configuration object.
68 *
69 * @var Configuration
70 */
71 private $configuration;
72
73 /**
74 * Logger object.
75 *
76 * @var LoggerInterface
77 */
78 private $logger;
79
80 /**
81 * Collection of attempted text extractions.
82 *
83 * @var array
84 */
85 private $attempts = [];
86
87 /**
88 * @var array
89 */
90 private $defaultTagsToScore = [
91 'section',
92 'h2',
93 'h3',
94 'h4',
95 'h5',
96 'h6',
97 'p',
98 'td',
99 'pre',
100 ];
101
102 /**
103 * @var array
104 */
105 private $alterToDIVExceptions = [
106 'div',
107 'article',
108 'section',
109 'p',
110 ];
111
/**
 * Readability constructor.
 *
 * Keeps a reference to the configuration and to the logger that the
 * configuration exposes through getLogger().
 *
 * @param Configuration $configuration
 */
public function __construct(Configuration $configuration)
{
    $logger = $configuration->getLogger();

    $this->configuration = $configuration;
    $this->logger = $logger;
}
122
/**
 * Main parse function.
 *
 * Loads the HTML, collects metadata and the main image and then runs the
 * extraction loop. Whenever the extracted text is shorter than the configured
 * threshold, the document is reloaded and the loop is retried with one more
 * flag switched off (StripUnlikelyCandidates, then WeightClasses, then
 * CleanConditionally). Once every flag is off, the longest attempt is used.
 *
 * Note that the flags are switched off on the Configuration object itself.
 *
 * @param string $html
 *
 * @throws ParseException When the HTML has no usable body or no text could be extracted
 *
 * @return bool Always true, failures are reported through ParseException
 */
public function parse($html)
{
    $this->logger->info('*** Starting parse process...');

    // Attempts recorded by a previous parse() call must not leak into this one.
    $this->attempts = [];

    $this->dom = $this->loadHTML($html);

    // Checking for minimum HTML to work with.
    if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
        $this->logger->emergency('No body tag present or body tag empty');

        throw new ParseException('Invalid or incomplete HTML.');
    }

    $this->getMetadata();

    $this->getMainImage();

    while (true) {
        $root = $root->firstChild;

        $elementsToScore = $this->getNodes($root);
        $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));

        $result = $this->rateNodes($elementsToScore);

        /*
         * Now that we've gone through the full algorithm, check to see if
         * we got any meaningful content. If we didn't, we may need to re-run
         * grabArticle with different flags set. This gives us a higher likelihood of
         * finding the content, and the sieve approach gives us a higher likelihood of
         * finding the -right- content.
         *
         * rateNodes() is documented as possibly returning false, so only read
         * textContent when there is a node to read it from.
         */
        $length = $result
            ? mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent))
            : 0;

        $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getWordThreshold()));

        if (!$result || $length >= $this->configuration->getWordThreshold()) {
            break;
        }

        // Not enough text: start over from a pristine copy of the document and remember this attempt.
        $this->dom = $this->loadHTML($html);
        $root = $this->dom->getElementsByTagName('body')->item(0);
        $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];

        if ($this->configuration->getStripUnlikelyCandidates()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
            $this->configuration->setStripUnlikelyCandidates(false);
            continue;
        }

        if ($this->configuration->getWeightClasses()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
            $this->configuration->setWeightClasses(false);
            continue;
        }

        if ($this->configuration->getCleanConditionally()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
            $this->configuration->setCleanConditionally(false);
            continue;
        }

        $this->logger->debug('[Parsing] Threshold not met, searching across attempts for some content.');

        // No luck after removing flags, just return the longest text we found during the different loops.
        // The comparator has to return an integer (negative, zero or positive), not a boolean.
        usort($this->attempts, function ($a, $b) {
            return $b['textLength'] - $a['textLength'];
        });

        // But first check if we actually have something
        if (!$this->attempts[0]['textLength']) {
            $this->logger->emergency('[Parsing] Could not parse text, giving up :(');

            throw new ParseException('Could not parse text.');
        }

        $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');

        $result = $this->attempts[0]['articleContent'];
        break;
    }

    $result = $this->postProcessContent($result);

    // If we haven't found an excerpt in the article's metadata, use the article's
    // first paragraph as the excerpt. This can be used for displaying a preview of
    // the article's content.
    if (!$this->getExcerpt()) {
        $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
        $paragraphs = $result->getElementsByTagName('p');
        if ($paragraphs->length > 0) {
            $this->setExcerpt(trim($paragraphs->item(0)->textContent));
        }
    }

    $this->setContent($result);

    $this->logger->info('*** Parse successful :)');

    return true;
}
236
/**
 * Creates a DOM Document object and loads the provided HTML on it.
 *
 * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
 * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
 * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
 * objects and ruining the backup.
 *
 * After loading, script/noscript nodes are removed and the document is
 * prepared through prepDocument().
 *
 * @param string $html
 *
 * @return DOMDocument
 */
private function loadHTML($html)
{
    $this->logger->debug('[Loading] Loading HTML...');

    // To avoid throwing a gazillion of errors on malformed HTMLs
    libxml_use_internal_errors(true);

    $dom = new DOMDocument('1.0', 'utf-8');

    if (!$this->configuration->getSubstituteEntities()) {
        // Keep the original HTML entities
        $dom->substituteEntities = false;
    }

    if ($this->configuration->getNormalizeEntities()) {
        $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
        // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
        // NOTE(review): the 'HTML-ENTITIES' pseudo encoding is deprecated as of PHP 8.2 - confirm the
        // supported PHP versions before replacing it.
        $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
    }

    if ($this->configuration->getSummonCthulhu()) {
        $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
        // Tag names are case-insensitive in HTML, so <SCRIPT> has to be matched too.
        $withoutScripts = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/i', '', $html);
        // preg_replace() returns null when PCRE fails (e.g. backtrack limit reached). Keep the markup
        // in that case instead of loading an empty document; removeScripts() below still drops the
        // script nodes from the DOM.
        if ($withoutScripts !== null) {
            $html = $withoutScripts;
        }
    }

    // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
    $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
    $dom->encoding = 'UTF-8';

    // The parser warnings were buffered instead of raised; drop them so the buffer
    // does not grow with every document (re)load.
    libxml_clear_errors();

    $this->removeScripts($dom);

    $this->prepDocument($dom);

    $this->logger->debug('[Loading] Loaded HTML successfully.');

    return $dom;
}
286
/**
 * Reads the <meta> tags of the document and fills author, excerpt, title and
 * image from them.
 *
 * Excerpt priority: description, og:description, twitter:description.
 * Title: the result of getArticleTitle() first, then og:title, then twitter:title.
 * Image: og:image, then twitter:image.
 */
private function getMetadata()
{
    $this->logger->debug('[Metadata] Retrieving metadata...');

    // "description", "title" or "image" in the name attribute, optionally
    // prefixed with "twitter:" (Twitter Cards).
    $namePattern = '/^\s*((twitter)\s*:\s*)?(description|title|image)\s*$/i';

    // Facebook's Open Graph equivalents in the property attribute.
    $propertyPattern = '/^\s*og\s*:\s*(description|title|image)\s*$/i';

    $values = [];

    foreach ($this->dom->getElementsByTagName('meta') as $meta) {
        /* @var DOMNode $meta */
        $elementName = $meta->getAttribute('name');
        $elementProperty = $meta->getAttribute('property');

        if ($elementName === 'author' || $elementProperty === 'author') {
            $this->logger->info(sprintf('[Metadata] Found author: \'%s\'', $meta->getAttribute('content')));
            $this->setAuthor($meta->getAttribute('content'));
            continue;
        }

        if (preg_match($namePattern, $elementName)) {
            $name = $elementName;
        } elseif (preg_match($propertyPattern, $elementProperty)) {
            $name = $elementProperty;
        } else {
            continue;
        }

        $content = $meta->getAttribute('content');
        if (!$content) {
            continue;
        }

        // Lowercase and whitespace-free keys so they can be looked up below.
        $key = preg_replace('/\s/', '', strtolower($name));
        $values[$key] = trim($content);
    }

    // Plain description first, then Open Graph, then Twitter Cards.
    foreach (['description', 'og:description', 'twitter:description'] as $key) {
        if (array_key_exists($key, $values)) {
            $this->logger->info(sprintf('[Metadata] Found excerpt in \'%s\' tag: \'%s\'', $key, $values[$key]));
            $this->setExcerpt($values[$key]);
            break;
        }
    }

    $this->setTitle($this->getArticleTitle());

    if (!$this->getTitle()) {
        // Open Graph title first, then Twitter Cards title.
        foreach (['og:title', 'twitter:title'] as $key) {
            if (array_key_exists($key, $values)) {
                $this->logger->info(sprintf('[Metadata] Found title in \'%s\' tag: \'%s\'', $key, $values[$key]));
                $this->setTitle($values[$key]);
                break;
            }
        }
    }

    // Open Graph image wins over the Twitter Cards one.
    foreach (['og:image', 'twitter:image'] as $key) {
        if (array_key_exists($key, $values)) {
            $this->logger->info(sprintf('[Metadata] Found main image in \'%s\' tag: \'%s\'', $key, $values[$key]));
            $this->setImage($values[$key]);
            break;
        }
    }
}
367
/**
 * Returns all the images of the parsed article.
 *
 * The main image (when set) comes first, followed by the src attribute of
 * every <img> found in the document returned by getDOMDocument(). Empty
 * values and duplicates are dropped and the result is a sequentially indexed
 * list. URLs are made absolute when FixRelativeURLs is enabled.
 *
 * @return array
 */
public function getImages()
{
    $result = [];

    $mainImage = $this->getImage();
    if ($mainImage) {
        $result[] = $mainImage;
    }

    $document = $this->getDOMDocument();
    if (null == $document) {
        return $result;
    }

    foreach ($document->getElementsByTagName('img') as $img) {
        $src = $img->getAttribute('src');
        if ($src) {
            $result[] = $src;
        }
    }

    if ($this->configuration->getFixRelativeURLs()) {
        // array_map instead of foreach-by-reference: no dangling reference is left behind.
        $result = array_map([$this, 'toAbsoluteURI'], $result);
    }

    // array_filter() and array_unique() both preserve keys, which would leave gaps
    // in the returned array. Reindex so callers always receive a plain list.
    return array_values(array_unique(array_filter($result)));
}
400
/**
 * Tries to get the main article image. Will only search the document when the
 * getMetadata function couldn't find an image.
 *
 * The fallback is the href of the first <link> whose rel attribute is
 * "img_src" or "image_src". The image is stored through setImage(), converted
 * to an absolute URI when FixRelativeURLs is enabled. Nothing is returned.
 */
public function getMainImage()
{
    // Image already found in the metadata, if any.
    $imgUrl = $this->getImage();

    if (!$imgUrl) {
        foreach ($this->dom->getElementsByTagName('link') as $link) {
            /** @var \DOMElement $link */
            /*
             * The rel attribute has to be either img_src or image_src and the href
             * attribute, which should hold the image url, has to exist.
             */
            if (in_array($link->getAttribute('rel'), ['img_src', 'image_src'], true) && $link->hasAttribute('href')) {
                $imgUrl = $link->getAttribute('href');
                break;
            }
        }
    }

    if (empty($imgUrl)) {
        return;
    }

    if ($this->configuration->getFixRelativeURLs()) {
        $imgUrl = $this->toAbsoluteURI($imgUrl);
    }

    // Store the image even when URLs are not being fixed, otherwise an image found
    // in a <link> tag would be silently discarded.
    $this->setImage($imgUrl);
}
431
/**
 * Returns the title of the html. Prioritizes the title from the metadata against the title tag.
 *
 * The raw title is cleaned up: a trailing (or leading) part split off by a
 * separator (|, -, \, /, > or ») is removed, the part after a colon is used
 * when no h1/h2 holds the whole title, and a single H1 replaces titles that
 * are very long or very short. When the clean up leaves too little, the
 * original title is returned instead.
 *
 * @return string|null Null when neither a stored title nor a <title> tag exists
 */
private function getArticleTitle()
{
    $originalTitle = null;

    if ($this->getTitle()) {
        $originalTitle = $this->getTitle();
    } else {
        $this->logger->debug('[Metadata] Could not find title in metadata, searching for the title tag...');
        $titleTag = $this->dom->getElementsByTagName('title');
        if ($titleTag->length > 0) {
            $this->logger->info(sprintf('[Metadata] Using title tag as article title: \'%s\'', $titleTag->item(0)->nodeValue));
            $originalTitle = $titleTag->item(0)->nodeValue;
        }
    }

    if ($originalTitle === null) {
        return null;
    }

    $curTitle = $originalTitle;
    $titleHadHierarchicalSeparators = false;

    /*
     * If there's a separator in the title, first remove the final part
     *
     * The patterns carry the "u" modifier because "»" is a multibyte character:
     * without it the character class works on single bytes, " » " never matches
     * and a replacement can cut the character in half.
     */
    if (preg_match('/ [\|\-\\\\\/>»] /iu', $curTitle)) {
        $titleHadHierarchicalSeparators = (bool)preg_match('/ [\\\\\/>»] /u', $curTitle);
        $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/iu', '$1', $originalTitle);

        $this->logger->info(sprintf('[Metadata] Found hierarchical separators in title, new title is: \'%s\'', $curTitle));

        // If the resulting title is too short (fewer than 3 words), remove
        // the first part instead:
        if (count(preg_split('/\s+/', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/iu', '$1', $originalTitle);
            $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
        }
    } elseif (strpos($curTitle, ': ') !== false) {
        // Check if we have an heading containing this exact string, so we
        // could assume it's the full title.
        $match = false;
        for ($i = 1; $i <= 2; $i++) {
            foreach ($this->dom->getElementsByTagName('h' . $i) as $hTag) {
                // Trim texts to avoid having false negatives when the title is surrounded by spaces or tabs
                if (trim($hTag->nodeValue) === trim($curTitle)) {
                    $match = true;
                }
            }
        }

        // If we don't, let's extract the title out of the original title string.
        if (!$match) {
            $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);

            $this->logger->info(sprintf('[Metadata] Title has a colon in the middle, new title is: \'%s\'', $curTitle));

            // If the title is now too short, try the first colon instead:
            if (count(preg_split('/\s+/', $curTitle)) < 3) {
                $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
                $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
            } elseif (count(preg_split('/\s+/', substr($originalTitle, 0, strpos($originalTitle, ':')))) > 5) {
                // But if we have too many words before the colon there's something weird
                // with the titles and the H tags so let's just use the original title instead.
                // The words are counted on the original title: $curTitle holds the text after
                // the last colon at this point and no longer contains one.
                $curTitle = $originalTitle;
            }
        }
    } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
        $hOnes = $this->dom->getElementsByTagName('h1');

        if ($hOnes->length === 1) {
            $curTitle = $hOnes->item(0)->nodeValue;
            $this->logger->info(sprintf('[Metadata] Using title from an H1 node: \'%s\'', $curTitle));
        }
    }

    $curTitle = trim($curTitle);

    /*
     * If we now have 4 words or fewer as our title, and either no
     * 'hierarchical' separators (\, /, > or ») were found in the original
     * title or we decreased the number of words by more than 1 word, use
     * the original title.
     */
    $curTitleWordCount = count(preg_split('/\s+/', $curTitle));
    // The cast guards against the null preg_replace() returns when the subject is not valid UTF-8.
    $originalTitleWordCount = count(preg_split('/\s+/', (string)preg_replace('/[\|\-\\\\\/>»]+/u', '', $originalTitle))) - 1;

    if ($curTitleWordCount <= 4 &&
        (!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) {
        $curTitle = $originalTitle;

        $this->logger->info(sprintf('[Metadata] Cleaned title is too short, using the original title: \'%s\'', $curTitle));
    }

    return $curTitle;
}
535
/**
 * Convert URI to an absolute URI.
 *
 * Absolute URIs (anything starting with a scheme) and hash URIs are returned
 * untouched. Everything else is resolved against the path information that
 * getPathInfo() derives from the configured original URL.
 *
 * @param $uri string URI to convert
 *
 * @return string
 */
private function toAbsoluteURI($uri)
{
    // If this is already an absolute URI, return it.
    if (preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/', $uri)) {
        return $uri;
    }

    // Ignore hash URIs:
    if (substr($uri, 0, 1) === '#') {
        return $uri;
    }

    // Only the remaining cases need the base URL, so it is computed after the
    // early returns above.
    list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL());

    // Scheme-rooted relative URI.
    if (substr($uri, 0, 2) === '//') {
        return $scheme . '://' . substr($uri, 2);
    }

    // Prepath-rooted relative URI.
    if (substr($uri, 0, 1) === '/') {
        return $prePath . $uri;
    }

    // Dotslash relative URI.
    if (strpos($uri, './') === 0) {
        return $pathBase . substr($uri, 2);
    }

    // Standard relative URI; add entire path. pathBase already includes a
    // trailing "/".
    return $pathBase . $uri;
}
575
/**
 * Returns full path info of an URL.
 *
 * - $pathBase: scheme, host and directory of the URL (with a trailing "/"),
 *   adjusted by the base URI of the document when there is one.
 * - $scheme: scheme of $pathBase.
 * - $prePath: scheme and host of $pathBase, without a trailing "/".
 *
 * NOTE(review): the port of the URL is not part of any of the returned
 * values - confirm whether non default ports need to be supported.
 *
 * @param string $url
 *
 * @return array [$pathBase, $scheme, $prePath]
 */
public function getPathInfo($url)
{
    $origin = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST);

    /*
     * Directory of the URL: everything up to and including the last "/" of the
     * path. dirname() is not suitable here, it returns "/" for first level pages
     * (which ended up as "//" once the trailing slash was appended) and strips
     * the last segment of URLs that already point to a directory ("/blog/").
     */
    $path = (string)parse_url($url, PHP_URL_PATH);
    $lastSlash = strrpos($path, '/');
    $directory = ($lastSlash === false) ? '/' : substr($path, 0, $lastSlash + 1);

    // Check for base URLs
    $baseURI = $this->dom->baseURI;
    if ($baseURI !== null) {
        if (substr($baseURI, 0, 1) === '/') {
            // URLs starting with '/' override completely the URL defined in the link
            $pathBase = $origin . $baseURI;
        } else {
            // Otherwise just prepend the base to the actual path
            $pathBase = $origin . $directory . rtrim($baseURI, '/') . '/';
        }
    } else {
        $pathBase = $origin . $directory;
    }

    $scheme = parse_url($pathBase, PHP_URL_SCHEME);
    $prePath = $scheme . '://' . parse_url($pathBase, PHP_URL_HOST);

    return [$pathBase, $scheme, $prePath];
}
603
/**
 * Gets nodes from the root element.
 *
 * Walks the tree starting at $node, removing comments, bylines, unlikely
 * candidates (when StripUnlikelyCandidates is on) and empty block nodes, and
 * turning DIVs that are used as paragraphs into P nodes. Nodes whose tag is
 * listed in $defaultTagsToScore, plus the P nodes created on the way, are
 * collected for scoring.
 *
 * @param $node DOMNode|DOMText
 *
 * @return array Nodes to be scored by rateNodes()
 */
private function getNodes($node)
{
    $this->logger->info('[Get Nodes] Retrieving nodes...');

    $stripUnlikelyCandidates = $this->configuration->getStripUnlikelyCandidates();

    $elementsToScore = [];

    // Tags that are dropped when they hold no content (e.g. text, image, video, or iframe).
    $removableWhenEmpty = ['div', 'section', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'];

    /*
     * First, node prepping. Trash nodes that look cruddy (like ones with the
     * class name "comment", etc), and turn divs into P tags where they have been
     * used inappropriately (as in, where they contain no other block level elements.)
     *
     * Log excerpts are cut with mb_substr() so a multibyte character is never
     * split in half, which would hand invalid UTF-8 to the logger.
     */

    while ($node) {
        $matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');

        // Remove DOMComments nodes as we don't need them and mess up children counting
        if ($node->nodeType === XML_COMMENT_NODE) {
            $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', mb_substr($node->nodeValue, 0, 128)));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Check to see if this node is a byline, and remove it if it is.
        if ($this->checkByline($node, $matchString)) {
            $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', mb_substr($node->nodeValue, 0, 128)));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Remove unlikely candidates
        if ($stripUnlikelyCandidates) {
            if (
                preg_match(NodeUtility::$regexps['unlikelyCandidates'], $matchString) &&
                !preg_match(NodeUtility::$regexps['okMaybeItsACandidate'], $matchString) &&
                $node->nodeName !== 'body' &&
                $node->nodeName !== 'a'
            ) {
                $this->logger->debug(sprintf('[Get Nodes] Removing unlikely candidate. Node content was: \'%s\'', mb_substr($node->nodeValue, 0, 128)));
                $node = NodeUtility::removeAndGetNext($node);
                continue;
            }
        }

        // Remove DIV, SECTION, HEADER, heading and P nodes without any content.
        if (in_array($node->nodeName, $removableWhenEmpty, true) && $node->isElementWithoutContent()) {
            $this->logger->debug(sprintf('[Get Nodes] Removing empty \'%s\' node.', $node->nodeName));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        if (in_array(strtolower($node->nodeName), $this->defaultTagsToScore)) {
            $this->logger->debug(sprintf('[Get Nodes] Adding node to score list, node content is: \'%s\'', mb_substr($node->nodeValue, 0, 128)));
            $elementsToScore[] = $node;
        }

        // Turn all divs that don't have children block level elements into p's
        if ($node->nodeName === 'div') {
            /*
             * Sites like http://mobile.slate.com encloses each paragraph with a DIV
             * element. DIVs with only a P element inside and no text content can be
             * safely converted into plain P elements to avoid confusing the scoring
             * algorithm with DIVs with are, in practice, paragraphs.
             */
            if ($node->hasSinglePNode()) {
                $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', mb_substr($node->nodeValue, 0, 128)));
                $pNode = $node->getChildren(true)[0];
                $node->parentNode->replaceChild($pNode, $node);
                $node = $pNode;
                $elementsToScore[] = $node;
            } elseif (!$node->hasSingleChildBlockElement()) {
                // The message used to claim the opposite of the condition being tested.
                $this->logger->debug(sprintf('[Get Nodes] Found DIV without child block elements, converting to a P node. Node content is: \'%s\'', mb_substr($node->nodeValue, 0, 128)));
                $node = NodeUtility::setNodeTag($node, 'p');
                $elementsToScore[] = $node;
            } else {
                // EXPERIMENTAL: wrap every non empty text node of the DIV in its own P node.
                foreach ($node->getChildren() as $child) {
                    /** @var $child DOMNode */
                    if ($child->nodeType === XML_TEXT_NODE && mb_strlen(trim($child->getTextContent())) > 0) {
                        $this->logger->debug(sprintf('[Get Nodes] Found DIV a text node inside, converting to a P node. Node content is: \'%s\'', mb_substr($node->nodeValue, 0, 128)));
                        $newNode = $node->createNode($child, 'p');
                        $child->parentNode->replaceChild($newNode, $child);
                    }
                }
            }
        }

        $node = NodeUtility::getNextNode($node);
    }

    return $elementsToScore;
}
708
/**
 * Checks if the node is a byline.
 *
 * A node is a byline when it is marked with rel="author" or its class/id
 * string matches the byline regexp, and its text passes isValidByline().
 * The first byline found is stored as the author. Nothing is checked when
 * ArticleByLine is disabled or an author is already set.
 *
 * @param DOMNode $node
 * @param string $matchString Class and id of the node, joined by a space
 *
 * @return bool True when the node was recognised as the byline
 */
private function checkByline($node, $matchString)
{
    if (!$this->configuration->getArticleByLine()) {
        return false;
    }

    /*
     * Check if the byline is already set
     */
    if ($this->getAuthor()) {
        return false;
    }

    $rel = $node->getAttribute('rel');

    // "&&" binds tighter than "||": evaluate the two detection rules first so the
    // validity check applies to rel="author" nodes as well.
    $looksLikeByline = $rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString);

    if ($looksLikeByline && $this->isValidByline($node->getTextContent())) {
        $this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $node->getTextContent()));
        $this->setAuthor(trim($node->getTextContent()));

        return true;
    }

    return false;
}
741
/**
 * Checks the validity of a byLine. Based on string length.
 *
 * A byline is valid when it is a string that, once trimmed, has between 1
 * and 99 characters. Surrounding whitespace does not count towards the
 * length.
 *
 * @param string $text
 *
 * @return bool
 */
private function isValidByline($text)
{
    if (!is_string($text)) {
        return false;
    }

    // Both bounds are checked on the trimmed text; markup indentation around a
    // short byline must not make it look too long.
    $length = mb_strlen(trim($text));

    return $length > 0 && $length < 100;
}
759
/**
 * Strips every <script> and <noscript> node from the document.
 *
 * @param DOMDocument $dom
 */
private function removeScripts(DOMDocument $dom)
{
    foreach (['script', 'noscript'] as $tagName) {
        // Look the tag up again after every removal and stop once none is left.
        do {
            $first = $dom->getElementsByTagName($tagName)->item(0);

            if ($first) {
                $first->parentNode->removeChild($first);
            }
        } while ($first);
    }
}
779
/**
 * Prepares the document for parsing.
 *
 * Chains of two or more <br> nodes are replaced by a <p> that takes over the
 * siblings following the chain, and <font> tags are converted into <span> tags.
 *
 * @param DOMDocument $dom
 */
private function prepDocument(DOMDocument $dom)
{
    $this->logger->info('[PrepDocument] Preparing document for parsing...');

    /*
     * Work on a snapshot of the node list: removing nodes while looping over
     * the list itself shifts the remaining entries.
     *
     * The list is walked forwards on purpose, a <br> chain is detected by
     * looking at the nodes that follow its first <br>.
     */
    $lineBreaks = iterator_to_array($dom->getElementsByTagName('br'));

    foreach ($lineBreaks as $br) {
        // Becomes true once at least one extra <br> of a chain has been removed.
        $chainRemoved = false;

        /*
         * Drop every <br> that follows this one (whatever nextElement() skips in
         * between is ignored) until another element or real content shows up.
         * The first <br> of the chain stays in place for now.
         */
        $cursor = $br->nextSibling;
        while (($cursor = NodeUtility::nextElement($cursor)) && ($cursor->nodeName === 'br')) {
            $this->logger->debug('[PrepDocument] Removing chain of BR nodes...');

            $chainRemoved = true;
            $afterBr = $cursor->nextSibling;
            $cursor->parentNode->removeChild($cursor);
            $cursor = $afterBr;
        }

        if (!$chainRemoved) {
            continue;
        }

        /*
         * A chain was removed: swap the remaining <br> for a <p> and move the
         * following siblings into it until the next <br> chain starts.
         */
        $paragraph = $dom->createElement('p');
        $br->parentNode->replaceChild($paragraph, $br);

        $cursor = $paragraph->nextSibling;
        while ($cursor) {
            // If we've hit another <br><br>, we're done adding children to this <p>.
            // NOTE(review): nextElement() receives the <br> itself here, not its next
            // sibling - confirm against NodeUtility::nextElement() that this detects a chain.
            if ($cursor->nodeName === 'br') {
                $nextElem = NodeUtility::nextElement($cursor);
                if ($nextElem && $nextElem->nodeName === 'br') {
                    break;
                }
            }

            $this->logger->debug('[PrepDocument] Replacing BR with a P node...');

            // Otherwise, make this node a child of the new <p>.
            $following = $cursor->nextSibling;
            $paragraph->appendChild($cursor);
            $cursor = $following;
        }
    }

    // Replace font tags with span, going from the last tag of the list to the first.
    $fonts = $dom->getElementsByTagName('font');
    $length = $fonts->length;
    for ($index = $length - 1; $index >= 0; $index--) {
        $this->logger->debug('[PrepDocument] Converting font tag into a span tag.');
        $font = $fonts->item($index);
        NodeUtility::setNodeTag($font, 'span', true);
    }
}
859
860 /**
861 * Assign scores to each node. Returns full article parsed or false on error.
862 *
863 * @param array $nodes
864 *
865 * @return DOMDocument|bool
866 */
867 private function rateNodes($nodes)
868 {
869 $this->logger->info('[Rating] Rating nodes...');
870
871 $candidates = [];
872
873 /** @var DOMElement $node */
874 foreach ($nodes as $node) {
875 if (is_null($node->parentNode)) {
876 continue;
877 }
878
879 // Discard nodes with less than 25 characters, without blank space
880 if (mb_strlen($node->getTextContent(true)) < 25) {
881 continue;
882 }
883
884 $ancestors = $node->getNodeAncestors();
885
886 // Exclude nodes with no ancestor
887 if (count($ancestors) === 0) {
888 continue;
889 }
890
891 // Start with a point for the paragraph itself as a base.
892 $contentScore = 1;
893
894 // Add points for any commas within this paragraph.
895 $contentScore += count(explode(',', $node->getTextContent(true)));
896
897 // For every 100 characters in this paragraph, add another point. Up to 3 points.
898 $contentScore += min(floor(mb_strlen($node->getTextContent(true)) / 100), 3);
899
900 $this->logger->debug(sprintf('[Rating] Node score %s, content: \'%s\'', $contentScore, substr($node->nodeValue, 0, 128)));
901
902 /** @var $ancestor DOMElement */
903 foreach ($ancestors as $level => $ancestor) {
904 $this->logger->debug('[Rating] Found ancestor, initializing and adding it as a candidate...');
905 if (!$ancestor->isInitialized()) {
906 $ancestor->initializeNode($this->configuration->getWeightClasses());
907 $candidates[] = $ancestor;
908 }
909
910 /*
911 * Node score divider:
912 * - parent: 1 (no division)
913 * - grandparent: 2
914 * - great grandparent+: ancestor level * 3
915 */
916
917 if ($level === 0) {
918 $scoreDivider = 1;
919 } elseif ($level === 1) {
920 $scoreDivider = 2;
921 } else {
922 $scoreDivider = $level * 3;
923 }
924
925 $currentScore = $ancestor->contentScore;
926 $ancestor->contentScore = $currentScore + ($contentScore / $scoreDivider);
927
928 $this->logger->debug(sprintf('[Rating] Ancestor score %s, value: \'%s\'', $ancestor->contentScore, substr($ancestor->nodeValue, 0, 128)));
929 }
930 }
931
932 /*
933 * After we've calculated scores, loop through all of the possible
934 * candidate nodes we found and find the one with the highest score.
935 */
936
937 $topCandidates = [];
938 foreach ($candidates as $candidate) {
939
940 /*
941 * Scale the final candidates score based on link density. Good content
942 * should have a relatively small link density (5% or less) and be mostly
943 * unaffected by this operation.
944 */
945
946 $candidate->contentScore = $candidate->contentScore * (1 - $candidate->getLinkDensity());
947
948 for ($i = 0; $i < $this->configuration->getMaxTopCandidates(); $i++) {
949 $aTopCandidate = isset($topCandidates[$i]) ? $topCandidates[$i] : null;
950
951 if (!$aTopCandidate || $candidate->contentScore > $aTopCandidate->contentScore) {
952 array_splice($topCandidates, $i, 0, [$candidate]);
953 if (count($topCandidates) > $this->configuration->getMaxTopCandidates()) {
954 array_pop($topCandidates);
955 }
956 break;
957 }
958 }
959 }
960
961 $topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null;
962 $parentOfTopCandidate = null;
963
964 /*
965 * If we still have no top candidate, just use the body as a last resort.
966 * We also have to copy the body node so it is something we can modify.
967 */
968
969 if ($topCandidate === null || $topCandidate->nodeName === 'body') {
970 $this->logger->info('[Rating] No top candidate found or top candidate is the body tag. Moving all child nodes to a new DIV node.');
971
972 // Move all of the page's children into topCandidate
973 $topCandidate = new DOMDocument('1.0', 'utf-8');
974 $topCandidate->encoding = 'UTF-8';
975 $topCandidate->appendChild($topCandidate->createElement('div', ''));
976 $kids = $this->dom->getElementsByTagName('body')->item(0)->childNodes;
977
978 // Cannot be foreached, don't ask me why.
979 for ($i = 0; $i < $kids->length; $i++) {
980 $import = $topCandidate->importNode($kids->item($i), true);
981 $topCandidate->firstChild->appendChild($import);
982 }
983
984 // Candidate must be created using firstChild to grab the DOMElement instead of the DOMDocument.
985 $topCandidate = $topCandidate->firstChild;
986 } elseif ($topCandidate) {
987 $this->logger->info(sprintf('[Rating] Found top candidate, score: %s', $topCandidate->contentScore));
988 // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
989 // and whose scores are quite closed with current `topCandidate` node.
990 $alternativeCandidateAncestors = [];
991 for ($i = 1; $i < count($topCandidates); $i++) {
992 if ($topCandidates[$i]->contentScore / $topCandidate->contentScore >= 0.75) {
993 array_push($alternativeCandidateAncestors, $topCandidates[$i]->getNodeAncestors(false));
994 }
995 }
996
997 $MINIMUM_TOPCANDIDATES = 3;
998 if (count($alternativeCandidateAncestors) >= $MINIMUM_TOPCANDIDATES) {
999 $parentOfTopCandidate = $topCandidate->parentNode;
1000 while ($parentOfTopCandidate->nodeName !== 'body') {
1001 $listsContainingThisAncestor = 0;
1002 for ($ancestorIndex = 0; $ancestorIndex < count($alternativeCandidateAncestors) && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++) {
1003 $listsContainingThisAncestor += (int)in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex]);
1004 }
1005 if ($listsContainingThisAncestor >= $MINIMUM_TOPCANDIDATES) {
1006 $topCandidate = $parentOfTopCandidate;
1007 break;
1008 }
1009 $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
1010 }
1011 }
1012
1013 /*
1014 * Because of our bonus system, parents of candidates might have scores
1015 * themselves. They get half of the node. There won't be nodes with higher
1016 * scores than our topCandidate, but if we see the score going *up* in the first
1017 * few steps up the tree, that's a decent sign that there might be more content
1018 * lurking in other places that we want to unify in. The sibling stuff
1019 * below does some of that - but only if we've looked high enough up the DOM
1020 * tree.
1021 */
1022
1023 $parentOfTopCandidate = $topCandidate->parentNode;
1024 $lastScore = $topCandidate->contentScore;
1025
1026 // The scores shouldn't get too low.
1027 $scoreThreshold = $lastScore / 3;
1028
1029 /* @var DOMElement $parentOfTopCandidate */
1030 // Check if we are actually dealing with a DOMNode and not a DOMDocument node or higher
1031 while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
1032 $parentScore = $parentOfTopCandidate->contentScore;
1033 if ($parentScore < $scoreThreshold) {
1034 break;
1035 }
1036
1037 if ($parentScore > $lastScore) {
1038 // Alright! We found a better parent to use.
1039 $topCandidate = $parentOfTopCandidate;
1040 $this->logger->info('[Rating] Found a better top candidate.');
1041 break;
1042 }
1043 $lastScore = $parentOfTopCandidate->contentScore;
1044 $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
1045 }
1046
1047 // If the top candidate is the only child, use parent instead. This will help sibling
1048 // joining logic when adjacent content is actually located in parent's sibling node.
1049 $parentOfTopCandidate = $topCandidate->parentNode;
1050 while ($parentOfTopCandidate->nodeName !== 'body' && count($parentOfTopCandidate->getChildren(true)) === 1) {
1051 $topCandidate = $parentOfTopCandidate;
1052 $parentOfTopCandidate = $topCandidate->parentNode;
1053 }
1054 }
1055
1056 /*
1057 * Now that we have the top candidate, look through its siblings for content
1058 * that might also be related. Things like preambles, content split by ads
1059 * that we removed, etc.
1060 */
1061
1062 $this->logger->info('[Rating] Creating final article content document...');
1063
1064 $articleContent = new DOMDocument('1.0', 'utf-8');
1065 $articleContent->createElement('div');
1066
1067 $siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2);
1068 // Keep potential top candidate's parent node to try to get text direction of it later.
1069 $parentOfTopCandidate = $topCandidate->parentNode;
1070 $siblings = $parentOfTopCandidate->getChildren();
1071
1072 $hasContent = false;
1073
1074 $this->logger->info('[Rating] Adding top candidate siblings...');
1075
1076 /** @var DOMElement $sibling */
1077 foreach ($siblings as $sibling) {
1078 $append = false;
1079
1080 if ($sibling === $topCandidate) {
1081 $this->logger->debug('[Rating] Sibling is equal to the top candidate, adding to the final article...');
1082
1083 $append = true;
1084 } else {
1085 $contentBonus = 0;
1086
1087 // Give a bonus if sibling nodes and top candidates have the example same classname
1088 if ($sibling->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
1089 $contentBonus += $topCandidate->contentScore * 0.2;
1090 }
1091 if ($sibling->contentScore + $contentBonus >= $siblingScoreThreshold) {
1092 $append = true;
1093 } elseif ($sibling->nodeName === 'p') {
1094 $linkDensity = $sibling->getLinkDensity();
1095 $nodeContent = $sibling->getTextContent(true);
1096
1097 if (mb_strlen($nodeContent) > 80 && $linkDensity < 0.25) {
1098 $append = true;
1099 } elseif ($nodeContent && mb_strlen($nodeContent) < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) {
1100 $append = true;
1101 }
1102 }
1103 }
1104
1105 if ($append) {
1106 $this->logger->debug(sprintf('[Rating] Appending sibling to final article, content is: \'%s\'', substr($sibling->nodeValue, 0, 128)));
1107
1108 $hasContent = true;
1109
1110 if (!in_array(strtolower($sibling->nodeName), $this->alterToDIVExceptions)) {
1111 /*
1112 * We have a node that isn't a common block level element, like a form or td tag.
1113 * Turn it into a div so it doesn't get filtered out later by accident.
1114 */
1115
1116 $sibling = NodeUtility::setNodeTag($sibling, 'div');
1117 }
1118
1119 $import = $articleContent->importNode($sibling, true);
1120 $articleContent->appendChild($import);
1121
1122 /*
1123 * No node shifting needs to be check because when calling getChildren, an array is made with the
1124 * children of the parent node, instead of using the DOMElement childNodes function, which, when used
1125 * along with appendChild, would shift the nodes position and the current foreach will behave in
1126 * unpredictable ways.
1127 */
1128 }
1129 }
1130
1131 $articleContent = $this->prepArticle($articleContent);
1132
1133 if ($hasContent) {
1134 // Find out text direction from ancestors of final top candidate.
1135 $ancestors = array_merge([$parentOfTopCandidate, $topCandidate], $parentOfTopCandidate->getNodeAncestors());
1136 foreach ($ancestors as $ancestor) {
1137 $articleDir = $ancestor->getAttribute('dir');
1138 if ($articleDir) {
1139 $this->setDirection($articleDir);
1140 $this->logger->debug(sprintf('[Rating] Found article direction: %s', $articleDir));
1141 break;
1142 }
1143 }
1144
1145 return $articleContent;
1146 } else {
1147 return false;
1148 }
1149 }
1150
/**
 * Runs the clean-up pipeline over the extracted article document.
 *
 * The order of the steps matters: presentational attributes and <style> nodes
 * go first, data tables are flagged before any conditional cleaning, and the
 * conditional passes over table/ul/div run near the end because the earlier
 * removals affect what they measure.
 *
 * @param DOMDocument $article Article document; it is modified in place.
 *
 * @return DOMDocument The same document instance that was passed in.
 */
public function prepArticle(DOMDocument $article)
{
    $this->logger->info('[PrepArticle] Preparing final article...');

    $this->_cleanStyles($article);
    $this->_clean($article, 'style');

    // Flag data tables up front so the conditional cleaning below can leave them alone.
    $this->_markDataTables($article);

    // Junk removal, in the same order as always: conditional passes first, then outright removals.
    foreach (['form', 'fieldset'] as $conditionalTag) {
        $this->_cleanConditionally($article, $conditionalTag);
    }
    foreach (['object', 'embed', 'h1', 'footer', 'link'] as $junkTag) {
        $this->_clean($article, $junkTag);
    }

    // Only descendants of the top level nodes are matched against "share", so the
    // top candidates themselves survive even if their own id/class contains it.
    foreach ($article->childNodes as $topLevelNode) {
        $this->_cleanMatchedNodes($topLevelNode, '/share/i');
    }

    // A lone h2 that is roughly as long as the title and contains it (or is contained
    // by it) is treated as a repeated title and removed.
    $h2Nodes = $article->getElementsByTagName('h2');
    if ($h2Nodes->length === 1) {
        $title = $this->getTitle();
        $headingText = $h2Nodes->item(0)->textContent;
        $titleLength = mb_strlen($title);

        $lengthSimilarRate = (mb_strlen($headingText) - $titleLength) / max($titleLength, 1);

        if (abs($lengthSimilarRate) < 0.5) {
            // The longer string is the haystack, the shorter one the needle.
            $titlesMatch = $lengthSimilarRate > 0
                ? strpos($headingText, $title) !== false
                : strpos($title, $headingText) !== false;

            if ($titlesMatch) {
                $this->logger->info('[PrepArticle] Found title repeated in an H2 node, removing...');
                $this->_clean($article, 'h2');
            }
        }
    }

    foreach (['iframe', 'input', 'textarea', 'select', 'button'] as $junkTag) {
        $this->_clean($article, $junkTag);
    }
    $this->_cleanHeaders($article);

    // These go last because the removals above change what they measure.
    foreach (['table', 'ul', 'div'] as $conditionalTag) {
        $this->_cleanConditionally($article, $conditionalTag);
    }

    $this->_cleanExtraParagraphs($article);

    // Work on a snapshot of the br list because nodes are removed while looping.
    $lineBreaks = iterator_to_array($article->getElementsByTagName('br'));
    foreach ($lineBreaks as $lineBreak) {
        $following = $lineBreak->nextSibling;
        if (!$following || $following->nodeName !== 'p') {
            continue;
        }

        $this->logger->debug('[PrepArticle] Removing br node next to a p node.');
        $lineBreak->parentNode->removeChild($lineBreak);
    }

    return $article;
}
1232
/**
 * Flags every <table> in the document as either a 'data' table or a 'layout'
 * table through setReadabilityDataTable(), using checks similar to
 * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920.
 *
 * The checks run in order and the first one that applies decides:
 *  - role="presentation"                      => layout
 *  - datatable="0"                            => layout
 *  - non-empty summary attribute              => data
 *  - a <caption> with at least one child node => data
 *  - any col/colgroup/tfoot/thead/th inside   => data
 *  - a nested <table>                         => layout
 *  - 10 or more rows, or more than 4 columns  => data
 *  - otherwise                                => data only if rows * columns > 10
 *
 * @param DOMDocument $article
 *
 * @return void
 */
public function _markDataTables(DOMDocument $article)
{
    $dataTableDescendants = ['col', 'colgroup', 'tfoot', 'thead', 'th'];

    /** @var DOMElement $table */
    foreach ($article->getElementsByTagName('table') as $table) {
        if ($table->getAttribute('role') === 'presentation') {
            $table->setReadabilityDataTable(false);
            continue;
        }

        // Strict comparison: a loose == would also match numeric look-alikes such as
        // "0.0" or "00", which are not the literal opt-out value "0".
        if ($table->getAttribute('datatable') === '0') {
            $table->setReadabilityDataTable(false);
            continue;
        }

        if ($table->getAttribute('summary')) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        $captions = $table->getElementsByTagName('caption');
        if ($captions->length > 0 && $captions->item(0)->childNodes->length > 0) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        // If the table has a descendant with any of these tags, consider it a data table.
        foreach ($dataTableDescendants as $descendantTag) {
            if ($table->getElementsByTagName($descendantTag)->length > 0) {
                $table->setReadabilityDataTable(true);
                continue 2;
            }
        }

        // Nested tables indicate a layout table.
        if ($table->getElementsByTagName('table')->length > 0) {
            $table->setReadabilityDataTable(false);
            continue;
        }

        $sizeInfo = $table->getRowAndColumnCount();
        if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        // Nothing else decided it, so go by cell count alone.
        $table->setReadabilityDataTable($sizeInfo['rows'] * $sizeInfo['columns'] > 10);
    }
}
1292
/**
 * Removes the style attribute and deprecated presentational attributes from
 * the given node and, recursively, from everything below it.
 *
 * <svg> nodes are skipped together with their whole subtree. The width and
 * height attributes are only removed from table, th, td, hr and pre elements.
 *
 * @param $node DOMDocument|DOMNode
 *
 * @return void
 **/
public function _cleanStyles($node)
{
    $hasTagName = property_exists($node, 'tagName');

    if ($hasTagName && $node->tagName === 'svg') {
        return;
    }

    // Nodes without a removeAttribute method have nothing to strip, but their
    // children are still visited below.
    if (method_exists($node, 'removeAttribute')) {
        $presentationalAttributes = [
            'align', 'background', 'bgcolor', 'border', 'cellpadding', 'cellspacing',
            'frame', 'hspace', 'rules', 'style', 'valign', 'vspace',
        ];
        foreach ($presentationalAttributes as $attributeName) {
            $node->removeAttribute($attributeName);
        }

        $sizedElements = ['table', 'th', 'td', 'hr', 'pre'];
        if ($hasTagName && in_array($node->tagName, $sizedElements)) {
            $node->removeAttribute('width');
            $node->removeAttribute('height');
        }
    }

    for ($child = $node->firstChild; $child !== null; $child = $child->nextSibling) {
        $this->_cleanStyles($child);
    }
}
1325
/**
 * Removes the nodes below $node whose "class id" string matches $regex.
 *
 * The walk starts at the node NodeUtility::getNextNode() returns for $node and
 * stops at the node it returns when $node and its children are ignored, so
 * $node itself is never tested.
 *
 * @param $node DOMElement Node to clean
 * @param $regex string Pattern tested against "<class> <id>" of every visited node.
 *
 * @return void
 **/
public function _cleanMatchedNodes($node, $regex)
{
    $stopAt = NodeUtility::getNextNode($node, true);
    $current = NodeUtility::getNextNode($node);

    while ($current && $current !== $stopAt) {
        $class = $current->getAttribute('class');
        $id = $current->getAttribute('id');

        if (!preg_match($regex, sprintf('%s %s', $class, $id))) {
            $current = NodeUtility::getNextNode($current);
            continue;
        }

        $this->logger->debug(sprintf('Removing matched node with regex: \'%s\', node class was: \'%s\', id: \'%s\'', $regex, $class, $id));
        $current = NodeUtility::removeAndGetNext($current);
    }
}
1347
/**
 * Removes <p> nodes that carry no content: no img, embed, object or iframe
 * descendants, and no text left once everything matched by
 * NodeUtility::$regexps['onlyWhitespace'] has been stripped.
 *
 * The emptiness test compares against the empty string explicitly. A plain
 * truthiness check would also discard a paragraph whose whole text is "0",
 * because PHP treats the string "0" as false.
 *
 * @param DOMDocument $article
 *
 * @return void
 */
public function _cleanExtraParagraphs(DOMDocument $article)
{
    $paragraphs = $article->getElementsByTagName('p');
    $length = $paragraphs->length;

    // Walk the live list from the end so that removals do not shift the items still to visit.
    for ($index = $length - 1; $index >= 0; $index--) {
        $paragraph = $paragraphs->item($index);

        $mediaCount = 0;
        // At this point, nasty iframes have been removed, only embedded video ones remain.
        foreach (['img', 'embed', 'object', 'iframe'] as $mediaTag) {
            $mediaCount += $paragraph->getElementsByTagName($mediaTag)->length;
        }
        if ($mediaCount !== 0) {
            continue;
        }

        $remainingText = preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent);

        // null means preg_replace failed; that case keeps its previous outcome (removal).
        if ($remainingText === null || $remainingText === '') {
            $this->logger->debug(sprintf('[PrepArticle] Removing extra paragraph. Text content was: \'%s\'', substr($paragraph->textContent, 0, 128)));
            $paragraph->parentNode->removeChild($paragraph);
        }
    }
}
1374
/**
 * Removes elements named $tag that look like junk rather than content.
 *
 * Does nothing when the configuration has conditional cleaning turned off.
 * Otherwise every matching element is inspected, last to first:
 *  - it is skipped when hasAncestorTag() reports a table ancestor and the
 *    element is flagged as a readability data table;
 *  - it is removed when its class weight is negative (class weights are only
 *    consulted when the configuration enables them);
 *  - when its text holds fewer than 10 commas, it is removed if any of the
 *    image/paragraph ratio, list item count, input count, content length,
 *    link density or embed count checks below fires.
 *
 * @param DOMDocument $article
 * @param string $tag Tag name of the elements to inspect.
 *
 * @return void
 */
public function _cleanConditionally(DOMDocument $article, $tag)
{
    if (!$this->configuration->getCleanConditionally()) {
        return;
    }

    $isList = in_array($tag, ['ul', 'ol']);

    /*
     * Traverse the live list backwards so nodes can be removed
     * without affecting the items that are still to be visited.
     */
    $candidates = $article->getElementsByTagName($tag);

    for ($index = $candidates->length - 1; $index >= 0; $index--) {
        /** @var $node DOMElement */
        $node = $candidates->item($index);

        // First check if we're in a data table, in which case don't remove us.
        if ($node->hasAncestorTag($node, 'table', -1) && $node->isReadabilityDataTable()) {
            continue;
        }

        $weight = $this->configuration->getWeightClasses() ? $node->getClassWeight() : 0;

        if ($weight < 0) {
            $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\' with 0 or less weight', $tag));

            NodeUtility::removeNode($node);
            continue;
        }

        // Plenty of commas is taken as a sign of real prose; leave such nodes alone.
        if (substr_count($node->getTextContent(), ',') >= 10) {
            continue;
        }

        $p = $node->getElementsByTagName('p')->length;
        $img = $node->getElementsByTagName('img')->length;
        // The count is offset by 100 before it is compared with the paragraph count.
        $li = $node->getElementsByTagName('li')->length - 100;
        $input = $node->getElementsByTagName('input')->length;

        // Only embeds whose serialisation matches the videos pattern are counted.
        $embedCount = 0;
        foreach ($node->getElementsByTagName('embed') as $embedNode) {
            if (preg_match(NodeUtility::$regexps['videos'], $embedNode->C14N())) {
                $embedCount++;
            }
        }

        $linkDensity = $node->getLinkDensity();
        $contentLength = mb_strlen($node->getTextContent(true));

        // Checks are tried in order and stop at the first one that fires.
        $haveToRemove = false;
        if ($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag($node, 'figure')) {
            $haveToRemove = true;
        } elseif (!$isList && $li > $p) {
            $haveToRemove = true;
        } elseif ($input > floor($p / 3)) {
            $haveToRemove = true;
        } elseif (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag($node, 'figure')) {
            $haveToRemove = true;
        } elseif (!$isList && $weight < 25 && $linkDensity > 0.2) {
            $haveToRemove = true;
        } elseif ($weight >= 25 && $linkDensity > 0.5) {
            $haveToRemove = true;
        } elseif (($embedCount === 1 && $contentLength < 75) || $embedCount > 1) {
            $haveToRemove = true;
        }

        if ($haveToRemove) {
            $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\'.', $tag));

            NodeUtility::removeNode($node);
        }
    }
}
1458
/**
 * Removes every element named $tag from the document.
 *
 * For object, embed and iframe an element is kept when either its attribute
 * values or its serialised markup match NodeUtility::$regexps['videos'].
 *
 * @param $article DOMDocument
 * @param $tag string tag to clean
 *
 * @return void
 **/
public function _clean(DOMDocument $article, $tag)
{
    $isEmbed = in_array($tag, ['object', 'embed', 'iframe']);

    // Visit the live list from the end so removals do not disturb the remaining items.
    $matches = $article->getElementsByTagName($tag);

    for ($index = $matches->length - 1; $index >= 0; $index--) {
        $item = $matches->item($index);

        if ($isEmbed) {
            $values = [];
            foreach ($item->attributes as $attribute) {
                $values[] = $attribute->nodeValue;
            }
            $joinedValues = implode('|', $values);

            // Attribute values are checked first; the element's own markup is only
            // serialised and checked when the attributes did not match.
            if (preg_match(NodeUtility::$regexps['videos'], $joinedValues)
                || preg_match(NodeUtility::$regexps['videos'], $item->C14N())
            ) {
                continue;
            }
        }

        $this->logger->debug(sprintf('[PrepArticle] Removing node \'%s\'.', $item->tagName));

        NodeUtility::removeNode($item);
    }
}
1500
/**
 * Removes h1 and h2 elements whose class weight is negative.
 *
 * Class weights are only consulted when the configuration enables them;
 * otherwise the weight stays at 0 and nothing is removed.
 *
 * @param DOMDocument $article
 *
 * @return void
 **/
public function _cleanHeaders(DOMDocument $article)
{
    for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) {
        // getElementsByTagName() returns a live list. Removing a node while iterating
        // it directly shifts the remaining items and can skip the next header, so
        // iterate over a snapshot instead.
        $headers = iterator_to_array($article->getElementsByTagName('h' . $headerIndex));

        /** @var $header DOMElement */
        foreach ($headers as $header) {
            $weight = 0;
            if ($this->configuration->getWeightClasses()) {
                $weight = $header->getClassWeight();
            }

            if ($weight < 0) {
                $this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128)));

                NodeUtility::removeNode($header);
            }
        }
    }
}
1527
/**
 * Strips the class attribute from the given node and from every node in the
 * subtree below it.
 *
 * Readability.js has a special filter to avoid cleaning the classes that the
 * algorithm adds. No classes are added here, so no such filter is needed.
 *
 * @param DOMDocument|DOMNode $node
 *
 * @return void
 **/
public function _cleanClasses($node)
{
    if ($node->getAttribute('class') !== '') {
        $node->removeAttribute('class');
    }

    $child = $node->firstChild;
    while ($child !== null) {
        $this->_cleanClasses($child);
        $child = $child->nextSibling;
    }
}
1549
/**
 * Final pass over the article: optionally fixes link and image URLs, then
 * strips every class attribute.
 *
 * When the configuration asks for relative URLs to be fixed:
 *  - links whose href starts with 'javascript:' are replaced by their text content;
 *  - every other non-empty href is passed through toAbsoluteURI();
 *  - each img gets its src set from the first non-empty value among src,
 *    data-src, data-original, data-orig and data-url, passed through toAbsoluteURI().
 *
 * @param DOMDocument $article Article document; it is modified in place.
 *
 * @return DOMDocument The same document instance that was passed in.
 */
public function postProcessContent(DOMDocument $article)
{
    $this->logger->info('[PostProcess] PostProcessing content...');

    // Readability cannot open relative uris so we convert them to absolute uris.
    if ($this->configuration->getFixRelativeURLs()) {
        // Snapshot of the links: some of them get replaced by text nodes below.
        $links = iterator_to_array($article->getElementsByTagName('a'));

        /** @var DOMElement $link */
        foreach ($links as $link) {
            $href = $link->getAttribute('href');
            if (!$href) {
                continue;
            }

            if (strpos($href, 'javascript:') !== 0) {
                $this->logger->debug(sprintf('[PostProcess] Converting link to absolute URI: \'%s\'', substr($href, 0, 128)));

                $link->setAttribute('href', $this->toAbsoluteURI($href));
                continue;
            }

            // javascript: URIs won't work after scripts have been removed from the page,
            // so only the text of such links is kept.
            $this->logger->debug(sprintf('[PostProcess] Removing \'javascript:\' link. Content is: \'%s\'', substr($link->textContent, 0, 128)));

            $replacement = $article->createTextNode($link->textContent);
            $link->parentNode->replaceChild($replacement, $link);
        }

        // Possible sources of the image url, in order of preference.
        $sourceAttributes = ['src', 'data-src', 'data-original', 'data-orig', 'data-url'];

        /** @var DOMElement $img */
        foreach ($article->getElementsByTagName('img') as $img) {
            $src = false;
            foreach ($sourceAttributes as $attributeName) {
                $candidate = $img->getAttribute($attributeName);
                if ($candidate) {
                    $src = $candidate;
                    break;
                }
            }

            if ($src) {
                $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($src, 0, 128)));

                $img->setAttribute('src', $this->toAbsoluteURI($src));
            }
        }
    }

    $this->_cleanClasses($article);

    return $article;
}
1607
/**
 * Renders the article as an HTML fragment: the title wrapped in an <h1>,
 * followed by the serialised content.
 *
 * The title is escaped before it is embedded so that characters such as
 * '<' or '&' in it cannot break or inject markup. A missing title or missing
 * content renders as an empty string.
 *
 * @return string
 */
public function __toString()
{
    $escapedTitle = htmlspecialchars((string)$this->getTitle(), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

    return sprintf('<h1>%s</h1>%s', $escapedTitle, $this->getContent());
}
1615
/**
 * Returns the title of the article.
 *
 * @return string|null The stored title, or null if none has been set.
 */
public function getTitle()
{
    return $this->title;
}
1623
/**
 * Stores the title of the article.
 *
 * @param string $title
 *
 * @return void
 */
protected function setTitle($title)
{
    $this->title = $title;
}
1631
/**
 * Returns the article content serialised with C14N().
 *
 * @return string|null The serialised content, or null while no content document is stored.
 */
public function getContent()
{
    if ($this->content instanceof DOMDocument) {
        return $this->content->C14N();
    }

    return null;
}
1639
/**
 * Returns the stored content document itself rather than its serialised form.
 *
 * @return DOMDocument|null The content document, or null if none has been set.
 */
public function getDOMDocument()
{
    return $this->content;
}
1647
/**
 * Stores the document holding the article content.
 *
 * @param DOMDocument $content
 *
 * @return void
 */
protected function setContent(DOMDocument $content)
{
    $this->content = $content;
}
1655
/**
 * Returns the excerpt of the article.
 *
 * @return null|string The stored excerpt, or null if none has been set.
 */
public function getExcerpt()
{
    return $this->excerpt;
}
1663
/**
 * Stores the excerpt of the article.
 *
 * @param null|string $excerpt
 *
 * @return void
 */
public function setExcerpt($excerpt)
{
    $this->excerpt = $excerpt;
}
1671
/**
 * Returns the main image of the article.
 *
 * @return string|null The stored image value, or null if none has been set.
 */
public function getImage()
{
    return $this->image;
}
1679
/**
 * Stores the main image of the article.
 *
 * @param string $image
 *
 * @return void
 */
protected function setImage($image)
{
    $this->image = $image;
}
1687
/**
 * Returns the author of the article.
 *
 * @return string|null The stored author, or null if none has been set.
 */
public function getAuthor()
{
    return $this->author;
}
1695
/**
 * Stores the author of the article.
 *
 * @param string $author
 *
 * @return void
 */
protected function setAuthor($author)
{
    $this->author = $author;
}
1703
/**
 * Returns the text direction of the article.
 *
 * @return null|string The stored direction, or null if none has been set.
 */
public function getDirection()
{
    return $this->direction;
}
1711
/**
 * Stores the text direction of the article.
 *
 * @param null|string $direction
 *
 * @return void
 */
public function setDirection($direction)
{
    $this->direction = $direction;
}
1719 }