]> git.wh0rd.org Git - tt-rss.git/blob - plugins/af_readability/classes/Readability.php
Merge branch 'master' of git.fakecake.org:tt-rss
[tt-rss.git] / plugins / af_readability / classes / Readability.php
1 <?php
2 /** 
3 * Arc90's Readability ported to PHP for FiveFilters.org
4 * Based on readability.js version 1.7.1 (without multi-page support)
5 * ------------------------------------------------------
6 * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
7 * Arc90's project URL: http://lab.arc90.com/experiments/readability/
8 * JS Source: http://code.google.com/p/arc90labs-readability
9 * Ported by: Keyvan Minoukadeh, http://www.keyvan.net
10 * More information: http://fivefilters.org/content-only/
11 * License: Apache License, Version 2.0
12 * Requires: PHP5
13 * Date: 2010-10-29
14
15 * Differences between the PHP port and the original
16 * ------------------------------------------------------
17 * Arc90's Readability is designed to run in the browser. It works on the DOM 
18 * tree (the parsed HTML) after the page's CSS styles have been applied and 
19 * Javascript code executed. This PHP port does not run inside a browser. 
20 * We use PHP's ability to parse HTML to build our DOM tree, but we cannot 
21 * rely on CSS or Javascript support. As such, the results will not always 
22 * match Arc90's Readability. (For example, if a web page contains CSS style 
23 * rules or Javascript code which hide certain HTML elements from display, 
24 * Arc90's Readability will dismiss those from consideration but our PHP port, 
25 * unable to understand CSS or Javascript, will not know any better.)
26
27 * Another significant difference is that the aim of Arc90's Readability is 
28 * to re-present the main content block of a given web page so users can 
29 * read it more easily in their browsers. Correct identification, clean up, 
30 * and separation of the content block is only a part of this process. 
31 * This PHP port is only concerned with this part, it does not include code 
32 * that relates to presentation in the browser - Arc90 already do 
33 * that extremely well, and for PDF output there's FiveFilters.org's 
34 * PDF Newspaper: http://fivefilters.org/pdf-newspaper/.
35
36 * Finally, this class contains methods that might be useful for developers 
37 * working on HTML document fragments. So without deviating too much from 
38 * the original code (which I don't want to do because it makes debugging 
39 * and updating more difficult), I've tried to make it a little more 
40 * developer friendly. You should be able to use the methods here on 
41 * existing DOMElement objects without passing an entire HTML document to 
42 * be parsed.
43 */
44
45 // This class allows us to do JavaScript-like assignments to innerHTML
46 require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');
47
48 // Alternative usage (for testing only!)
49 // uncomment the lines below and call Readability.php in your browser
50 // passing it the URL of the page you'd like content from, e.g.:
51 // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php
52
53 /*
54 if (!isset($_GET['url']) || $_GET['url'] == '') {
55         die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html');
56 }
57 $url = $_GET['url'];
58 if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;
59 $html = file_get_contents($url);
60 $r = new Readability($html, $url);
61 $r->init();
62 echo $r->articleContent->innerHTML;
63 */
64
65 class Readability
66 {
67         public $version = '1.7.1-without-multi-page';
68         public $convertLinksToFootnotes = false;
69         public $revertForcedParagraphElements = true;
70         public $articleTitle;
71         public $articleContent;
72         public $dom;
73         public $url = null; // optional - URL where HTML was retrieved
74         public $debug = false;
75         protected $body = null; // 
76         protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
77         protected $flags = 7; // 1 | 2 | 4;   // Start with all flags set.
78         protected $success = false; // indicates whether we were able to extract or not
79         
/**
 * All of the regular expressions in use within readability.
 * Defined up here so we don't instantiate them repeatedly in loops.
 *
 * The 'video' pattern accepts both http:// and https:// URLs; matching only
 * plain http would miss every embed that is served over TLS.
 **/
public $regexps = array(
    'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i',
    'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
    'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i',
    'negative' => '/combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
    'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
    'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
    'replaceFonts' => '/<(\/?)font[^>]*>/i',
    // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
    'normalize' => '/\s{2,}/',
    'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
    'video' => '/https?:\/\/(www\.)?(youtube|vimeo)\.com/i',
    'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
);
98         
99         /* constants */
100         const FLAG_STRIP_UNLIKELYS = 1;
101         const FLAG_WEIGHT_CLASSES = 2;
102         const FLAG_CLEAN_CONDITIONALLY = 4;
103         
/**
 * Create instance of Readability.
 *
 * The raw markup is pre-processed here (runs of two or more <br> become
 * paragraph breaks, <font> tags become <span>) because this is the only
 * point at which the HTML is still available as a string; afterwards it is
 * parsed into $this->dom.
 *
 * @param string $html UTF-8 encoded string
 * @param string $url  (optional) URL associated with HTML (used for footnotes)
 */
public function __construct($html, $url=null)
{
    $html = (string) $html;

    /* Turn all double br's into p's and font tags into spans */
    $rewrites = array(
        'replaceBrs'   => '</p><p>',
        'replaceFonts' => '<$1span>',
    );
    foreach ($rewrites as $pattern => $replacement) {
        $rewritten = preg_replace($this->regexps[$pattern], $replacement, $html);
        // preg_replace() returns null on a PCRE error; keep the markup we had
        // rather than silently throwing the whole document away.
        if ($rewritten !== null) {
            $html = $rewritten;
        }
    }

    if (trim($html) == '') {
        $html = '<html></html>';
    } else {
        // Encode every non-ASCII character as a numeric entity so loadHTML()
        // does not have to guess the charset. This replaces
        // mb_convert_encoding(..., 'HTML-ENTITIES'), deprecated since PHP 8.2.
        $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
    }

    $this->dom = new DOMDocument();
    $this->dom->preserveWhiteSpace = false;
    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');

    // Collect libxml parse warnings internally instead of muting the call
    // with @, then restore whatever error mode the caller had configured.
    $previousErrorMode = libxml_use_internal_errors(true);
    $this->dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $this->url = $url;
}
122
/**
 * Get article title element.
 *
 * Returns whatever is currently stored in $articleTitle. init() assigns the
 * property; it is declared without an initial value, so the result is null
 * until init() has run.
 *
 * @return DOMElement|null
 */
public function getTitle()
{
    return $this->articleTitle;
}
130         
/**
 * Get article content element.
 *
 * Returns whatever is currently stored in $articleContent. init() assigns
 * the property; it is declared without an initial value, so the result is
 * null until init() has run.
 *
 * @return DOMElement|null
 */
public function getContent()
{
    return $this->articleContent;
}
138         
/**
 * Runs readability.
 *
 * Workflow:
 *  1. Prep the document by removing script tags, css, etc.
 *  2. Build readability's DOM tree.
 *  3. Grab the article content from the current dom tree.
 *  4. Replace the current DOM tree with the new one.
 *  5. Read peacefully.
 *
 * After the call, $articleTitle and $articleContent hold the extracted
 * elements. When nothing could be extracted, $articleContent is a placeholder
 * div carrying an apology message and the method returns false.
 *
 * @return boolean true if we found content, false otherwise
 **/
public function init()
{
    $this->removeScripts($this->dom);

    // Optimistic default; cleared further down if extraction fails.
    $this->success = true;

    // Remember the first <body> (and its original markup) unless a previous
    // run already did so.
    $bodyList = $this->dom->getElementsByTagName('body');
    if ($bodyList->length > 0) {
        $firstBody = $bodyList->item(0);
        if ($this->bodyCache == null) {
            $this->bodyCache = $firstBody->innerHTML;
        }
        if ($this->body == null) {
            $this->body = $firstBody;
        }
    }

    $this->prepDocument();

    // The title is read before grabArticle() starts rewriting the tree.
    $articleTitle   = $this->getArticleTitle();
    $articleContent = $this->grabArticle();

    if (!$articleContent) {
        $this->success = false;
        $articleContent = $this->dom->createElement('div');
        $articleContent->setAttribute('id', 'readability-content');
        $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
    }

    /* Build readability's DOM tree: overlay > inner div > (title, content) */
    $innerDiv = $this->dom->createElement('div');
    $innerDiv->setAttribute('id', 'readInner');
    $innerDiv->appendChild($articleTitle);
    $innerDiv->appendChild($articleContent);

    $overlay = $this->dom->createElement('div');
    $overlay->setAttribute('id', 'readOverlay');
    $overlay->appendChild($innerDiv);

    /* Clear the old HTML, insert the new content. */
    $this->body->innerHTML = '';
    $this->body->appendChild($overlay);
    $this->body->removeAttribute('style');

    $this->postProcessContent($articleContent);

    // Publish the results through the public properties.
    $this->articleTitle = $articleTitle;
    $this->articleContent = $articleContent;

    return $this->success;
}
210         
/**
 * Debug output.
 *
 * Prints the message, prefixed with "* " and followed by an HTML line break
 * and a newline, but only while $this->debug is truthy.
 *
 * @param mixed $msg message to print (echoed as-is)
 * @return void
 */
protected function dbg($msg)
{
    if (!$this->debug) {
        return;
    }

    echo '* ' . $msg . '<br />' . "\n";
}
217         
/**
 * Run any post-process modifications to article content as necessary.
 *
 * Currently this only converts links to footnotes, and only when
 * $convertLinksToFootnotes is enabled and the source URL does not contain
 * "wikipedia.org".
 *
 * @param DOMElement $articleContent
 * @return void
 */
public function postProcessContent($articleContent)
{
    if (!$this->convertLinksToFootnotes) {
        return;
    }

    // $url is optional and may be null. Cast it explicitly instead of hiding
    // the resulting notice/deprecation behind the @ operator.
    $sourceUrl = (string) $this->url;
    if (preg_match('/wikipedia\.org/', $sourceUrl)) {
        return;
    }

    $this->addFootnotes($articleContent);
}
229         
/**
 * Get the article title as an H1.
 *
 * Starts from the text of the document's <title>, then tries to strip a
 * site name separated by " | ", " - " or ": ". If the title is very long or
 * very short and the page has exactly one <h1>, that heading's text is used
 * instead. Whenever the result has four words or fewer, the original
 * <title> text is restored.
 *
 * @return DOMElement a detached <h1> element containing the title as text
 */
protected function getArticleTitle()
{
    $curTitle = '';
    $origTitle = '';

    // A document may have no <title> at all; only read it when it exists.
    $titleElement = $this->dom->getElementsByTagName('title')->item(0);
    if ($titleElement !== null) {
        try {
            $curTitle = $origTitle = $this->getInnerText($titleElement);
        } catch (Exception $e) {
            // Best effort: carry on with an empty title, but leave a trace.
            $this->dbg('Could not read title text: ' . $e->getMessage());
        }
    }

    if (preg_match('/ [\|\-] /', $curTitle)) {
        // "Headline | Site": keep what precedes the last separator ...
        $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);

        // ... unless that leaves fewer than three words, then keep what
        // follows the first separator instead.
        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
        }
    } else if (strpos($curTitle, ': ') !== false) {
        // "Site: Headline": keep what follows the last colon ...
        $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);

        // ... or, if that is too short, what follows the first colon.
        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
        }
    } else {
        // Measure characters, not bytes: the constructor requires UTF-8
        // input, so strlen() would over-count titles in non-Latin scripts.
        $titleLength = mb_strlen($curTitle, 'UTF-8');
        if ($titleLength > 150 || $titleLength < 15) {
            $hOnes = $this->dom->getElementsByTagName('h1');
            if ($hOnes->length == 1) {
                $curTitle = $this->getInnerText($hOnes->item(0));
            }
        }
    }

    $curTitle = trim($curTitle);

    if (count(explode(' ', $curTitle)) <= 4) {
        $curTitle = $origTitle;
    }

    // The title is plain text. Insert it as a text node so that characters
    // such as "&" or "<" are kept literally instead of being parsed as markup.
    $articleTitle = $this->dom->createElement('h1');
    $articleTitle->appendChild($this->dom->createTextNode($curTitle));

    return $articleTitle;
}
279         
/**
 * Prepare the HTML document for readability to scrape it.
 *
 * Guarantees that $this->body refers to a body element (creating one, and a
 * root <html> element if the document has none), tags it with the id
 * "readabilityBody", and removes every <style> element in the document.
 *
 * @return void
 **/
protected function prepDocument()
{
    /**
     * In some cases a body element can't be found (if the HTML is totally hosed for example)
     * so we create a new body node and append it to the document.
     */
    if ($this->body == null) {
        // With no root element there is nothing to append the body to.
        if ($this->dom->documentElement === null) {
            $this->dom->appendChild($this->dom->createElement('html'));
        }
        $this->body = $this->dom->createElement('body');
        $this->dom->documentElement->appendChild($this->body);
    }

    $this->body->setAttribute('id', 'readabilityBody');

    /* Remove all style tags (anywhere in the document, not just in head). */
    $styleTags = $this->dom->getElementsByTagName('style');
    // The node list is live, so walk it backwards while removing entries.
    for ($i = $styleTags->length - 1; $i >= 0; $i--) {
        $styleTag = $styleTags->item($i);
        $styleTag->parentNode->removeChild($styleTag);
    }

    // Turning double <br>s into paragraphs and <font> into <span> is done in
    // the constructor, where the raw HTML string is still available.
}
312
/**
 * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
 *
 * Every link in the article gets a numbered superscript reference placed
 * right after it and a matching entry in a "References" list, which is
 * appended to the article if at least one link was converted. Links whose
 * class contains "readability-DoNotFootnote", or whose text matches the
 * 'skipFootnoteLink' pattern, are left alone.
 *
 * @param DOMElement $articleContent
 * @return void
 **/
public function addFootnotes($articleContent)
{
    $footnotesWrapper = $this->dom->createElement('div');
    $footnotesWrapper->setAttribute('id', 'readability-footnotes');
    $footnotesWrapper->innerHTML = '<h3>References</h3>';

    $articleFootnotes = $this->dom->createElement('ol');
    $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
    $footnotesWrapper->appendChild($articleFootnotes);

    // Live node list: the reference anchors inserted below appear in it as
    // well, and are skipped through their "readability-DoNotFootnote" class.
    $articleLinks = $articleContent->getElementsByTagName('a');

    $linkCount = 0;
    for ($i = 0; $i < $articleLinks->length; $i++) {
        $articleLink = $articleLinks->item($i);
        $linkText    = $this->getInnerText($articleLink);

        if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
            continue;
        }

        $linkCount++;

        // Clone before the article link itself is restyled below.
        $footnoteLink = $articleLink->cloneNode(true);
        $linkDomain   = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
        if (!$linkDomain && isset($this->url)) {
            $linkDomain = @parse_url($this->url, PHP_URL_HOST);
        }

        /** Add a superscript reference after the article link */
        $refLink = $this->dom->createElement('a');
        $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
        $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
        $refLink->setAttribute('class', 'readability-DoNotFootnote');
        $refLink->setAttribute('style', 'color: inherit;');

        // insertBefore() appends when the reference node is null, so the
        // "link is the last child" case needs no separate branch (and no
        // loose == comparison between DOM nodes).
        $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);

        $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
        $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

        $footnote = $this->dom->createElement('li');
        $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

        // The footnote label is the link's title attribute or, failing that,
        // its visible text. Both are plain text, so insert them as a text
        // node rather than through innerHTML, where "&" or "<" would be
        // interpreted as markup.
        $label = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
        while ($footnoteLink->firstChild) {
            $footnoteLink->removeChild($footnoteLink->firstChild);
        }
        $footnoteLink->appendChild($this->dom->createTextNode($label));
        $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);

        $footnote->appendChild($footnoteLink);
        if ($linkDomain) {
            // Built with DOM calls so the host name is never parsed as HTML.
            $domainNote = $this->dom->createElement('small');
            $domainNote->appendChild($this->dom->createTextNode(' (' . $linkDomain . ')'));
            $footnote->appendChild($domainNote);
        }

        $articleFootnotes->appendChild($footnote);
    }

    if ($linkCount > 0) {
        $articleContent->appendChild($footnotesWrapper);
    }
}
379
/**
 * Reverts P elements with class 'readability-styled'
 * to text nodes - which is what they were before.
 *
 * Each matching <p> below $articleContent is replaced, in place, by a text
 * node holding that paragraph's text content.
 *
 * @param DOMElement $articleContent
 * @return void
 */
public function revertReadabilityStyledElements($articleContent)
{
    $document = $articleContent->ownerDocument;
    $finder   = new DOMXPath($document);
    $styled   = $finder->query('.//p[@class="readability-styled"]', $articleContent);

    // Walk the result from the last match to the first.
    $index = $styled->length;
    while ($index-- > 0) {
        $paragraph = $styled->item($index);
        $plainText = $document->createTextNode($paragraph->textContent);
        $paragraph->parentNode->replaceChild($plainText, $paragraph);
    }
}
399         
/**
 * Prepare the article node for display. Clean out any inline styles,
 * iframes, forms, strip extraneous <p> tags, etc.
 *
 * The steps run in a fixed order: styles and breaks first, then forms,
 * objects, h1 (and a lone h2), iframes and headers, then the conditional
 * clean-up of tables, lists and divs, and finally removal of empty
 * paragraphs and of <br> tags that directly precede a <p>.
 *
 * @param DOMElement $articleContent
 * @return void
 */
public function prepArticle($articleContent)
{
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Clean out junk from the article content */
    $this->cleanConditionally($articleContent, 'form');
    $this->clean($articleContent, 'object');
    $this->clean($articleContent, 'h1');

    /**
     * If there is only one h2, they are probably using it
     * as a header and not a subheader, so remove it since we already have a header.
     ***/
    if ($articleContent->getElementsByTagName('h2')->length == 1) {
        $this->clean($articleContent, 'h2');
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* Do these last as the previous stuff may have removed junk that will affect these */
    $this->cleanConditionally($articleContent, 'table');
    $this->cleanConditionally($articleContent, 'ul');
    $this->cleanConditionally($articleContent, 'div');

    /* Remove extra paragraphs: no image, embed or object, and no text */
    $articleParagraphs = $articleContent->getElementsByTagName('p');
    // Live node list, so iterate backwards while removing entries.
    for ($i = $articleParagraphs->length - 1; $i >= 0; $i--) {
        $paragraph = $articleParagraphs->item($i);

        $hasMedia = $paragraph->getElementsByTagName('img')->length !== 0
            || $paragraph->getElementsByTagName('embed')->length !== 0
            || $paragraph->getElementsByTagName('object')->length !== 0;

        if (!$hasMedia && $this->getInnerText($paragraph, false) == '') {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    try {
        $withoutBreaks = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
        if ($withoutBreaks !== null) {
            $articleContent->innerHTML = $withoutBreaks;
        } else {
            // preg_replace() signals a PCRE error with null; assigning that
            // to innerHTML would wipe the whole article, so keep it as is.
            $this->dbg('Cleaning innerHTML of breaks failed: preg_replace() returned null. Ignoring.');
        }
    }
    catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
457         
/**
 * Initialize a node with the readability object. Also checks the
 * className/id for special names to add to its score.
 *
 * A "readability" attribute holding the content score is attached to the
 * node. The score starts at 0, is adjusted by a fixed amount that depends on
 * the tag name, and finally has the node's class weight added to it.
 *
 * @param DOMElement $node
 * @return void
 **/
protected function initializeNode($node)
{
    // Fixed score adjustment per (upper-cased) tag name; tags that are not
    // listed leave the score untouched.
    $tagAdjustments = array(
        'DIV'        => 5,

        'PRE'        => 3,
        'TD'         => 3,
        'BLOCKQUOTE' => 3,

        'ADDRESS'    => -3,
        'OL'         => -3,
        'UL'         => -3,
        'DL'         => -3,
        'DD'         => -3,
        'DT'         => -3,
        'LI'         => -3,
        'FORM'       => -3,

        'H1'         => -5,
        'H2'         => -5,
        'H3'         => -5,
        'H4'         => -5,
        'H5'         => -5,
        'H6'         => -5,
        'TH'         => -5,
    );

    $scoreAttribute = $this->dom->createAttribute('readability');
    $scoreAttribute->value = 0; // this is our contentScore
    $node->setAttributeNode($scoreAttribute);

    $tag = strtoupper($node->tagName); // unsure if strtoupper is needed, but using it just in case
    if (isset($tagAdjustments[$tag])) {
        $scoreAttribute->value += $tagAdjustments[$tag];
    }

    $scoreAttribute->value += $this->getClassWeight($node);
}
504         
505         /***
506         * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
507         *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
508         *
509         * @return DOMElement
510         **/
511         protected function grabArticle($page=null) {
512                 $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
513                 if (!$page) $page = $this->dom;
514                 $allElements = $page->getElementsByTagName('*');
515                 /**
516                 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
517                 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
518                 *
519                 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
520                 * TODO: Shouldn't this be a reverse traversal?
521                 **/
522                 $node = null;
523                 $nodesToScore = array();
524                 for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
525                 //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) {
526                         //$node = $targetList->item($nodeIndex);
527                         $tagName = strtoupper($node->tagName);
528                         /* Remove unlikely candidates */
529                         if ($stripUnlikelyCandidates) {
530                                 $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
531                                 if (
532                                         preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
533                                         !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
534                                         $tagName != 'BODY'
535                                 )
536                                 {
537                                         $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
538                                         //$nodesToRemove[] = $node;
539                                         $node->parentNode->removeChild($node);
540                                         $nodeIndex--;
541                                         continue;
542                                 }               
543                         }
544
545                         if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
546                                 $nodesToScore[] = $node;
547                         }
548
549                         /* Turn all divs that don't have children block level elements into p's */
550                         if ($tagName == 'DIV') {
551                                 if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
552                                         //$this->dbg('Altering div to p');
553                                         $newNode = $this->dom->createElement('p');
554                                         try {
555                                                 $newNode->innerHTML = $node->innerHTML;
556                                                 //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node);
557                                                 $node->parentNode->replaceChild($newNode, $node);
558                                                 $nodeIndex--;
559                                                 $nodesToScore[] = $node; // or $newNode?
560                                         }
561                                         catch(Exception $e) {
562                                                 $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
563                                         }
564                                 }
565                                 else
566                                 {
567                                         /* EXPERIMENTAL */
568                                         // TODO: change these p elements back to text nodes after processing
569                                         for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
570                                                 $childNode = $node->childNodes->item($i);
571                                                 if ($childNode->nodeType == 3) { // XML_TEXT_NODE
572                                                         //$this->dbg('replacing text node with a p tag with the same content.');
573                                                         $p = $this->dom->createElement('p');
574                                                         $p->innerHTML = $childNode->nodeValue;
575                                                         $p->setAttribute('style', 'display: inline;');
576                                                         $p->setAttribute('class', 'readability-styled');
577                                                         $childNode->parentNode->replaceChild($p, $childNode);
578                                                 }
579                                         }
580                                 }
581                         }
582                 }
583                 
584                 /**
585                 * Loop through all paragraphs, and assign a score to them based on how content-y they look.
586                 * Then add their score to their parent node.
587                 *
588                 * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
589                 **/
590                 $candidates = array();
591                 for ($pt=0; $pt < count($nodesToScore); $pt++) {
592                         $parentNode      = $nodesToScore[$pt]->parentNode;
593                         // $grandParentNode = $parentNode ? $parentNode->parentNode : null;
594                         $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
595                         $innerText       = $this->getInnerText($nodesToScore[$pt]);
596
597                         if (!$parentNode || !isset($parentNode->tagName)) {
598                                 continue;
599                         }
600
601                         /* If this paragraph is less than 25 characters, don't even count it. */
602                         if(strlen($innerText) < 25) {
603                                 continue;
604                         }
605
606                         /* Initialize readability data for the parent. */
607                         if (!$parentNode->hasAttribute('readability')) 
608                         {
609                                 $this->initializeNode($parentNode);
610                                 $candidates[] = $parentNode;
611                         }
612
613                         /* Initialize readability data for the grandparent. */
614                         if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
615                         {
616                                 $this->initializeNode($grandParentNode);
617                                 $candidates[] = $grandParentNode;
618                         }
619
620                         $contentScore = 0;
621
622                         /* Add a point for the paragraph itself as a base. */
623                         $contentScore++;
624
625                         /* Add points for any commas within this paragraph */
626                         $contentScore += count(explode(',', $innerText));
627                         
628                         /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
629                         $contentScore += min(floor(strlen($innerText) / 100), 3);
630                         
631                         /* Add the score to the parent. The grandparent gets half. */
632                         $parentNode->getAttributeNode('readability')->value += $contentScore;
633
634                         if ($grandParentNode) {
635                                 $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;             
636                         }
637                 }
638
639                 /**
640                 * After we've calculated scores, loop through all of the possible candidate nodes we found
641                 * and find the one with the highest score.
642                 **/
643                 $topCandidate = null;
644                 for ($c=0, $cl=count($candidates); $c < $cl; $c++)
645                 {
646                         /**
647                         * Scale the final candidates score based on link density. Good content should have a
648                         * relatively small link density (5% or less) and be mostly unaffected by this operation.
649                         **/
650                         $readability = $candidates[$c]->getAttributeNode('readability');
651                         $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));
652
653                         $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);
654
655                         if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
656                                 $topCandidate = $candidates[$c];
657                         }
658                 }
659
660                 /**
661                 * If we still have no top candidate, just use the body as a last resort.
662                 * We also have to copy the body node so it is something we can modify.
663                 **/
664                 if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
665                 {
666                         $topCandidate = $this->dom->createElement('div');
667                         $topCandidate->innerHTML = ($page instanceof DOMDocument) ? $page->saveXML($page->documentElement) : $page->innerHTML;
668                         $page->innerHTML = '';
669                         $page->appendChild($topCandidate);
670                         $this->initializeNode($topCandidate);
671                 }
672
673                 /**
674                 * Now that we have the top candidate, look through its siblings for content that might also be related.
675                 * Things like preambles, content split by ads that we removed, etc.
676                 **/
677                 $articleContent        = $this->dom->createElement('div');
678                 $articleContent->setAttribute('id', 'readability-content');
679                 $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);
680                 $siblingNodes          = $topCandidate->parentNode->childNodes;
681
682                 for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
683                 {
684                         $siblingNode = $siblingNodes->item($s);
685                         $append      = false;
686
687                         $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
688
689                         //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
690
691                         if ($siblingNode === $topCandidate)
692                         // or if ($siblingNode->isSameNode($topCandidate))
693                         {
694                                 $append = true;
695                         }
696
697                         $contentBonus = 0;
698                         /* Give a bonus if sibling nodes and top candidates have the example same classname */
699                         if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
700                                 $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
701                         }
702
703                         if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
704                         {
705                                 $append = true;
706                         }
707                         
708                         if (strtoupper($siblingNode->nodeName) == 'P') {
709                                 $linkDensity = $this->getLinkDensity($siblingNode);
710                                 $nodeContent = $this->getInnerText($siblingNode);
711                                 $nodeLength  = strlen($nodeContent);
712                                 
713                                 if ($nodeLength > 80 && $linkDensity < 0.25)
714                                 {
715                                         $append = true;
716                                 }
717                                 else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
718                                 {
719                                         $append = true;
720                                 }
721                         }
722
723                         if ($append)
724                         {
725                                 $this->dbg('Appending node: ' . $siblingNode->nodeName);
726
727                                 $nodeToAppend = null;
728                                 $sibNodeName = strtoupper($siblingNode->nodeName);
729                                 if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
730                                         /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
731                                         
732                                         $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
733                                         $nodeToAppend = $this->dom->createElement('div');
734                                         try {
735                                                 $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
736                                                 $nodeToAppend->innerHTML = $siblingNode->innerHTML;
737                                         }
738                                         catch(Exception $e)
739                                         {
740                                                 $this->dbg('Could not alter siblingNode to div, reverting back to original.');
741                                                 $nodeToAppend = $siblingNode;
742                                                 $s--;
743                                                 $sl--;
744                                         }
745                                 } else {
746                                         $nodeToAppend = $siblingNode;
747                                         $s--;
748                                         $sl--;
749                                 }
750                                 
751                                 /* To ensure a node does not interfere with readability styles, remove its classnames */
752                                 $nodeToAppend->removeAttribute('class');
753
754                                 /* Append sibling and subtract from our list because it removes the node when you append to another node */
755                                 $articleContent->appendChild($nodeToAppend);
756                         }
757                 }
758
759                 /**
760                 * So we have all of the content that we need. Now we clean it up for presentation.
761                 **/
762                 $this->prepArticle($articleContent);
763
764                 /**
765                 * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
766                 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
767                 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
768                 * finding the -right- content.
769                 **/
770                 if (strlen($this->getInnerText($articleContent, false)) < 250)
771                 {
772                         $this->body->innerHTML = $this->bodyCache;
773                         
774                         if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
775                                 $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
776                                 return $this->grabArticle($this->body);
777                         }
778                         else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
779                                 $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
780                                 return $this->grabArticle($this->body);              
781                         }
782                         else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
783                                 $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
784                                 return $this->grabArticle($this->body);
785                         }
786                         else {
787                                 return false;
788                         }
789                 }
790                 return $articleContent;
791         }
792         
793         /**
794         * Remove script tags from document
795         *
796         * @param DOMElement $doc
797         * @return void
798         */
public function removeScripts($doc) {
    // Detaches every <script> element found beneath $doc from the tree.
    // $doc only needs to offer getElementsByTagName(); nothing is returned.
    $scriptTags = $doc->getElementsByTagName('script');

    // Walk from the last match to the first so that removing a node never
    // changes the position of a match that has not been visited yet.
    $index = $scriptTags->length;
    while ($index-- > 0) {
        $scriptTag = $scriptTags->item($index);
        $scriptTag->parentNode->removeChild($scriptTag);
    }
}
806         
807         /**
808         * Get the inner text of a node.
809         * This also strips out any excess whitespace to be found.
810         *
811         * @param DOMElement $e
812         * @param boolean $normalizeSpaces (default: true)
813         * @return string
814         **/
public function getInnerText($e, $normalizeSpaces=true) {
    // Returns the trimmed text content of $e as a string.
    // Anything without usable text content (property unset/null, or an
    // empty string) short-circuits to ''.
    if (!isset($e->textContent) || $e->textContent == '') {
        return '';
    }

    $trimmed = trim($e->textContent);

    // When requested, every match of the 'normalize' pattern is replaced by
    // a single space (presumably runs of whitespace; the pattern itself is
    // defined elsewhere in the class).
    return $normalizeSpaces
        ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
        : $trimmed;
}
830
831         /**
832         * Get the number of times a string $s appears in the node $e.
833         *
834         * @param DOMElement $e
835         * @param string $s what to count. Default is ","
836         * @return number (integer)
837         **/
public function getCharCount($e, $s=',') {
    // Counts non-overlapping occurrences of $s in the inner text of $e
    // (text is obtained through getInnerText() with its default settings).
    $text = $this->getInnerText($e);

    return substr_count($text, $s);
}
841
842         /**
843         * Remove the style attribute on every $e and under.
844         *
845         * @param DOMElement $e
846         * @return void
847         */
public function cleanStyles($e) {
    // Removes the inline "style" attribute from $e itself and from every
    // element nested inside it. Returns nothing.
    //
    // getElementsByTagName('*') only yields descendants, so the root has to
    // be handled separately to honour the "every $e and under" contract.
    // The instanceof guard keeps the method usable with a DOMDocument,
    // which has no removeAttribute() method.
    if ($e instanceof DOMElement) {
        $e->removeAttribute('style');
    }

    foreach ($e->getElementsByTagName('*') as $descendant) {
        $descendant->removeAttribute('style');
    }
}
854         
855         /**
856         * Get the density of links as a percentage of the content
857         * This is the amount of text that is inside a link divided by the total text in the node.
858         * 
859         * @param DOMElement $e
860         * @return number (float)
861         */
public function getLinkDensity($e) {
    // Ratio of text that sits inside <a> descendants of $e to all text in $e.
    // Lengths are byte counts (strlen) of the text returned by getInnerText().
    $anchors     = $e->getElementsByTagName('a');
    $totalLength = strlen($this->getInnerText($e));

    $anchorLength = 0;
    foreach ($anchors as $anchor) {
        $anchorLength += strlen($this->getInnerText($anchor));
    }

    // No text at all: report integer 0 rather than dividing by zero.
    if ($totalLength <= 0) {
        return 0;
    }

    // Plain "/" is kept on purpose: with two integer operands PHP yields an
    // int when the division is exact (e.g. 0 / n), otherwise a float.
    return $anchorLength / $totalLength;
}
876         
877         /**
878         * Get an elements class/id weight. Uses regular expressions to tell if this 
879         * element looks good or bad.
880         *
881         * @param DOMElement $e
882         * @return number (Integer)
883         */
public function getClassWeight($e) {
    // Scores an element from its "class" and "id" attribute values.
    // Returns 0 whenever class weighting is switched off.
    if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
        return 0;
    }

    $score = 0;

    // Both attributes are judged the same way: -25 when the value matches
    // the 'negative' pattern, +25 when it matches the 'positive' pattern.
    // A value matching both nets out to zero for that attribute.
    foreach (array('class', 'id') as $attrName) {
        if (!$e->hasAttribute($attrName) || $e->getAttribute($attrName) == '') {
            continue;
        }

        $attrValue = $e->getAttribute($attrName);

        if (preg_match($this->regexps['negative'], $attrValue)) {
            $score -= 25;
        }
        if (preg_match($this->regexps['positive'], $attrValue)) {
            $score += 25;
        }
    }

    return $score;
}
914
915         /**
916         * Remove extraneous break tags from a node.
917         *
918         * @param DOMElement $node
919         * @return void
920         */
public function killBreaks($node) {
    // Rewrites the markup of $node so that every match of the 'killBreaks'
    // pattern becomes a single '<br />'.
    // NOTE(review): innerHTML is not a native DOMElement property; presumably
    // it is supplied by the element class this project registers - confirm.
    $node->innerHTML = preg_replace(
        $this->regexps['killBreaks'],
        '<br />',
        $node->innerHTML
    );
}
926
927         /**
928         * Clean a node of all elements of type "tag".
929         * (Unless it's a youtube/vimeo video. People love movies.)
930         *
931         * @param DOMElement $e
932         * @param string $tag
933         * @return void
934         */
public function clean($e, $tag) {
    // Removes every <$tag> descendant of $e. For 'object' and 'embed' an
    // element survives when the 'video' pattern matches either its
    // attribute values or its inner markup.
    $matches    = $e->getElementsByTagName($tag);
    $mayBeVideo = ($tag == 'object' || $tag == 'embed');

    // Last-to-first, so removals do not disturb entries still to be visited.
    for ($pos = $matches->length - 1; $pos >= 0; $pos--) {
        $target = $matches->item($pos);

        if ($mayBeVideo) {
            // Join all attribute values into one '|'-terminated string so a
            // single pattern test covers them all.
            $joinedValues = '';
            foreach ($target->attributes as $attribute) {
                $joinedValues .= $attribute->value . '|';
            }

            // Attributes are checked first; the inner markup is only
            // inspected when the attributes did not match.
            if (preg_match($this->regexps['video'], $joinedValues)
                || preg_match($this->regexps['video'], $target->innerHTML)) {
                continue;
            }
        }

        $target->parentNode->removeChild($target);
    }
}
960         
961         /**
962         * Clean an element of all tags of type "tag" if they look fishy.
963         * "Fishy" is an algorithm based on content length, classnames, 
964         * link density, number of images & embeds, etc.
965         *
966         * @param DOMElement $e
967         * @param string $tag
968         * @return void
969         */
public function cleanConditionally($e, $tag) {
    // Removes <$tag> descendants of $e that look like non-content, judged by
    // class/id weight, stored readability score, comma count, counts of
    // embedded elements, link density and text length.
    // Does nothing when conditional cleaning is switched off.
    if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
        return;
    }

    $tagsList = $e->getElementsByTagName($tag);

    // Walk from the end so nodes can be removed without affecting the
    // positions of entries that are still to be examined.
    for ($idx = $tagsList->length - 1; $idx >= 0; $idx--) {
        $node         = $tagsList->item($idx);
        $weight       = $this->getClassWeight($node);
        $hasScore     = $node->hasAttribute('readability');
        $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;

        $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));

        // A negative combined score is enough to discard the node outright.
        if ($weight + $contentScore < 0) {
            $node->parentNode->removeChild($node);
            continue;
        }

        // Ten or more commas: treat it as real prose and keep it.
        if ($this->getCharCount($node, ',') >= 10) {
            continue;
        }

        // Few commas: compare paragraph count against other element counts.
        $p     = $node->getElementsByTagName('p')->length;
        $img   = $node->getElementsByTagName('img')->length;
        // The -100 offset means the list-item rule below can only trigger
        // once the <li> count exceeds the <p> count by more than 100.
        $li    = $node->getElementsByTagName('li')->length-100;
        $input = $node->getElementsByTagName('input')->length;

        // Only embeds whose src matches the 'video' pattern are counted.
        $embedCount = 0;
        foreach ($node->getElementsByTagName('embed') as $embed) {
            if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
                $embedCount++;
            }
        }

        $linkDensity   = $this->getLinkDensity($node);
        $contentLength = strlen($this->getInnerText($node));

        // Any single rule being true marks the node for removal.
        $toRemove = ($img > $p)
            || ($li > $p && $tag != 'ul' && $tag != 'ol')
            || ($input > floor($p/3))
            || ($contentLength < 25 && ($img === 0 || $img > 2))
            || ($weight < 25 && $linkDensity > 0.2)
            || ($weight >= 25 && $linkDensity > 0.5)
            || (($embedCount == 1 && $contentLength < 75) || $embedCount > 1);

        if ($toRemove) {
            $node->parentNode->removeChild($node);
        }
    }
}
1037
1038         /**
1039         * Clean out spurious headers from an Element. Checks things like classnames and link density.
1040         *
1041         * @param DOMElement $e
1042         * @return void
1043         */
public function cleanHeaders($e) {
    // Drops <h1> and <h2> descendants of $e whose class/id weight is
    // negative or whose link density is above 0.33. Other heading levels
    // are left untouched.
    foreach (array('h1', 'h2') as $headingTag) {
        $headings = $e->getElementsByTagName($headingTag);

        // Last-to-first so removals do not shift unvisited entries.
        $pos = $headings->length;
        while ($pos-- > 0) {
            $heading = $headings->item($pos);

            $isSpurious = $this->getClassWeight($heading) < 0
                || $this->getLinkDensity($heading) > 0.33;

            if ($isSpurious) {
                $heading->parentNode->removeChild($heading);
            }
        }
    }
}
1054
public function flagIsActive($flag) {
    // True when the bitwise AND of the current flags and $flag is positive.
    $overlap = $this->flags & $flag;

    return $overlap > 0;
}
1058         
public function addFlag($flag) {
    // Switches on the bits of $flag in the current flag set (bitwise OR).
    $this->flags |= $flag;
}
1062         
public function removeFlag($flag) {
    // Switches off the bits of $flag in the current flag set
    // (bitwise AND with the complement of $flag).
    $this->flags &= ~$flag;
}
1066 }
1067 ?>