]>
Commit | Line | Data |
---|---|---|
b90c4468 AD |
1 | <?php |
2 | /** | |
3 | * Arc90's Readability ported to PHP for FiveFilters.org | |
4 | * Based on readability.js version 1.7.1 (without multi-page support) | |
5 | * ------------------------------------------------------ | |
6 | * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js | |
7 | * Arc90's project URL: http://lab.arc90.com/experiments/readability/ | |
8 | * JS Source: http://code.google.com/p/arc90labs-readability | |
9 | * Ported by: Keyvan Minoukadeh, http://www.keyvan.net | |
10 | * More information: http://fivefilters.org/content-only/ | |
11 | * License: Apache License, Version 2.0 | |
12 | * Requires: PHP5 | |
13 | * Date: 2010-10-29 | |
14 | * | |
15 | * Differences between the PHP port and the original | |
16 | * ------------------------------------------------------ | |
17 | * Arc90's Readability is designed to run in the browser. It works on the DOM | |
18 | * tree (the parsed HTML) after the page's CSS styles have been applied and | |
19 | * Javascript code executed. This PHP port does not run inside a browser. | |
20 | * We use PHP's ability to parse HTML to build our DOM tree, but we cannot | |
21 | * rely on CSS or Javascript support. As such, the results will not always | |
22 | * match Arc90's Readability. (For example, if a web page contains CSS style | |
23 | * rules or Javascript code which hide certain HTML elements from display, | |
24 | * Arc90's Readability will dismiss those from consideration but our PHP port, | |
25 | * unable to understand CSS or Javascript, will not know any better.) | |
26 | * | |
27 | * Another significant difference is that the aim of Arc90's Readability is | |
28 | * to re-present the main content block of a given web page so users can | |
29 | * read it more easily in their browsers. Correct identification, clean up, | |
30 | * and separation of the content block is only a part of this process. | |
31 | * This PHP port is only concerned with this part, it does not include code | |
32 | * that relates to presentation in the browser - Arc90 already do | |
33 | * that extremely well, and for PDF output there's FiveFilters.org's | |
34 | * PDF Newspaper: http://fivefilters.org/pdf-newspaper/. | |
35 | * | |
36 | * Finally, this class contains methods that might be useful for developers | |
37 | * working on HTML document fragments. So without deviating too much from | |
38 | * the original code (which I don't want to do because it makes debugging | |
39 | * and updating more difficult), I've tried to make it a little more | |
40 | * developer friendly. You should be able to use the methods here on | |
41 | * existing DOMElement objects without passing an entire HTML document to | |
42 | * be parsed. | |
43 | */ | |
44 | ||
45 | // This class allows us to do JavaScript-like assignments to innerHTML | |
46 | require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); | |
47 | ||
48 | // Alternative usage (for testing only!) | |
49 | // uncomment the lines below and call Readability.php in your browser | |
50 | // passing it the URL of the page you'd like content from, e.g.: | |
51 | // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php | |
52 | ||
53 | /* | |
54 | if (!isset($_GET['url']) || $_GET['url'] == '') { | |
55 | die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html'); | |
56 | } | |
57 | $url = $_GET['url']; | |
58 | if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; | |
59 | $html = file_get_contents($url); | |
60 | $r = new Readability($html, $url); | |
61 | $r->init(); | |
62 | echo $r->articleContent->innerHTML; | |
63 | */ | |
64 | ||
65 | class Readability | |
66 | { | |
67 | public $version = '1.7.1-without-multi-page'; | |
68 | public $convertLinksToFootnotes = false; | |
69 | public $revertForcedParagraphElements = true; | |
70 | public $articleTitle; | |
71 | public $articleContent; | |
72 | public $dom; | |
73 | public $url = null; // optional - URL where HTML was retrieved | |
74 | public $debug = false; | |
75 | protected $body = null; // | |
76 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later | |
77 | protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. | |
78 | protected $success = false; // indicates whether we were able to extract or not | |
79 | ||
80 | /** | |
81 | * All of the regular expressions in use within readability. | |
82 | * Defined up here so we don't instantiate them repeatedly in loops. | |
83 | **/ | |
84 | public $regexps = array( | |
85 | 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i', | |
86 | 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', | |
87 | 'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i', | |
88 | 'negative' => '/combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', | |
89 | 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', | |
90 | 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', | |
91 | 'replaceFonts' => '/<(\/?)font[^>]*>/i', | |
92 | // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() | |
93 | 'normalize' => '/\s{2,}/', | |
94 | 'killBreaks' => '/(<br\s*\/?>(\s| ?)*){1,}/', | |
95 | 'video' => '/http:\/\/(www\.)?(youtube|vimeo)\.com/i', | |
96 | 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' | |
97 | ); | |
98 | ||
99 | /* constants */ | |
100 | const FLAG_STRIP_UNLIKELYS = 1; | |
101 | const FLAG_WEIGHT_CLASSES = 2; | |
102 | const FLAG_CLEAN_CONDITIONALLY = 4; | |
103 | ||
/**
 * Create instance of Readability.
 *
 * The raw markup is normalised before it is parsed, because this is the
 * only point where the HTML is available as a string:
 *  - runs of two or more <br> tags become paragraph boundaries,
 *  - <font> tags become <span> tags,
 *  - every character outside ASCII is rewritten as a numeric entity.
 * The result is loaded into a DOMDocument whose elements are instances of
 * JSLikeHTMLElement (which provides the innerHTML property used throughout).
 *
 * @param string UTF-8 encoded string
 * @param string (optional) URL associated with HTML (used for footnotes)
 */
public function __construct($html, $url=null)
{
    $html = (string)$html;

    /* Turn all double br's into p's, and font tags into spans */
    $rewrites = array(
        'replaceBrs'   => '</p><p>',
        'replaceFonts' => '<$1span>',
    );
    foreach ($rewrites as $regexName => $replacement) {
        $rewritten = preg_replace($this->regexps[$regexName], $replacement, $html);
        // preg_replace() returns null on a PCRE error (e.g. backtrack limit hit).
        // Keep the untouched markup in that case instead of parsing an empty string.
        if ($rewritten !== null) {
            $html = $rewritten;
        }
    }

    // Encode all code points >= 0x80 as numeric character references.
    // This replaces mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'),
    // which is deprecated as of PHP 8.2.
    $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

    $this->dom = new DOMDocument();
    $this->dom->preserveWhiteSpace = false;
    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');

    // loadHTML() cannot work with an empty string, so fall back to a bare document
    if (trim($html) == '') {
        $html = '<html></html>';
    }

    // Real-world markup triggers many parser warnings; collect them through
    // libxml instead of silencing the call with @, then restore the old setting.
    $previousSetting = libxml_use_internal_errors(true);
    $this->dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $this->url = $url;
}
122 | ||
/**
 * Accessor for the article title.
 *
 * Returns whatever is currently stored in $articleTitle. init() stores
 * the H1 element it built for the title there; before init() has run the
 * property has no value.
 *
 * @return DOMElement
 */
public function getTitle()
{
    $title = $this->articleTitle;

    return $title;
}
130 | ||
/**
 * Accessor for the article content.
 *
 * Returns whatever is currently stored in $articleContent. init() stores
 * the extracted content element there; before init() has run the property
 * has no value.
 *
 * @return DOMElement
 */
public function getContent()
{
    $content = $this->articleContent;

    return $content;
}
138 | ||
/**
 * Runs readability.
 *
 * Sequence of operations:
 *  1. Call removeScripts() on the document.
 *  2. Remember the first <body> element and a copy of its HTML (only if
 *     they have not been remembered already).
 *  3. Prepare the document (prepDocument()).
 *  4. Build the title element and grab the article content.
 *  5. Empty the body and insert
 *     div#readOverlay > div#readInner > (title, content) in its place.
 *  6. Post-process the content and expose title/content through the
 *     $articleTitle and $articleContent properties.
 *
 * When no content can be grabbed, a placeholder div with an apology
 * message is used as the content and false is returned.
 *
 * @return boolean true if we found content, false otherwise
 **/
public function init()
{
    $this->removeScripts($this->dom);

    // Optimistic default; reset below if nothing could be extracted
    $this->success = true;

    $bodies = $this->dom->getElementsByTagName('body');
    if ($bodies->length > 0) {
        $firstBody = $bodies->item(0);
        if ($this->bodyCache == null) {
            $this->bodyCache = $firstBody->innerHTML;
        }
        if ($this->body == null) {
            $this->body = $firstBody;
        }
    }

    $this->prepDocument();

    /* Extract the pieces of the new document */
    $title = $this->getArticleTitle();
    $content = $this->grabArticle();

    if (!$content) {
        $this->success = false;
        $content = $this->dom->createElement('div');
        $content->setAttribute('id', 'readability-content');
        $content->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
    }

    /* Build readability's DOM tree from the inside out */
    $inner = $this->dom->createElement('div');
    $inner->setAttribute('id', 'readInner');
    $inner->appendChild($title);
    $inner->appendChild($content);

    $wrapper = $this->dom->createElement('div');
    $wrapper->setAttribute('id', 'readOverlay');
    $wrapper->appendChild($inner);

    /* Clear the old HTML, insert the new content. */
    $this->body->innerHTML = '';
    $this->body->appendChild($wrapper);
    $this->body->removeAttribute('style');

    $this->postProcessContent($content);

    // Set title and content instance variables
    $this->articleTitle = $title;
    $this->articleContent = $content;

    return $this->success;
}
210 | ||
/**
 * Debug output.
 *
 * Echoes the message as "* <message><br />" followed by a newline, but
 * only while the public $debug flag is truthy; otherwise does nothing.
 *
 * @param string message to print
 * @return void
 */
protected function dbg($msg)
{
    if (!$this->debug) {
        return;
    }

    echo '* ' . $msg . '<br />' . "\n";
}
217 | ||
/**
 * Run any post-process modifications to article content as necessary.
 *
 * Currently the only modification is converting links to footnotes. It is
 * performed when $convertLinksToFootnotes is enabled and the source URL
 * (if one was given) does not contain "wikipedia.org".
 *
 * @param DOMElement
 * @return void
 */
public function postProcessContent($articleContent)
{
    if (!$this->convertLinksToFootnotes) {
        return;
    }

    // The URL is optional. Match against an empty string when it is missing
    // rather than suppressing the resulting error with @.
    $sourceUrl = is_string($this->url) ? $this->url : '';
    if (preg_match('/wikipedia\.org/', $sourceUrl)) {
        return;
    }

    $this->addFootnotes($articleContent);
}
229 | ||
/**
 * Get the article title as an H1.
 *
 * Starts from the text of the document's first <title> element and tries
 * to strip site names and similar decoration:
 *  - If the title contains a " | " or " - " separator, keep the part before
 *    the last separator; if that leaves fewer than 3 words, use the part
 *    after the first separator instead.
 *  - Otherwise, if it contains ": ", keep the part after the last colon; if
 *    that leaves fewer than 3 words, use the part after the first colon.
 *  - Otherwise, if the title is longer than 150 or shorter than 15
 *    characters and the document has exactly one <h1>, use the <h1> text.
 * If the result has 4 words or fewer, the untouched original title is used.
 * Words are counted by splitting on single spaces.
 *
 * The title is inserted into the H1 as a text node, so characters such as
 * "&" or "<" in the title are never interpreted as markup.
 *
 * @return DOMElement
 */
protected function getArticleTitle()
{
    $curTitle = '';
    $origTitle = '';

    // A document without a <title> element yields null here; leave both titles empty then.
    $titleElement = $this->dom->getElementsByTagName('title')->item(0);
    if ($titleElement !== null) {
        try {
            $curTitle = $origTitle = $this->getInnerText($titleElement);
        } catch (Exception $e) {
            $this->dbg('Could not read the document title: ' . $e->getMessage());
        }
    }

    // DOM strings are UTF-8; count characters rather than bytes so that
    // the length thresholds behave the same for non-ASCII titles.
    $titleLength = mb_strlen($curTitle, 'UTF-8');

    if (preg_match('/ [\|\-] /', $curTitle))
    {
        $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);

        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
        }
    }
    else if (strpos($curTitle, ': ') !== false)
    {
        $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);

        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^:]*[:](.*)/i', '$1', $origTitle);
        }
    }
    else if ($titleLength > 150 || $titleLength < 15)
    {
        $hOnes = $this->dom->getElementsByTagName('h1');
        if ($hOnes->length == 1)
        {
            $curTitle = $this->getInnerText($hOnes->item(0));
        }
    }

    $curTitle = trim($curTitle);

    // Too little left after stripping: fall back to the full original title
    if (count(explode(' ', $curTitle)) <= 4) {
        $curTitle = $origTitle;
    }

    $articleTitle = $this->dom->createElement('h1');
    if ($curTitle != '') {
        // Plain text, not markup: add it as a text node instead of via innerHTML
        $articleTitle->appendChild($this->dom->createTextNode($curTitle));
    }

    return $articleTitle;
}
279 | ||
/**
 * Prepare the HTML document for readability to scrape it.
 *
 * Makes sure a body element exists, tags it with the id
 * "readabilityBody" and removes every <style> element from the document.
 *
 * @return void
 **/
protected function prepDocument()
{
    /**
     * In some cases a body element can't be found (if the HTML is totally hosed for example)
     * so we create a new body node and append it to the document.
     */
    if ($this->body == null) {
        $freshBody = $this->dom->createElement('body');
        $this->dom->documentElement->appendChild($freshBody);
        $this->body = $freshBody;
    }

    $this->body->setAttribute('id', 'readabilityBody');

    /* Remove all style tags, walking the list from the last element to the first */
    $styles = $this->dom->getElementsByTagName('style');
    $remaining = $styles->length;
    while ($remaining-- > 0) {
        $styleTag = $styles->item($remaining);
        $styleTag->parentNode->removeChild($styleTag);
    }

    /*
     * The JavaScript original also turns double br's into p's and font tags
     * into spans at this point by rewriting document.body.innerHTML.
     * In this port that happens in the constructor, where the raw HTML is
     * still available as a string - before it is parsed into a DOM tree.
     */
}
312 | ||
/**
 * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
 *
 * For every link in the article that is not skipped:
 *  - a superscript reference "[n]" is inserted directly after the link,
 *  - the link itself is restyled and given the name "readabilityLink-n",
 *  - a list item holding a back-reference, a copy of the link (labelled
 *    with its title attribute or, failing that, its text) and the link's
 *    host name is added to the footnote list.
 * Links are skipped when their class contains "readability-DoNotFootnote"
 * or their text matches the 'skipFootnoteLink' regexp.
 * The footnote block (div#readability-footnotes) is appended to the
 * article only if at least one footnote was created.
 *
 * @param DOMElement article content to add footnotes to (modified in place)
 * @return void
 **/
public function addFootnotes($articleContent)
{
    $footnotesWrapper = $this->dom->createElement('div');
    $footnotesWrapper->setAttribute('id', 'readability-footnotes');
    $footnotesWrapper->innerHTML = '<h3>References</h3>';

    $articleFootnotes = $this->dom->createElement('ol');
    $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
    $footnotesWrapper->appendChild($articleFootnotes);

    // Take a snapshot first: the node list is live and new <a> elements
    // (the superscript references) are inserted while we iterate.
    $articleLinks = array();
    foreach ($articleContent->getElementsByTagName('a') as $link) {
        $articleLinks[] = $link;
    }

    $linkCount = 0;
    foreach ($articleLinks as $articleLink)
    {
        $linkText = $this->getInnerText($articleLink);

        if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
            continue;
        }

        $linkCount++;

        // Copy the link before it is restyled below, so the footnote keeps the original attributes
        $footnoteLink = $articleLink->cloneNode(true);

        // Host of the link target; relative links fall back to the host of the page URL.
        // @ is kept because parse_url() may emit a warning for malformed URLs on older PHP versions.
        $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
        if (!$linkDomain && isset($this->url)) {
            $linkDomain = @parse_url($this->url, PHP_URL_HOST);
        }

        /** Add a superscript reference after the article link */
        $refLink = $this->dom->createElement('a');
        $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
        $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
        $refLink->setAttribute('class', 'readability-DoNotFootnote');
        $refLink->setAttribute('style', 'color: inherit;');

        // insertBefore() appends when the reference node is null, so this also
        // covers the case where the link is the last child of its parent.
        $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);

        $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
        $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

        $footnote = $this->dom->createElement('li');
        $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

        // Label the footnote link with plain text (title attribute preferred).
        // A text node is used so that "&" or "<" in the label is not parsed as markup.
        $label = $footnoteLink->getAttribute('title');
        if ($label == '') {
            $label = $linkText;
        }
        while ($footnoteLink->firstChild) {
            $footnoteLink->removeChild($footnoteLink->firstChild);
        }
        $footnoteLink->appendChild($this->dom->createTextNode($label));
        $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);

        $footnote->appendChild($footnoteLink);
        if ($linkDomain) {
            $domainNote = $this->dom->createElement('small');
            $domainNote->appendChild($this->dom->createTextNode(' (' . $linkDomain . ')'));
            $footnote->appendChild($domainNote);
        }

        $articleFootnotes->appendChild($footnote);
    }

    if ($linkCount > 0) {
        $articleContent->appendChild($footnotesWrapper);
    }
}
379 | ||
/**
 * Turns every descendant P element whose class attribute is exactly
 * 'readability-styled' back into a plain text node carrying the
 * element's text content.
 *
 * The matches are processed from the last one to the first.
 *
 * @param DOMElement
 * @return void
 */
function revertReadabilityStyledElements($articleContent) {
    $document = $articleContent->ownerDocument;
    $finder = new DOMXPath($document);
    $styledParagraphs = $finder->query('.//p[@class="readability-styled"]', $articleContent);

    $position = $styledParagraphs->length;
    while (--$position >= 0) {
        $paragraph = $styledParagraphs->item($position);
        $plainText = $document->createTextNode($paragraph->textContent);
        $paragraph->parentNode->replaceChild($plainText, $paragraph);
    }
}
399 | ||
/**
 * Prepare the article node for display by running the clean-up helpers
 * over it in a fixed order and dropping paragraphs that are empty.
 *
 * A paragraph is dropped when it contains no img, embed or object
 * element and its inner text is empty. Finally, any <br> that directly
 * precedes a <p> is removed from the serialized HTML.
 *
 * @param DOMElement
 * @return void
 */
function prepArticle($articleContent) {
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Clean out junk from the article content */
    $this->cleanConditionally($articleContent, 'form');
    foreach (array('object', 'h1') as $junkTag) {
        $this->clean($articleContent, $junkTag);
    }

    /**
     * A single h2 is most likely used as the header rather than as a
     * subheader. We already have a header, so remove it.
     ***/
    $h2Count = $articleContent->getElementsByTagName('h2')->length;
    if ($h2Count == 1) {
        $this->clean($articleContent, 'h2');
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* Do these last as the previous stuff may have removed junk that will affect these */
    foreach (array('table', 'ul', 'div') as $containerTag) {
        $this->cleanConditionally($articleContent, $containerTag);
    }

    /* Remove extra paragraphs (walk backwards because the node list is live) */
    $paragraphs = $articleContent->getElementsByTagName('p');
    for ($position = $paragraphs->length - 1; $position >= 0; $position--) {
        $paragraph = $paragraphs->item($position);

        $containsMedia = false;
        foreach (array('img', 'embed', 'object') as $mediaTag) {
            if ($paragraph->getElementsByTagName($mediaTag)->length !== 0) {
                $containsMedia = true;
                break;
            }
        }
        if ($containsMedia) {
            continue;
        }

        if ($this->getInnerText($paragraph, false) == '') {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    try {
        $html = $articleContent->innerHTML;
        $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $html);
    }
    catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
457 | ||
/**
 * Initialize a node for scoring.
 *
 * Attaches a 'readability' attribute (the node's content score) starting
 * at 0, adjusts it by a fixed amount depending on the tag name, and then
 * adds the weight getClassWeight() reports for the node.
 *
 * Tag adjustments: DIV +5; PRE, TD, BLOCKQUOTE +3;
 * ADDRESS, OL, UL, DL, DD, DT, LI, FORM -3; H1-H6, TH -5.
 *
 * @param Element
 * @return void
 **/
protected function initializeNode($node) {
    $tagAdjustments = array(
        'DIV'        => 5,
        'PRE'        => 3,
        'TD'         => 3,
        'BLOCKQUOTE' => 3,
        'ADDRESS'    => -3,
        'OL'         => -3,
        'UL'         => -3,
        'DL'         => -3,
        'DD'         => -3,
        'DT'         => -3,
        'LI'         => -3,
        'FORM'       => -3,
        'H1'         => -5,
        'H2'         => -5,
        'H3'         => -5,
        'H4'         => -5,
        'H5'         => -5,
        'H6'         => -5,
        'TH'         => -5,
    );

    // The attribute node holds our contentScore
    $score = $this->dom->createAttribute('readability');
    $score->value = 0;
    $node->setAttributeNode($score);

    // Upper-case the tag name so the lookup works regardless of how the parser reports it
    $tag = strtoupper($node->tagName);
    if (isset($tagAdjustments[$tag])) {
        $score->value += $tagAdjustments[$tag];
    }

    $score->value += $this->getClassWeight($node);
}
504 | ||
505 | /*** | |
506 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is | |
507 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. | |
508 | * | |
509 | * @return DOMElement | |
510 | **/ | |
511 | protected function grabArticle($page=null) { | |
512 | $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS); | |
513 | if (!$page) $page = $this->dom; | |
514 | $allElements = $page->getElementsByTagName('*'); | |
515 | /** | |
516 | * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs | |
517 | * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) | |
518 | * | |
519 | * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 | |
520 | * TODO: Shouldn't this be a reverse traversal? | |
521 | **/ | |
522 | $node = null; | |
523 | $nodesToScore = array(); | |
524 | for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { | |
525 | //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) { | |
526 | //$node = $targetList->item($nodeIndex); | |
527 | $tagName = strtoupper($node->tagName); | |
528 | /* Remove unlikely candidates */ | |
529 | if ($stripUnlikelyCandidates) { | |
530 | $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id'); | |
531 | if ( | |
532 | preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && | |
533 | !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && | |
534 | $tagName != 'BODY' | |
535 | ) | |
536 | { | |
537 | $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString); | |
538 | //$nodesToRemove[] = $node; | |
539 | $node->parentNode->removeChild($node); | |
540 | $nodeIndex--; | |
541 | continue; | |
542 | } | |
543 | } | |
544 | ||
545 | if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { | |
546 | $nodesToScore[] = $node; | |
547 | } | |
548 | ||
549 | /* Turn all divs that don't have children block level elements into p's */ | |
550 | if ($tagName == 'DIV') { | |
551 | if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { | |
552 | //$this->dbg('Altering div to p'); | |
553 | $newNode = $this->dom->createElement('p'); | |
554 | try { | |
555 | $newNode->innerHTML = $node->innerHTML; | |
556 | //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node); | |
557 | $node->parentNode->replaceChild($newNode, $node); | |
558 | $nodeIndex--; | |
559 | $nodesToScore[] = $node; // or $newNode? | |
560 | } | |
561 | catch(Exception $e) { | |
562 | $this->dbg('Could not alter div to p, reverting back to div.: ' . $e); | |
563 | } | |
564 | } | |
565 | else | |
566 | { | |
567 | /* EXPERIMENTAL */ | |
568 | // TODO: change these p elements back to text nodes after processing | |
569 | for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { | |
570 | $childNode = $node->childNodes->item($i); | |
571 | if ($childNode->nodeType == 3) { // XML_TEXT_NODE | |
572 | //$this->dbg('replacing text node with a p tag with the same content.'); | |
573 | $p = $this->dom->createElement('p'); | |
574 | $p->innerHTML = $childNode->nodeValue; | |
575 | $p->setAttribute('style', 'display: inline;'); | |
576 | $p->setAttribute('class', 'readability-styled'); | |
577 | $childNode->parentNode->replaceChild($p, $childNode); | |
578 | } | |
579 | } | |
580 | } | |
581 | } | |
582 | } | |
583 | ||
584 | /** | |
585 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. | |
586 | * Then add their score to their parent node. | |
587 | * | |
588 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. | |
589 | **/ | |
590 | $candidates = array(); | |
591 | for ($pt=0; $pt < count($nodesToScore); $pt++) { | |
592 | $parentNode = $nodesToScore[$pt]->parentNode; | |
593 | // $grandParentNode = $parentNode ? $parentNode->parentNode : null; | |
594 | $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null); | |
595 | $innerText = $this->getInnerText($nodesToScore[$pt]); | |
596 | ||
597 | if (!$parentNode || !isset($parentNode->tagName)) { | |
598 | continue; | |
599 | } | |
600 | ||
601 | /* If this paragraph is less than 25 characters, don't even count it. */ | |
602 | if(strlen($innerText) < 25) { | |
603 | continue; | |
604 | } | |
605 | ||
606 | /* Initialize readability data for the parent. */ | |
607 | if (!$parentNode->hasAttribute('readability')) | |
608 | { | |
609 | $this->initializeNode($parentNode); | |
610 | $candidates[] = $parentNode; | |
611 | } | |
612 | ||
613 | /* Initialize readability data for the grandparent. */ | |
614 | if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) | |
615 | { | |
616 | $this->initializeNode($grandParentNode); | |
617 | $candidates[] = $grandParentNode; | |
618 | } | |
619 | ||
620 | $contentScore = 0; | |
621 | ||
622 | /* Add a point for the paragraph itself as a base. */ | |
623 | $contentScore++; | |
624 | ||
625 | /* Add points for any commas within this paragraph */ | |
626 | $contentScore += count(explode(',', $innerText)); | |
627 | ||
628 | /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ | |
629 | $contentScore += min(floor(strlen($innerText) / 100), 3); | |
630 | ||
631 | /* Add the score to the parent. The grandparent gets half. */ | |
632 | $parentNode->getAttributeNode('readability')->value += $contentScore; | |
633 | ||
634 | if ($grandParentNode) { | |
635 | $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; | |
636 | } | |
637 | } | |
638 | ||
639 | /** | |
640 | * After we've calculated scores, loop through all of the possible candidate nodes we found | |
641 | * and find the one with the highest score. | |
642 | **/ | |
643 | $topCandidate = null; | |
644 | for ($c=0, $cl=count($candidates); $c < $cl; $c++) | |
645 | { | |
646 | /** | |
647 | * Scale the final candidates score based on link density. Good content should have a | |
648 | * relatively small link density (5% or less) and be mostly unaffected by this operation. | |
649 | **/ | |
650 | $readability = $candidates[$c]->getAttributeNode('readability'); | |
651 | $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c])); | |
652 | ||
653 | $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value); | |
654 | ||
655 | if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) { | |
656 | $topCandidate = $candidates[$c]; | |
657 | } | |
658 | } | |
659 | ||
660 | /** | |
661 | * If we still have no top candidate, just use the body as a last resort. | |
662 | * We also have to copy the body node so it is something we can modify. | |
663 | **/ | |
664 | if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') | |
665 | { | |
666 | $topCandidate = $this->dom->createElement('div'); | |
667 | $topCandidate->innerHTML = ($page instanceof DOMDocument) ? $page->saveXML($page->documentElement) : $page->innerHTML; | |
668 | $page->innerHTML = ''; | |
669 | $page->appendChild($topCandidate); | |
670 | $this->initializeNode($topCandidate); | |
671 | } | |
672 | ||
673 | /** | |
674 | * Now that we have the top candidate, look through its siblings for content that might also be related. | |
675 | * Things like preambles, content split by ads that we removed, etc. | |
676 | **/ | |
677 | $articleContent = $this->dom->createElement('div'); | |
678 | $articleContent->setAttribute('id', 'readability-content'); | |
679 | $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2); | |
680 | $siblingNodes = $topCandidate->parentNode->childNodes; | |
681 | ||
682 | for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++) | |
683 | { | |
684 | $siblingNode = $siblingNodes->item($s); | |
685 | $append = false; | |
686 | ||
687 | $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); | |
688 | ||
689 | //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); | |
690 | ||
691 | if ($siblingNode === $topCandidate) | |
692 | // or if ($siblingNode->isSameNode($topCandidate)) | |
693 | { | |
694 | $append = true; | |
695 | } | |
696 | ||
697 | $contentBonus = 0; | |
698 | /* Give a bonus if sibling nodes and top candidates have the example same classname */ | |
699 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { | |
700 | $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2; | |
701 | } | |
702 | ||
703 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) | |
704 | { | |
705 | $append = true; | |
706 | } | |
707 | ||
708 | if (strtoupper($siblingNode->nodeName) == 'P') { | |
709 | $linkDensity = $this->getLinkDensity($siblingNode); | |
710 | $nodeContent = $this->getInnerText($siblingNode); | |
711 | $nodeLength = strlen($nodeContent); | |
712 | ||
713 | if ($nodeLength > 80 && $linkDensity < 0.25) | |
714 | { | |
715 | $append = true; | |
716 | } | |
717 | else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) | |
718 | { | |
719 | $append = true; | |
720 | } | |
721 | } | |
722 | ||
723 | if ($append) | |
724 | { | |
725 | $this->dbg('Appending node: ' . $siblingNode->nodeName); | |
726 | ||
727 | $nodeToAppend = null; | |
728 | $sibNodeName = strtoupper($siblingNode->nodeName); | |
729 | if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { | |
730 | /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ | |
731 | ||
732 | $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.'); | |
733 | $nodeToAppend = $this->dom->createElement('div'); | |
734 | try { | |
735 | $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); | |
736 | $nodeToAppend->innerHTML = $siblingNode->innerHTML; | |
737 | } | |
738 | catch(Exception $e) | |
739 | { | |
740 | $this->dbg('Could not alter siblingNode to div, reverting back to original.'); | |
741 | $nodeToAppend = $siblingNode; | |
742 | $s--; | |
743 | $sl--; | |
744 | } | |
745 | } else { | |
746 | $nodeToAppend = $siblingNode; | |
747 | $s--; | |
748 | $sl--; | |
749 | } | |
750 | ||
751 | /* To ensure a node does not interfere with readability styles, remove its classnames */ | |
752 | $nodeToAppend->removeAttribute('class'); | |
753 | ||
754 | /* Append sibling and subtract from our list because it removes the node when you append to another node */ | |
755 | $articleContent->appendChild($nodeToAppend); | |
756 | } | |
757 | } | |
758 | ||
759 | /** | |
760 | * So we have all of the content that we need. Now we clean it up for presentation. | |
761 | **/ | |
762 | $this->prepArticle($articleContent); | |
763 | ||
764 | /** | |
765 | * Now that we've gone through the full algorithm, check to see if we got any meaningful content. | |
766 | * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher | |
767 | * likelihood of finding the content, and the sieve approach gives us a higher likelihood of | |
768 | * finding the -right- content. | |
769 | **/ | |
770 | if (strlen($this->getInnerText($articleContent, false)) < 250) | |
771 | { | |
772 | $this->body->innerHTML = $this->bodyCache; | |
773 | ||
774 | if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { | |
775 | $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); | |
776 | return $this->grabArticle($this->body); | |
777 | } | |
778 | else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { | |
779 | $this->removeFlag(self::FLAG_WEIGHT_CLASSES); | |
780 | return $this->grabArticle($this->body); | |
781 | } | |
782 | else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { | |
783 | $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); | |
784 | return $this->grabArticle($this->body); | |
785 | } | |
786 | else { | |
787 | return false; | |
788 | } | |
789 | } | |
790 | return $articleContent; | |
791 | } | |
792 | ||
/**
 * Strip every <script> element found beneath the given node.
 *
 * @param DOMElement $doc node that is searched with getElementsByTagName('script')
 * @return void
 */
public function removeScripts($doc) {
    $scriptNodes = $doc->getElementsByTagName('script');

    /* Walk the node list from its last entry to its first so that removing
       an entry never shifts the indexes that are still to be visited. */
    $index = $scriptNodes->length;
    while ($index-- > 0) {
        $script = $scriptNodes->item($index);
        $script->parentNode->removeChild($script);
    }
}
806 | ||
/**
 * Return the text content of a node with leading/trailing whitespace trimmed.
 * When $normalizeSpaces is true, every match of the 'normalize' regexp in the
 * trimmed text is additionally replaced by a single space.
 *
 * @param DOMElement $e node whose textContent is read
 * @param boolean $normalizeSpaces (default: true)
 * @return string empty string when the node has no (or empty) textContent
 **/
public function getInnerText($e, $normalizeSpaces=true) {
    $rawText = isset($e->textContent) ? $e->textContent : null;
    if ($rawText === null || $rawText == '') {
        return '';
    }

    $trimmed = trim($rawText);

    return $normalizeSpaces
        ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
        : $trimmed;
}
830 | ||
/**
 * Count how often the string $s occurs in the inner text of node $e.
 *
 * @param DOMElement $e node whose text (as returned by getInnerText) is searched
 * @param string $s what to count. Default is ","
 * @return number (integer)
 **/
public function getCharCount($e, $s=',') {
    $innerText = $this->getInnerText($e);

    return substr_count($innerText, $s);
}
841 | ||
/**
 * Remove the style attribute on $e itself and on every element under it.
 *
 * @param DOMElement $e root of the subtree to clean (a DOMDocument is also
 *                      accepted; in that case only its elements are cleaned)
 * @return void
 */
public function cleanStyles($e) {
    /* getElementsByTagName('*') only returns descendants, so the root has to
       be handled explicitly. The instanceof check skips roots that cannot
       carry attributes (e.g. a DOMDocument). */
    if ($e instanceof DOMElement) {
        $e->removeAttribute('style');
    }

    foreach ($e->getElementsByTagName('*') as $elem) {
        $elem->removeAttribute('style');
    }
}
854 | ||
/**
 * Get the density of links as a fraction of the content.
 * This is the length of the text found inside <a> elements divided by the
 * length of the node's whole inner text.
 *
 * @param DOMElement $e
 * @return number the quotient described above, or integer 0 when the node has no text
 */
public function getLinkDensity($e) {
    $anchors = $e->getElementsByTagName('a');
    $totalChars = strlen($this->getInnerText($e));

    $linkedChars = 0;
    foreach ($anchors as $anchor) {
        $linkedChars += strlen($this->getInnerText($anchor));
    }

    /* Guard against dividing by zero for nodes without any text. */
    return ($totalChars > 0) ? $linkedChars / $totalChars : 0;
}
876 | ||
/**
 * Get an element's class/id weight. The class and id attribute values are
 * each tested against the 'negative' and 'positive' regular expressions:
 * a negative match subtracts 25, a positive match adds 25.
 *
 * Always yields 0 while FLAG_WEIGHT_CLASSES is not active.
 *
 * @param DOMElement $e
 * @return number (Integer)
 */
public function getClassWeight($e) {
    $weight = 0;

    if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
        /* The same scoring applies to both attributes, class first, then id. */
        foreach (array('class', 'id') as $attrName) {
            if (!$e->hasAttribute($attrName)) {
                continue;
            }
            $attrValue = $e->getAttribute($attrName);
            if ($attrValue == '') {
                continue;
            }

            if (preg_match($this->regexps['negative'], $attrValue)) {
                $weight -= 25;
            }
            if (preg_match($this->regexps['positive'], $attrValue)) {
                $weight += 25;
            }
        }
    }

    return $weight;
}
914 | ||
/**
 * Remove extraneous break tags from a node: every match of the 'killBreaks'
 * regular expression in the node's markup is replaced by a single '<br />'.
 *
 * NOTE(review): relies on $node exposing a readable and writable innerHTML
 * property, which stock DOMElement does not have - presumably supplied by an
 * extended element class registered elsewhere; confirm.
 *
 * @param DOMElement $node
 * @return void
 */
public function killBreaks($node) {
    $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
}
926 | ||
/**
 * Remove all elements named $tag from beneath node $e.
 * For 'object' and 'embed' tags, an element is kept when its attribute
 * values or its inner markup match the 'video' regular expression
 * (youtube/vimeo videos - people love movies).
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function clean($e, $tag) {
    $targetList = $e->getElementsByTagName($tag);
    $isEmbed = ($tag == 'object' || $tag == 'embed');

    /* Go through the list back to front so removals do not disturb the
       indexes that are still to be visited. */
    $position = $targetList->length;
    while (--$position >= 0) {
        $target = $targetList->item($position);

        if ($isEmbed) {
            /* Join all attribute values into one string, each followed by '|'. */
            $attributeValues = '';
            foreach ($target->attributes as $attribute) {
                $attributeValues .= $attribute->value . '|';
            }

            /* Keep the element if either its attributes or, failing that,
               the markup inside it points to a video. */
            if (preg_match($this->regexps['video'], $attributeValues)
                || preg_match($this->regexps['video'], $target->innerHTML)) {
                continue;
            }
        }

        $target->parentNode->removeChild($target);
    }
}
960 | ||
/**
 * Clean an element of all tags of type "tag" if they look fishy.
 * "Fishy" is an algorithm based on content length, classnames,
 * link density, number of images & embeds, etc.
 *
 * Does nothing while FLAG_CLEAN_CONDITIONALLY is not active.
 *
 * An element is removed when its class weight plus its 'readability' score is
 * negative. Otherwise, if its text contains fewer than 10 commas, it is
 * removed when any of the following holds:
 *  - it contains more images than paragraphs
 *  - it contains more than (paragraphs + 100) list items and $tag is not ul/ol
 *  - it contains more inputs than a third of its paragraphs (rounded down)
 *  - its text is shorter than 25 characters and it has no image or more than 2
 *  - its link density exceeds 0.2 (class weight below 25) or 0.5 (weight >= 25)
 *  - it holds exactly one non-video embed with less than 75 characters of
 *    text, or more than one non-video embed
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function cleanConditionally($e, $tag) {
    if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
        return;
    }

    $tagsList = $e->getElementsByTagName($tag);

    /**
     * Gather counts for other typical elements embedded within.
     * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
     *
     * TODO: Consider taking into account original contentScore here.
     */
    for ($i = $tagsList->length - 1; $i >= 0; $i--) {
        /* Fetch the node once instead of re-querying the node list for every access. */
        $node = $tagsList->item($i);

        $weight = $this->getClassWeight($node);
        $hasScore = $node->hasAttribute('readability');
        $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;

        $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));

        if ($weight + $contentScore < 0) {
            $node->parentNode->removeChild($node);
            continue;
        }

        if ($this->getCharCount($node, ',') >= 10) {
            /* Plenty of commas: treated as real content, leave it alone. */
            continue;
        }

        /**
         * If there are not very many commas, and the number of
         * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
         **/
        $p = $node->getElementsByTagName('p')->length;
        $img = $node->getElementsByTagName('img')->length;
        $li = $node->getElementsByTagName('li')->length - 100;
        $input = $node->getElementsByTagName('input')->length;

        /* Only embeds that are NOT recognised videos count against the
           element; youtube/vimeo embeds are deliberately let through (see
           clean()). The previous code counted the video embeds instead,
           which inverted the check. */
        $embedCount = 0;
        foreach ($node->getElementsByTagName('embed') as $embed) {
            if (!preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
                $embedCount++;
            }
        }

        $linkDensity = $this->getLinkDensity($node);
        $contentLength = strlen($this->getInnerText($node));

        $toRemove = ($img > $p)
            || ($li > $p && $tag != 'ul' && $tag != 'ol')
            || ($input > floor($p / 3))
            || ($contentLength < 25 && ($img === 0 || $img > 2))
            || ($weight < 25 && $linkDensity > 0.2)
            || ($weight >= 25 && $linkDensity > 0.5)
            || (($embedCount == 1 && $contentLength < 75) || $embedCount > 1);

        if ($toRemove) {
            $node->parentNode->removeChild($node);
        }
    }
}
1037 | ||
/**
 * Clean out spurious headers from an Element. An h1 or h2 is removed when its
 * class weight is negative or its link density is above 0.33.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanHeaders($e) {
    foreach (array('h1', 'h2') as $headerTag) {
        $headers = $e->getElementsByTagName($headerTag);

        /* Back-to-front so removals do not shift the entries still to visit. */
        $position = $headers->length;
        while ($position-- > 0) {
            $header = $headers->item($position);
            $isSpurious = $this->getClassWeight($header) < 0
                || $this->getLinkDensity($header) > 0.33;

            if ($isSpurious) {
                $header->parentNode->removeChild($header);
            }
        }
    }
}
1054 | ||
/**
 * Tell whether the given flag bit(s) are set in $this->flags.
 *
 * @param number $flag bit mask to test
 * @return boolean true when the bitwise AND of the flags and $flag is greater than 0
 */
public function flagIsActive($flag) {
    $matchingBits = $this->flags & $flag;

    return $matchingBits > 0;
}
1058 | ||
/**
 * Switch on the given flag bit(s) in $this->flags.
 *
 * @param number $flag bit mask to set
 * @return void
 */
public function addFlag($flag) {
    $this->flags |= $flag;
}
1062 | ||
/**
 * Switch off the given flag bit(s) in $this->flags.
 *
 * @param number $flag bit mask to clear
 * @return void
 */
public function removeFlag($flag) {
    $this->flags &= ~$flag;
}
1066 | } | |
1067 | ?> |