git.wh0rd.org Git - tt-rss.git/commitdiff
af_readability: add a workaround for meta charset html pages
author: Andrew Dolgov <noreply@fakecake.org>
Tue, 7 Jul 2015 07:15:08 +0000 (10:15 +0300)
committer: Andrew Dolgov <noreply@fakecake.org>
Tue, 7 Jul 2015 07:15:08 +0000 (10:15 +0300)
plugins/af_readability/init.php

index 6cdd28fafe0464ee9b63c4915ae61a10dc7b1c7d..15b88d32c7a51c0cfbf696e277098358b6822c00 100644 (file)
@@ -101,6 +101,19 @@ class Af_Readability extends Plugin {
                $tmp = fetch_file_contents($article["link"]);
 
                if ($tmp) {
+                       $tmpdoc = new DOMDocument("1.0", "UTF-8");
+                       $tmpdoc->loadHTML($tmp);
+
+                       if ($tmpdoc->encoding != 'UTF-8') {
+                               $tmpxpath = new DOMXPath($tmpdoc);
+
+                               foreach ($tmpxpath->query("//meta") as $elem) {
+                                       $elem->parentNode->removeChild($elem);
+                               }
+
+                               $tmp = $tmpdoc->saveHTML();
+                       }
+
                        $r = new Readability($tmp, $article["link"]);
 
                        if ($r->init()) {