From 6475fc7e06121ff948264b990280b2f488a86aa8 Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Wed, 8 Jul 2015 10:30:35 +0300 Subject: [PATCH] af_redditimgur: check if document is text/html before trying to readability parse it --- plugins/af_redditimgur/init.php | 67 +++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/plugins/af_redditimgur/init.php b/plugins/af_redditimgur/init.php index 265999e2..ae500c52 100644 --- a/plugins/af_redditimgur/init.php +++ b/plugins/af_redditimgur/init.php @@ -240,50 +240,69 @@ class Af_RedditImgur extends Plugin { $found = $this->inline_stuff($article, $doc, $xpath); - if (!$found && $this->host->get($this, "enable_readability") && mb_strlen(strip_tags($article["content"])) <= 150) { + if (function_exists("curl_init") && !$found && $this->host->get($this, "enable_readability") && + mb_strlen(strip_tags($article["content"])) <= 150) { + if (!class_exists("Readability")) require_once(__DIR__ . "/classes/Readability.php"); $content_link = $xpath->query("(//a[contains(., '[link]')])")->item(0); if ($content_link && strpos($content_link->getAttribute("href"), "reddit.com") === FALSE) { - $tmp = fetch_file_contents($content_link->getAttribute("href")); + /* link may lead to a huge video file or whatever, we need to check content type before trying to + parse it which p much requires curl */ - if ($tmp) { - $r = new Readability($tmp, $content_link->getAttribute("href")); + $ch = curl_init($content_link->getAttribute("href")); + curl_setopt($ch, CURLOPT_TIMEOUT, 5); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); + curl_setopt($ch, CURLOPT_HEADER, true); + curl_setopt($ch, CURLOPT_NOBODY, true); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, + !ini_get("safe_mode") && !ini_get("open_basedir")); + curl_setopt($ch, CURLOPT_USERAGENT, SELF_USER_AGENT); - if ($r->init()) { - //$article["content"] = $r->articleContent->innerHTML . "
" . $article["content"]; + @$result = curl_exec($ch); + $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE); - $tmpxpath = new DOMXPath($r->dom); + if ($content_type && strpos($content_type, "text/html") !== FALSE) { - $entries = $tmpxpath->query('(//a[@href]|//img[@src])'); + $tmp = fetch_file_contents($content_link->getAttribute("href")); - foreach ($entries as $entry) { - if ($entry->hasAttribute("href")) { - $entry->setAttribute("href", - rewrite_relative_url($content_link->getAttribute("href"), $entry->getAttribute("href"))); + if ($tmp) { + $r = new Readability($tmp, $content_link->getAttribute("href")); - } + if ($r->init()) { + //$article["content"] = $r->articleContent->innerHTML . "
" . $article["content"]; - if ($entry->hasAttribute("src")) { - $entry->setAttribute("src", - rewrite_relative_url($content_link->getAttribute("href"), $entry->getAttribute("src"))); + $tmpxpath = new DOMXPath($r->dom); - } + $entries = $tmpxpath->query('(//a[@href]|//img[@src])'); - } + foreach ($entries as $entry) { + if ($entry->hasAttribute("href")) { + $entry->setAttribute("href", + rewrite_relative_url($content_link->getAttribute("href"), $entry->getAttribute("href"))); + + } - $article["content"] = $r->articleContent->innerHTML . "
" . $article["content"]; + if ($entry->hasAttribute("src")) { + $entry->setAttribute("src", + rewrite_relative_url($content_link->getAttribute("href"), $entry->getAttribute("src"))); - $doc = new DOMDocument(); - @$doc->loadHTML($article["content"]); - $xpath = new DOMXPath($doc); + } - $found = $this->inline_stuff($article, $doc, $xpath); + } + + $article["content"] = $r->articleContent->innerHTML . "
" . $article["content"]; + + $doc = new DOMDocument(); + @$doc->loadHTML($article["content"]); + $xpath = new DOMXPath($doc); + + $found = $this->inline_stuff($article, $doc, $xpath); + } } } - } } -- 2.39.5