git.wh0rd.org - tt-rss.git/commitdiff
Feedparser: Change handling of libxml error 9 (cycle all errors)
author wltb <wltb@localhost.com>
Sun, 8 Sep 2013 22:44:55 +0000 (00:44 +0200)
committer wltb <wltb@localhost.com>
Tue, 24 Sep 2013 14:38:28 +0000 (16:38 +0200)
classes/feedparser.php

index 4a2c6c2da8933d2bdad8d5bdfb21b185119d7381..22052bdb6c8352b5981f5d3ba08918cebf1144fc 100644 (file)
@@ -23,18 +23,12 @@ class FeedParser {
                $error = libxml_get_last_error();
 
                // libxml compiled without iconv?
-               if ($error && ($error->code == 32 || $error->code == 9)) {
-                       if (preg_match('/^(<\?xml[\t\n\r ].*?encoding=["\'])(.+?)(["\'].*?\?>)/s', $data, $matches) === 1) {
-                               $enc = $matches[2];
-
-                               $data = mb_convert_encoding($data, 'UTF-8', $enc);
+               if ($error && $error->code == 32) {
+                       if (preg_match('/^(<\?xml[\t\n\r ].*?encoding[\t\n\r ]*=[\t\n\r ]*["\'])(.+?)(["\'].*?\?>)/s', $data, $matches) === 1) {
+                               $data = mb_convert_encoding($data, 'UTF-8', $matches[2]);
 
                                $data = preg_replace('/^<\?xml[\t\n\r ].*?\?>/s', $matches[1] . "UTF-8" . $matches[3] , $data);
 
-
-                               // apparently not all UTF-8 characters are valid for XML
-                               $data = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u', ' ', $data);
-
                                if ($data) {
                                        libxml_clear_errors();
 
@@ -43,20 +37,29 @@ class FeedParser {
 
                                        $error = libxml_get_last_error();
                                }
-                  }
+                       }
                }
 
                // some terrible invalid unicode entity?
-               if ($error && $error->code == 9) {
-                       $data = mb_convert_encoding($data, 'UTF-8', 'UTF-8');
+               if ($error) {
+                       foreach(libxml_get_errors() as $err) {
+                               if ($err->code == 9) {
+                                       // remove dangling bytes
+                                       $data = mb_convert_encoding($data, 'UTF-8', 'UTF-8');
+                                       
+                                       // apparently not all UTF-8 characters are valid for XML
+                                       $data = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u', ' ', $data);
 
-                       if ($data) {
-                               libxml_clear_errors();
+                                       if ($data) {
+                                               libxml_clear_errors();
 
-                               $this->doc = new DOMDocument();
-                               $this->doc->loadXML($data);
+                                               $this->doc = new DOMDocument();
+                                               $this->doc->loadXML($data);
 
-                               $error = libxml_get_last_error();
+                                               $error = libxml_get_last_error();
+                                       }
+                                       break;
+                               }
                        }
                }