git.wh0rd.org Git - tt-rss.git/blob - classes/feedparser.php
Feedparser: Store libXML fatal error messages in an array, repair error reporting
[tt-rss.git] / classes / feedparser.php
1 <?php
/**
 * Parses an RSS 2.0, RSS 1.0 (RDF) or Atom (0.3 / 1.0) document.
 *
 * Usage: construct the parser with the raw XML payload, call init() to detect
 * the feed type and extract its metadata/items, then check error() before
 * using get_title(), get_link(), get_items() or get_links().
 */
class FeedParser {
	/** DOMDocument holding the parsed payload (left empty when parsing failed). */
	private $doc;
	/** First fatal problem encountered, or null when nothing went wrong. */
	private $error;
	/** Formatted messages of every fatal libxml error seen while parsing. */
	private $libxml_errors = array();
	/** FeedItem_* objects collected by init(). */
	private $items;
	private $link;
	private $title;
	/** One of the FEED_* constants, set by init(). */
	private $type;
	/** DOMXPath bound to $doc, created by init(). */
	private $xpath;

	const FEED_RDF = 0;
	const FEED_RSS = 1;
	const FEED_ATOM = 2;

	/**
	 * Loads the XML payload, retrying with repaired input for two known
	 * recoverable libxml failures. Fatal libxml errors that remain after the
	 * last attempt are available through error() and errors().
	 *
	 * @param string $data raw feed XML
	 */
	public function __construct($data) {
		libxml_use_internal_errors(true);
		mb_substitute_character("none");

		$error = $this->load_xml($data);

		// libxml error 32 (unsupported encoding): libxml compiled without iconv?
		// Transcode to UTF-8 ourselves and rewrite the XML declaration to match.
		if ($error && $error->code == 32) {
			if (preg_match('/^(<\?xml[\t\n\r ].*?encoding[\t\n\r ]*=[\t\n\r ]*["\'])(.+?)(["\'].*?\?>)/s', $data, $matches) === 1) {
				$converted = mb_convert_encoding($data, 'UTF-8', $matches[2]);

				if ($converted) {
					$converted = preg_replace('/^<\?xml[\t\n\r ].*?\?>/s', $matches[1] . "UTF-8" . $matches[3], $converted);
				}

				// keep the original payload when the conversion did not work out
				if ($converted) {
					$data = $converted;
					$error = $this->load_xml($data);
				}
			}
		}

		// libxml error 9 (invalid character): some terrible invalid unicode entity?
		if ($error) {
			foreach (libxml_get_errors() as $err) {
				if ($err->code == 9) {
					// remove dangling bytes
					$cleaned = mb_convert_encoding($data, 'UTF-8', 'UTF-8');

					// apparently not all UTF-8 characters are valid for XML
					$cleaned = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u', ' ', $cleaned);

					if ($cleaned) {
						$error = $this->load_xml($cleaned);
					}
					break;
				}
			}
		}

		if ($error) {
			foreach (libxml_get_errors() as $err) {
				if ($err->level == LIBXML_ERR_FATAL) {
					$message = $this->format_error($err);

					// error() only ever reports the first fatal error
					if (!isset($this->error)) {
						$this->error = $message;
					}

					$this->libxml_errors[] = $message;
				}
			}
		}
		libxml_clear_errors();

		$this->items = array();
	}

	/**
	 * Replaces $this->doc with a fresh document loaded from $data.
	 *
	 * The libxml error buffer is cleared first, so afterwards it only holds
	 * errors produced by this attempt.
	 *
	 * @param string $data XML to parse
	 * @return LibXMLError|false last libxml error of this attempt, false if none
	 */
	private function load_xml($data) {
		libxml_clear_errors();
		$this->doc = new DOMDocument();

		// loadXML() rejects empty input (warning, or ValueError on PHP 8); keep
		// the empty document instead so that init() reports an unsupported feed
		if ($data === null || $data === false || $data === '') {
			return false;
		}

		$this->doc->loadXML($data);

		return libxml_get_last_error();
	}

	/**
	 * Detects the feed type and fills in title, link and items.
	 *
	 * When no supported root element is found, error() is set to
	 * "Unknown/unsupported feed type" unless an earlier error is already stored.
	 */
	public function init() {
		$xpath = new DOMXPath($this->doc);
		$xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
		$xpath->registerNamespace('atom03', 'http://purl.org/atom/ns#');
		$xpath->registerNamespace('media', 'http://search.yahoo.com/mrss/');
		$xpath->registerNamespace('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#');
		$xpath->registerNamespace('slash', 'http://purl.org/rss/1.0/modules/slash/');
		$xpath->registerNamespace('dc', 'http://purl.org/dc/elements/1.1/');
		$xpath->registerNamespace('content', 'http://purl.org/rss/1.0/modules/content/');

		$this->xpath = $xpath;

		$roots = $xpath->query("(//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF)");

		// query() hands back a node list object even when nothing matched, so
		// the list length has to be checked rather than the list itself
		$root = ($roots && $roots->length > 0) ? $roots->item(0) : null;

		if (!$root) {
			$this->set_unsupported_error();
			return;
		}

		switch (mb_strtolower($root->tagName)) {
		case "rdf:rdf":
			$this->type = self::FEED_RDF;
			$this->init_rdf();
			break;
		case "channel":
			$this->type = self::FEED_RSS;
			$this->init_rss();
			break;
		case "feed":
			$this->type = self::FEED_ATOM;
			$this->init_atom();
			break;
		default:
			$this->set_unsupported_error();
		}
	}

	/** Records the "unsupported feed" error unless another error is already set. */
	private function set_unsupported_error() {
		if (!isset($this->error)) {
			$this->error = "Unknown/unsupported feed type";
		}
	}

	/**
	 * Returns the first node matching an XPath expression, or null.
	 *
	 * @param string $query XPath expression
	 * @return DOMNode|null
	 */
	private function first_node($query) {
		$nodes = $this->xpath->query($query);

		return $nodes ? $nodes->item(0) : null;
	}

	/** Extracts title, link and entries of an Atom 1.0 feed, falling back to Atom 0.3. */
	private function init_atom() {
		$title = $this->first_node("//atom:feed/atom:title");

		if (!$title) {
			$title = $this->first_node("//atom03:feed/atom03:title");
		}

		if ($title) {
			$this->title = $title->nodeValue;
		}

		$link = $this->first_node("//atom:feed/atom:link[not(@rel)]");

		if (!$link) {
			$link = $this->first_node("//atom03:feed/atom03:link[not(@rel)]");
		}

		if ($link && $link->hasAttributes()) {
			$this->link = $link->getAttribute("href");
		}

		$articles = $this->xpath->query("//atom:entry");

		if (!$articles || $articles->length == 0) {
			$articles = $this->xpath->query("//atom03:entry");
		}

		foreach ($articles as $article) {
			array_push($this->items, new FeedItem_Atom($article, $this->doc, $this->xpath));
		}
	}

	/** Extracts title, link and items of an RSS feed. */
	private function init_rss() {
		$title = $this->first_node("//channel/title");

		if ($title) {
			$this->title = $title->nodeValue;
		}

		$link = $this->first_node("//channel/link");

		if ($link) {
			// a href attribute, when present, wins over the element text
			if ($link->getAttribute("href")) {
				$this->link = $link->getAttribute("href");
			} else if ($link->nodeValue) {
				$this->link = $link->nodeValue;
			}
		}

		foreach ($this->xpath->query("//channel/item") as $article) {
			array_push($this->items, new FeedItem_RSS($article, $this->doc, $this->xpath));
		}
	}

	/** Extracts title, link and items of an RSS 1.0 (RDF) feed. */
	private function init_rdf() {
		// RSS 1.0 elements live in their own namespace; register a prefix for it
		$this->xpath->registerNamespace('rssfake', 'http://purl.org/rss/1.0/');

		$title = $this->first_node("//rssfake:channel/rssfake:title");

		if ($title) {
			$this->title = $title->nodeValue;
		}

		$link = $this->first_node("//rssfake:channel/rssfake:link");

		if ($link) {
			$this->link = $link->nodeValue;
		}

		foreach ($this->xpath->query("//rssfake:item") as $article) {
			array_push($this->items, new FeedItem_RSS($article, $this->doc, $this->xpath));
		}
	}

	/**
	 * Renders a libxml error as a one-line message.
	 *
	 * @param LibXMLError|false|null $error
	 * @return string formatted message, "" when no error object was given
	 */
	public function format_error($error) {
		if (!$error) {
			return "";
		}

		return sprintf("LibXML error %s at line %d (column %d): %s",
			$error->code, $error->line, $error->column,
			$error->message);
	}

	/** @return string|null first error recorded by the constructor or init(), null if none */
	public function error() {
		return $this->error;
	}

	/** @return array formatted messages of all fatal libxml errors from parsing */
	public function errors() {
		return $this->libxml_errors;
	}

	/** @return string|null feed link found by init() */
	public function get_link() {
		return $this->link;
	}

	/** @return string|null feed title found by init() */
	public function get_title() {
		return $this->title;
	}

	/** @return array items collected by init() */
	public function get_items() {
		return $this->items;
	}

	/**
	 * Returns the href of atom:link elements with the given rel value.
	 *
	 * Atom feeds are searched at feed level, RSS feeds anywhere in the
	 * document; other feed types (and a parser on which init() has not run)
	 * yield an empty array.
	 *
	 * @param string $rel rel value to match; a falsy value returns every link
	 * @return array list of href attribute values
	 */
	public function get_links($rel) {
		$rv = array();

		// init() has not been called, so there is nothing to query
		if (!$this->xpath) {
			return $rv;
		}

		if ($this->type === self::FEED_ATOM) {
			$query = "//atom:feed/atom:link";
		} else if ($this->type === self::FEED_RSS) {
			$query = "//atom:link";
		} else {
			return $rv;
		}

		foreach ($this->xpath->query($query) as $link) {
			if (!$rel || ($link->hasAttribute('rel') && $link->getAttribute('rel') == $rel)) {
				array_push($rv, $link->getAttribute('href'));
			}
		}

		return $rv;
	}
}