]> git.wh0rd.org - tt-rss.git/blob - lib/magpierss/rss_fetch.inc
695d3b69e4031cbf7147c80f4210be9b42dcb037
[tt-rss.git] / lib / magpierss / rss_fetch.inc
1 <?php
2 /*
3 * Project: MagpieRSS: a simple RSS integration tool
4 * File: rss_fetch.inc, a simple functional interface
5 to fetching and parsing RSS files, via the
6 function fetch_rss()
7 * Author: Kellan Elliott-McCrea <kellan@protest.net>
8 * License: GPL
9 *
10 * The latest version of MagpieRSS can be obtained from:
11 * http://magpierss.sourceforge.net
12 *
13 * For questions, help, comments, discussion, etc., please join the
14 * Magpie mailing list:
15 * magpierss-general@lists.sourceforge.net
16 *
17 */
18
// Setup MAGPIE_DIR for use on hosts that don't include
// the current path in include_path.
// with thanks to rajiv and smarty
//
// Both constants are only defined when the including script has not
// already provided its own values.
defined('DIR_SEP') or define('DIR_SEP', DIRECTORY_SEPARATOR);
defined('MAGPIE_DIR') or define('MAGPIE_DIR', dirname(__FILE__) . DIR_SEP);

// parser and cache live next to this file
require_once( MAGPIE_DIR . 'rss_parse.inc' );
require_once( MAGPIE_DIR . 'rss_cache.inc' );

// for including 3rd party libraries
define('MAGPIE_EXTLIB', MAGPIE_DIR . 'extlib' . DIR_SEP);
require_once( MAGPIE_EXTLIB . 'Snoopy.class.inc');
36
37
38 /*
39 * CONSTANTS - redefine these in your script to change the
40 * behaviour of fetch_rss(). Currently, most options affect the cache
41 *
42 * MAGPIE_CACHE_ON - Should Magpie cache parsed RSS objects?
43 * For me a built in cache was essential to creating a "PHP-like"
44 * feel to Magpie, see rss_cache.inc for rationale
45 *
46 *
47 * MAGPIE_CACHE_DIR - Where should Magpie cache parsed RSS objects?
48 * This should be a location that the webserver can write to. If this
49 * directory does not already exist Magpie will try to be smart and create
50 * it. This will often fail for permissions reasons.
51 *
52 *
53 * MAGPIE_CACHE_AGE - How long to store cached RSS objects? In seconds.
54 *
55 *
56 * MAGPIE_CACHE_FRESH_ONLY - If remote fetch fails, throw error
57 * instead of returning stale object?
58 *
59 * MAGPIE_DEBUG - Display debugging notices?
60 *
61 */
62
63
64 /*=======================================================================*\
65 Function: fetch_rss:
66 Purpose: return RSS object for the given url
67 maintain the cache
68 Input: url of RSS file
69 Output: parsed RSS object (see rss_parse.inc)
70
71 NOTES ON CACHEING:
72 If caching is on (MAGPIE_CACHE_ON) fetch_rss will first check the cache.
73
74 NOTES ON RETRIEVING REMOTE FILES:
75 If conditional gets are on (MAGPIE_CONDITIONAL_GET_ON) fetch_rss will
76 return a cached object, and touch the cache object upon receiving a
77 304.
78
79 NOTES ON FAILED REQUESTS:
80 If there is an HTTP error while fetching an RSS object, the cached
81 version will be returned, if it exists (and if MAGPIE_CACHE_FRESH_ONLY is off)
82 \*=======================================================================*/
83
// Library version; init() embeds it in the default User-Agent string.
define('MAGPIE_VERSION', '0.72');

// Last error message recorded by error(); read it through magpie_error().
$MAGPIE_ERROR = '';
87
/**
 * Return the parsed RSS object for $url, or false on failure.
 *
 * With MAGPIE_CACHE_ON false the feed is fetched and parsed on every call.
 * Otherwise:
 *   1. a fresh cache entry (HIT) is returned as-is, flagged with from_cache;
 *   2. a STALE entry is revalidated with a conditional GET when it carries
 *      both an ETag and a Last-Modified value;
 *   3. a 304 response re-stores and returns the cached object;
 *   4. a successful response is parsed, cached and returned;
 *   5. if the fetch or the parse fails, the stale object is returned unless
 *      MAGPIE_CACHE_FRESH_ONLY is true, in which case the failure is
 *      reported and false is returned.
 *
 * Failures are reported through error(), so the message is also available
 * from magpie_error().
 *
 * Input:  url of RSS file
 * Output: parsed RSS object (see rss_parse.inc) or false
 */
function fetch_rss ($url) {
    // initialize constants
    init();

    if ( !isset($url) || $url === '' ) {
        error("fetch_rss called without a url");
        return false;
    }

    // if cache is disabled: fetch file, and parse it
    if ( !MAGPIE_CACHE_ON ) {
        $resp = _fetch_remote_file( $url );
        if ( is_success( $resp->status ) ) {
            return _response_to_rss( $resp );
        }

        error("Failed to fetch $url and cache is off");
        return false;
    }

    // else cache is ON
    $cache = new RSSCache( MAGPIE_CACHE_DIR, MAGPIE_CACHE_AGE );

    if ( MAGPIE_DEBUG and $cache->ERROR ) {
        debug($cache->ERROR, E_USER_WARNING);
    }

    // A string default plus strict comparisons below: with the old integer
    // default, 0 == 'HIT' evaluated to true on PHP < 8 whenever the cache
    // reported an error.
    $cache_status    = '';       // response of check_cache
    $request_headers = array();  // HTTP headers to send with fetch
    $rss             = false;    // cached (possibly stale) RSS object
    $errormsg        = '';       // errors, if any

    // store parsed XML by desired output encoding
    // as character munging happens at parse time
    $cache_key = $url . MAGPIE_OUTPUT_ENCODING;

    if ( !$cache->ERROR ) {
        // return cache HIT, MISS, or STALE
        $cache_status = $cache->check_cache( $cache_key );
    }

    // if object cached, and cache is fresh, return cached obj
    if ( $cache_status === 'HIT' ) {
        $rss = $cache->get( $cache_key );
        if ( $rss ) {
            // should be cache age
            $rss->from_cache = 1;
            if ( MAGPIE_DEBUG > 1 ) {
                debug("MagpieRSS: Cache HIT", E_USER_NOTICE);
            }
            return $rss;
        }
    }

    // else attempt a conditional get: set up the validator headers
    if ( $cache_status === 'STALE' ) {
        $rss = $cache->get( $cache_key );
        if ( $rss ) {
            // Trim the stored validators: values cached from raw response
            // header lines may still carry a trailing line ending, which
            // must not be copied into the outgoing request headers.
            $etag          = isset($rss->etag) ? trim($rss->etag) : '';
            $last_modified = isset($rss->last_modified) ? trim($rss->last_modified) : '';

            if ( $etag and $last_modified ) {
                $request_headers['If-None-Match']     = $etag;
                // The HTTP date validator is If-Modified-Since; the
                // previously sent "If-Last-Modified" is not an HTTP header.
                $request_headers['If-Modified-Since'] = $last_modified;
            }
        }
    }

    $resp = _fetch_remote_file( $url, $request_headers );

    if ( $resp ) {
        if ( $resp->status == '304' and $rss ) {
            // we have the most current copy
            if ( MAGPIE_DEBUG > 1 ) {
                debug("Got 304 for $url");
            }
            // reset cache on 304 (at minutillo insistent prodding)
            $cache->set( $cache_key, $rss );
            return $rss;
        }
        elseif ( is_success( $resp->status ) ) {
            // Parse into a separate variable so that a parse failure does
            // not discard the stale object kept in $rss as a fallback.
            $fresh = _response_to_rss( $resp );
            if ( $fresh ) {
                if ( MAGPIE_DEBUG > 1 ) {
                    debug("Fetch successful");
                }
                // add object to cache
                $cache->set( $cache_key, $fresh );
                return $fresh;
            }
            // parse failure: _response_to_rss() has already reported it
        }
        else {
            $errormsg = "Failed to fetch $url ";
            if ( $resp->status == '-100' ) {
                $errormsg .= "(Request timed out after " . MAGPIE_FETCH_TIME_OUT . " seconds)";
            }
            elseif ( $resp->error ) {
                # compensate for Snoopy's annoying habit of tacking
                # on '\n'
                $http_error = substr($resp->error, 0, -2);
                $errormsg .= "(HTTP Error: $http_error)";
            }
            else {
                $errormsg .= "(HTTP Response: " . $resp->response_code .')';
            }
        }
    }
    else {
        $errormsg = "Unable to retrieve RSS file for unknown reasons.";
    }

    // else fetch failed

    // attempt to return cached object, unless the caller asked for fresh
    // objects only
    if ( $rss and !MAGPIE_CACHE_FRESH_ONLY ) {
        if ( MAGPIE_DEBUG ) {
            debug("Returning STALE object for $url");
        }
        return $rss;
    }

    // else we totally failed
    error( $errormsg );

    return false;
} // end fetch_rss()
222
/*=======================================================================*\
    Function:   error
    Purpose:    record a message in $MAGPIE_ERROR and raise it as a PHP error
    Input:      the message text
                the error level passed to trigger_error (optional,
                defaults to E_USER_WARNING)
    Output:     none; an empty message is ignored entirely
\*=======================================================================*/

function error ($errormsg, $lvl=E_USER_WARNING) {
    global $MAGPIE_ERROR;

    // append PHP's own error message when one is set in this scope
    if ( isset($php_errormsg) ) {
        $errormsg = $errormsg . " ($php_errormsg)";
    }

    // nothing to report
    if ( !$errormsg ) {
        return;
    }

    // the stored message and the triggered message are the same string
    $MAGPIE_ERROR = "MagpieRSS: $errormsg";
    trigger_error( $MAGPIE_ERROR, $lvl );
}
241
/*=======================================================================*\
    Function:   debug
    Purpose:    raise a debugging message as a PHP error
    Input:      the message text
                the error level passed to trigger_error (optional,
                defaults to E_USER_NOTICE)
\*=======================================================================*/
function debug ($debugmsg, $lvl=E_USER_NOTICE) {
    $message = 'MagpieRSS [debug] ' . $debugmsg;
    trigger_error($message, $lvl);
}
245
/*=======================================================================*\
    Function:   magpie_error
    Purpose:    accessor for the magpie error variable
    Input:      a replacement message (optional); empty values leave the
                stored message untouched
    Output:     the current value of $MAGPIE_ERROR
\*=======================================================================*/
function magpie_error ($errormsg="") {
    global $MAGPIE_ERROR;

    // only a non-empty argument overwrites the stored message
    if ( !empty($errormsg) ) {
        $MAGPIE_ERROR = $errormsg;
    }

    return $MAGPIE_ERROR;
}
259
/*=======================================================================*\
    Function:   _fetch_remote_file
    Purpose:    retrieve an arbitrary remote file
    Input:      url of the remote file
                headers to send along with the request (optional);
                anything that is not an array is ignored
    Output:     the Snoopy client used for the request, which doubles as
                the HTTP response object (see Snoopy.class.inc)
\*=======================================================================*/
function _fetch_remote_file ($url, $headers = "" ) {
    // Snoopy is an HTTP client in PHP; configure it from the MAGPIE_* constants
    $snoopy = new Snoopy();
    $snoopy->agent        = MAGPIE_USER_AGENT;
    $snoopy->read_timeout = MAGPIE_FETCH_TIME_OUT;
    $snoopy->use_gzip     = MAGPIE_USE_GZIP;

    // extra request headers are only passed on when given as an array
    if ( is_array($headers) ) {
        $snoopy->rawheaders = $headers;
    }

    // PHP errors raised during the fetch are suppressed; callers inspect
    // the returned object's status and error fields instead
    @$snoopy->fetch($url);

    return $snoopy;
}
281
/*=======================================================================*\
    Function:   _response_to_rss
    Purpose:    parse an HTTP response object into an RSS object and copy
                the response's ETag / Last-Modified validators onto it
    Input:      an HTTP response object (see Snoopy); its results and
                headers fields are read
    Output:     parsed RSS object (see rss_parse), or false when parsing
                failed (the failure is reported through error())
\*=======================================================================*/
function _response_to_rss ($resp) {
    $rss = new MagpieRSS( $resp->results, MAGPIE_OUTPUT_ENCODING, MAGPIE_INPUT_ENCODING, MAGPIE_DETECT_ENCODING );

    // parse failure: construct error message and bail out
    if ( !$rss or $rss->ERROR ) {
        $errormsg = "Failed to parse RSS file.";

        if ( $rss ) {
            $errormsg .= " (" . $rss->ERROR . ")";
        }
        error($errormsg);

        return false;
    }

    // find Etag, and Last-Modified
    $headers = is_array($resp->headers) ? $resp->headers : array();
    foreach ( $headers as $h ) {
        // Lines without a field name (e.g. the status line) are skipped.
        $colon = strpos($h, ':');
        if ( $colon === false or $colon === 0 ) {
            continue;
        }

        // Header field names are case-insensitive, so "Etag:" and "etag:"
        // must match as well; whitespace around the value is optional.
        $field = strtolower(trim(substr($h, 0, $colon)));

        // Trim the value so the line ending of the raw header line is not
        // stored on the RSS object (and later echoed back in a request).
        $val = trim(substr($h, $colon + 1));

        if ( $field === 'etag' ) {
            $rss->etag = $val;
        }
        elseif ( $field === 'last-modified' ) {
            $rss->last_modified = $val;
        }
    }

    return $rss;
}
327
/*=======================================================================*\
    Function:   init
    Purpose:    setup constants with default values
                check for user overrides: a constant that the calling
                script has already defined is left untouched
    Notes:      runs once; later calls return immediately because
                MAGPIE_INITALIZED is already defined
\*=======================================================================*/
function init () {
    if ( defined('MAGPIE_INITALIZED') ) {
        return;
    }
    define('MAGPIE_INITALIZED', true);

    // simple defaults, applied in order
    $defaults = array(
        'MAGPIE_CACHE_ON'         => true,
        'MAGPIE_CACHE_DIR'        => './cache',
        'MAGPIE_CACHE_AGE'        => 60*60,         // one hour
        'MAGPIE_CACHE_FRESH_ONLY' => false,
        'MAGPIE_OUTPUT_ENCODING'  => 'ISO-8859-1',
        'MAGPIE_INPUT_ENCODING'   => null,
        'MAGPIE_DETECT_ENCODING'  => true,
        'MAGPIE_DEBUG'            => 0,
    );

    foreach ( $defaults as $name => $value ) {
        if ( !defined($name) ) {
            define($name, $value);
        }
    }

    // the default user agent advertises whether caching is in use, so it
    // has to be built after MAGPIE_CACHE_ON is settled
    if ( !defined('MAGPIE_USER_AGENT') ) {
        $closing = MAGPIE_CACHE_ON ? ')' : '; No cache)';
        $agent   = 'MagpieRSS/'. MAGPIE_VERSION . ' (+http://magpierss.sf.net' . $closing;

        if ( defined('MAGPIE_USER_AGENT_EXT') ) {
            $agent .= MAGPIE_USER_AGENT_EXT;
        }

        define('MAGPIE_USER_AGENT', $agent);
    }

    // 5 second timeout
    if ( !defined('MAGPIE_FETCH_TIME_OUT') ) {
        define('MAGPIE_FETCH_TIME_OUT', 5);
    }

    // use gzip encoding to fetch rss files if supported?
    if ( !defined('MAGPIE_USE_GZIP') ) {
        define('MAGPIE_USE_GZIP', true);
    }
}
399
400 // NOTE: the following code should really be in Snoopy, or at least
401 // somewhere other than rss_fetch!
402
403 /*=======================================================================*\
404 HTTP STATUS CODE PREDICATES
405 These functions attempt to classify an HTTP status code
406 based on RFC 2616 and RFC 2518.
407
408 All of them take an HTTP status code as input, and return true or false
409
410 All this code is adapted from LWP's HTTP::Status.
411 \*=======================================================================*/
412
413
/*=======================================================================*\
    Function:   is_info
    Purpose:    return true if Informational status code (100-199)
    Input:      an HTTP status code
\*=======================================================================*/
function is_info ($sc) {
    return 100 <= $sc && $sc < 200;
}
421
/*=======================================================================*\
    Function:   is_success
    Purpose:    return true if Successful status code (200-299)
    Input:      an HTTP status code
\*=======================================================================*/
function is_success ($sc) {
    return 200 <= $sc && $sc < 300;
}
429
/*=======================================================================*\
    Function:   is_redirect
    Purpose:    return true if Redirection status code (300-399)
    Input:      an HTTP status code
\*=======================================================================*/
function is_redirect ($sc) {
    return 300 <= $sc && $sc < 400;
}
437
/*=======================================================================*\
    Function:   is_error
    Purpose:    return true if Error status code, client or server (400-599)
    Input:      an HTTP status code
\*=======================================================================*/
function is_error ($sc) {
    return 400 <= $sc && $sc < 600;
}
445
/*=======================================================================*\
    Function:   is_client_error
    Purpose:    return true if Error status code, and it's a client
                error (400-499)
    Input:      an HTTP status code
\*=======================================================================*/
function is_client_error ($sc) {
    return 400 <= $sc && $sc < 500;
}
453
/*=======================================================================*\
    Function:   is_server_error
    Purpose:    return true if Error status code, and it's a server
                error (500-599)
    Input:      an HTTP status code
\*=======================================================================*/
function is_server_error ($sc) {
    return 500 <= $sc && $sc < 600;
}
461
462 ?>