]>
Commit | Line | Data |
---|---|---|
1 | <?php | |
2 | /* | |
3 | * Project: MagpieRSS: a simple RSS integration tool | |
4 | * File: rss_fetch.inc, a simple functional interface | |
5 | to fetching and parsing RSS files, via the | |
6 | function fetch_rss() | |
7 | * Author: Kellan Elliott-McCrea <kellan@protest.net> | |
8 | * License: GPL | |
9 | * | |
10 | * The latest version of MagpieRSS can be obtained from: | |
11 | * http://magpierss.sourceforge.net | |
12 | * | |
13 | * For questions, help, comments, discussion, etc., please join the | |
14 | * Magpie mailing list: | |
15 | * magpierss-general@lists.sourceforge.net | |
16 | * | |
17 | */ | |
18 | ||
// Set up MAGPIE_DIR for use on hosts that don't include
// the current path in include_path.
// with thanks to rajiv and smarty
if (!defined('DIR_SEP')) {
    define('DIR_SEP', DIRECTORY_SEPARATOR);
}

if (!defined('MAGPIE_DIR')) {
    define('MAGPIE_DIR', dirname(__FILE__) . DIR_SEP);
}

require_once( MAGPIE_DIR . 'rss_parse.inc' );
require_once( MAGPIE_DIR . 'rss_cache.inc' );

// Location of bundled 3rd party libraries. Guarded like the constants
// above so a caller may point it elsewhere and so a second inclusion of
// this file does not raise a "constant already defined" error.
if (!defined('MAGPIE_EXTLIB')) {
    define('MAGPIE_EXTLIB', MAGPIE_DIR . 'extlib' . DIR_SEP);
}
require_once( MAGPIE_EXTLIB . 'Snoopy.class.inc');
36 | ||
37 | ||
38 | /* | |
39 | * CONSTANTS - redefine these in your script to change the | |
40 | * behaviour of fetch_rss(). Currently, most options affect the cache | |
41 | * | |
42 | * MAGPIE_CACHE_ON - Should Magpie cache parsed RSS objects? | |
43 | * For me a built in cache was essential to creating a "PHP-like" | |
44 | * feel to Magpie, see rss_cache.inc for rationale | |
45 | * | |
46 | * | |
47 | * MAGPIE_CACHE_DIR - Where should Magpie cache parsed RSS objects? | |
48 | * This should be a location that the webserver can write to. If this | |
49 | * directory does not already exist Magpie will try to be smart and create | |
50 | * it. This will often fail for permissions reasons. | |
51 | * | |
52 | * | |
53 | * MAGPIE_CACHE_AGE - How long to store cached RSS objects? In seconds. | |
54 | * | |
55 | * | |
56 | * MAGPIE_CACHE_FRESH_ONLY - If remote fetch fails, throw error | |
57 | * instead of returning stale object? | |
58 | * | |
59 | * MAGPIE_DEBUG - Display debugging notices? | |
60 | * | |
61 | */ | |
62 | ||
63 | ||
64 | /*=======================================================================*\ | |
65 | Function: fetch_rss: | |
66 | Purpose: return RSS object for the given url | |
67 | maintain the cache | |
68 | Input: url of RSS file | |
69 | Output: parsed RSS object (see rss_parse.inc) | |
70 | ||
71 | NOTES ON CACHING: | |
72 | If caching is on (MAGPIE_CACHE_ON) fetch_rss will first check the cache. | |
73 | ||
74 | NOTES ON RETRIEVING REMOTE FILES: | |
75 | If conditional gets are on (MAGPIE_CONDITIONAL_GET_ON) fetch_rss will | |
76 | return a cached object, and touch the cache object upon receiving a | |
77 | 304. | |
78 | ||
79 | NOTES ON FAILED REQUESTS: | |
80 | If there is an HTTP error while fetching an RSS object, the cached | |
81 | version will be returned, if it exists (and if MAGPIE_CACHE_FRESH_ONLY is off) | |
82 | \*=======================================================================*/ | |
83 | ||
// Library version; it is embedded in the default User-Agent built by init().
// Guarded so that including this file a second time does not raise a
// "constant already defined" error.
if (!defined('MAGPIE_VERSION')) {
    define('MAGPIE_VERSION', '0.72');
}

// Last error message recorded by error(); read through magpie_error().
// Written via $GLOBALS so the variable lands in the global scope even when
// this file is included from inside a function (error() and magpie_error()
// both access it with `global`).
$GLOBALS['MAGPIE_ERROR'] = "";
87 | ||
/**
 * Fetch the feed at $url and return it as a parsed RSS object, using the
 * on-disk cache when MAGPIE_CACHE_ON is true.
 *
 * Flow when the cache is on:
 *   1. check the cache
 *   2. on a HIT return the cached object
 *   3. on a STALE entry send a conditional GET (ETag / Last-Modified)
 *   4. on a 304 refresh the cache entry and return the cached object
 *   5. on a 2xx parse the body, store it and return it
 *   6. otherwise return the stale object (unless MAGPIE_CACHE_FRESH_ONLY
 *      is set), or report the error and return false
 *
 * @param string $url url of the RSS file
 * @return mixed parsed RSS object (see rss_parse.inc), or false on failure;
 *               the failure text is recorded via error()
 */
function fetch_rss ($url) {
    // make sure every MAGPIE_* constant has a value before it is read
    init();

    if ( !isset($url) ) {
        error("fetch_rss called without a url");
        return false;
    }

    // cache disabled: fetch the file and parse it, nothing else to do
    if ( !MAGPIE_CACHE_ON ) {
        $resp = _fetch_remote_file( $url );
        if ( is_success( $resp->status ) ) {
            return _response_to_rss( $resp );
        }

        error("Failed to fetch $url and cache is off");
        return false;
    }

    // cache is ON
    $cache = new RSSCache( MAGPIE_CACHE_DIR, MAGPIE_CACHE_AGE );

    if (MAGPIE_DEBUG and $cache->ERROR) {
        debug($cache->ERROR, E_USER_WARNING);
    }

    $cache_status    = 'MISS';    // response of check_cache
    $request_headers = array();   // HTTP headers to send with fetch
    $rss             = 0;         // cached RSS object, if any
    $errormsg        = 0;         // errors, if any

    // store parsed XML by desired output encoding
    // as character munging happens at parse time
    $cache_key = $url . MAGPIE_OUTPUT_ENCODING;

    if (!$cache->ERROR) {
        // the comment in the original code names HIT, MISS and STALE as
        // the possible results
        $cache_status = $cache->check_cache( $cache_key );
    }

    // Strict comparison: on PHP < 8 a loose 0 == 'HIT' is true, which made
    // a cache in error state look like both a HIT and a STALE entry.
    if ( $cache_status === 'HIT' ) {
        $rss = $cache->get( $cache_key );
        if ( isset($rss) and $rss ) {
            // should be cache age
            $rss->from_cache = 1;
            if ( MAGPIE_DEBUG > 1) {
                debug("MagpieRSS: Cache HIT", E_USER_NOTICE);
            }
            return $rss;
        }
    }

    // else attempt a conditional get
    if ( $cache_status === 'STALE' ) {
        $rss = $cache->get( $cache_key );
        if ( $rss and $rss->etag and $rss->last_modified ) {
            $request_headers['If-None-Match'] = $rss->etag;
            // The validator header paired with Last-Modified is
            // If-Modified-Since; "If-Last-Modified" does not exist and
            // servers ignore it.
            $request_headers['If-Modified-Since'] = $rss->last_modified;
        }
    }

    $resp = _fetch_remote_file( $url, $request_headers );

    if (isset($resp) and $resp) {
        // A 304 is only usable when there is a cached copy to hand back;
        // without one it is treated as a failed fetch below.
        if ( $resp->status == '304' and $rss ) {
            // we have the most current copy
            if ( MAGPIE_DEBUG > 1) {
                debug("Got 304 for $url");
            }
            // reset cache on 304 (at minutillo insistent prodding)
            $cache->set($cache_key, $rss);
            return $rss;
        }
        elseif ( is_success( $resp->status ) ) {
            // Parse into a separate variable so that a parse failure does
            // not discard the stale cached object held in $rss.
            $fresh = _response_to_rss( $resp );
            if ( $fresh ) {
                if (MAGPIE_DEBUG > 1) {
                    debug("Fetch successful");
                }
                // add object to cache
                $cache->set( $cache_key, $fresh );
                return $fresh;
            }
            // the parse error has already been reported by _response_to_rss()
        }
        else {
            $errormsg = "Failed to fetch $url ";
            if ( $resp->status == '-100' ) {
                $errormsg .= "(Request timed out after " . MAGPIE_FETCH_TIME_OUT . " seconds)";
            }
            elseif ( $resp->error ) {
                // drop the last two characters: the original comment says
                // Snoopy tacks a '\n' onto its error strings
                $http_error = substr($resp->error, 0, -2);
                $errormsg .= "(HTTP Error: $http_error)";
            }
            else {
                $errormsg .= "(HTTP Response: " . $resp->response_code .')';
            }
        }
    }
    else {
        $errormsg = "Unable to retrieve RSS file for unknown reasons.";
    }

    // the fetch failed: fall back to the stale cached object, unless the
    // caller asked for fresh data only
    if ( $rss and !MAGPIE_CACHE_FRESH_ONLY ) {
        if ( MAGPIE_DEBUG ) {
            debug("Returning STALE object for $url");
        }
        return $rss;
    }

    // else we totally failed
    error( $errormsg );

    return false;
} // end fetch_rss()
222 | ||
223 | /*=======================================================================*\ | |
224 | Function: error | |
225 | Purpose: set MAGPIE_ERROR, and trigger error | |
226 | \*=======================================================================*/ | |
227 | ||
/**
 * Record an error in the global $MAGPIE_ERROR and raise it through
 * trigger_error(). An empty message is ignored.
 *
 * @param string $errormsg text of the error, without the "MagpieRSS: " prefix
 * @param int    $lvl      error level passed on to trigger_error()
 */
function error ($errormsg, $lvl=E_USER_WARNING) {
    global $MAGPIE_ERROR;

    // append PHP's error message if track_errors enabled
    // NOTE(review): $php_errormsg is looked up in this function's own scope,
    // so this branch looks unreachable - confirm before relying on it.
    if ( isset($php_errormsg) ) {
        $errormsg .= " ($php_errormsg)";
    }

    if ( !$errormsg ) {
        return;
    }

    $MAGPIE_ERROR = "MagpieRSS: $errormsg";
    trigger_error( $MAGPIE_ERROR, $lvl );
}
241 | ||
/**
 * Emit a debugging message through trigger_error().
 *
 * @param string $debugmsg text of the message
 * @param int    $lvl      error level passed on to trigger_error()
 */
function debug ($debugmsg, $lvl=E_USER_NOTICE) {
    $message = 'MagpieRSS [debug] ' . $debugmsg;
    trigger_error($message, $lvl);
}
245 | ||
246 | /*=======================================================================*\ | |
247 | Function: magpie_error | |
248 | Purpose: accessor for the magpie error variable | |
249 | \*=======================================================================*/ | |
/**
 * Accessor for the global Magpie error message.
 *
 * @param string $errormsg when non-empty, replaces the stored message first
 * @return string the message currently stored in $MAGPIE_ERROR
 */
function magpie_error ($errormsg="") {
    global $MAGPIE_ERROR;

    // an empty (or falsy) argument means "just read"
    if ( !empty($errormsg) ) {
        $MAGPIE_ERROR = $errormsg;
    }

    return $MAGPIE_ERROR;
}
259 | ||
260 | /*=======================================================================*\ | |
261 | Function: _fetch_remote_file | |
262 | Purpose: retrieve an arbitrary remote file | |
263 | Input: url of the remote file | |
264 | headers to send along with the request (optional) | |
265 | Output: an HTTP response object (see Snoopy.class.inc) | |
266 | \*=======================================================================*/ | |
/**
 * Retrieve an arbitrary remote file with the Snoopy HTTP client.
 *
 * @param string $url     url of the remote file
 * @param mixed  $headers array of extra request headers; any non-array
 *                        value (the default "") sends none
 * @return Snoopy the client object after the fetch, used by callers as the
 *                HTTP response (see Snoopy.class.inc)
 */
function _fetch_remote_file ($url, $headers = "" ) {
    $http = new Snoopy();

    // client settings all come from the MAGPIE_* constants
    $http->agent        = MAGPIE_USER_AGENT;
    $http->read_timeout = MAGPIE_FETCH_TIME_OUT;
    $http->use_gzip     = MAGPIE_USE_GZIP;

    if ( is_array($headers) ) {
        $http->rawheaders = $headers;
    }

    // PHP-level warnings from the fetch are suppressed; callers inspect
    // the returned object's status/error fields instead
    @$http->fetch($url);

    return $http;
}
281 | ||
282 | /*=======================================================================*\ | |
283 | Function: _response_to_rss | |
284 | Purpose: parse an HTTP response object into an RSS object | |
285 | Input: an HTTP response object (see Snoopy) | |
286 | Output: parsed RSS object (see rss_parse) | |
287 | \*=======================================================================*/ | |
/**
 * Parse an HTTP response object into an RSS object and copy the response's
 * cache validators (ETag, Last-Modified) onto it.
 *
 * @param object $resp an HTTP response object (see Snoopy); its ->results
 *                     body and ->headers list are read
 * @return mixed parsed RSS object (see rss_parse), or false when parsing
 *               failed (the failure is reported via error())
 */
function _response_to_rss ($resp) {
    $rss = new MagpieRSS( $resp->results, MAGPIE_OUTPUT_ENCODING, MAGPIE_INPUT_ENCODING, MAGPIE_DETECT_ENCODING );

    // parsing failed: build the error message and bail out
    if ( !$rss or $rss->ERROR ) {
        $errormsg = "Failed to parse RSS file.";

        if ($rss) {
            $errormsg .= " (" . $rss->ERROR . ")";
        }
        error($errormsg);

        return false;
    }

    // find ETag and Last-Modified
    $headers = isset($resp->headers) ? (array) $resp->headers : array();
    foreach ($headers as $h) {
        // split on the first colon only; lines without one (such as the
        // status line) carry no field and are skipped
        $parts = explode(':', $h, 2);
        if ( count($parts) < 2 ) {
            continue;
        }

        // HTTP field names are case-insensitive, and the raw header line
        // may still carry surrounding whitespace or its line terminator
        $field = strtolower(trim($parts[0]));
        $val   = trim($parts[1]);

        if ( $field === 'etag' ) {
            $rss->etag = $val;
        }
        elseif ( $field === 'last-modified' ) {
            $rss->last_modified = $val;
        }
    }

    return $rss;
}
327 | ||
328 | /*=======================================================================*\ | |
329 | Function: init | |
330 | Purpose: setup constants with default values | |
331 | check for user overrides | |
332 | \*=======================================================================*/ | |
/**
 * Give every MAGPIE_* constant its default value unless the calling script
 * has already defined it. Runs once; later calls return immediately.
 */
function init () {
    // the marker constant makes repeat calls a no-op
    if ( defined('MAGPIE_INITALIZED') ) {
        return;
    }
    define('MAGPIE_INITALIZED', true);

    // constants whose defaults do not depend on anything else
    $defaults = array(
        'MAGPIE_CACHE_ON'         => true,
        'MAGPIE_CACHE_DIR'        => './cache',
        'MAGPIE_CACHE_AGE'        => 60*60,        // one hour
        'MAGPIE_CACHE_FRESH_ONLY' => false,
        'MAGPIE_OUTPUT_ENCODING'  => 'ISO-8859-1',
        'MAGPIE_INPUT_ENCODING'   => null,
        'MAGPIE_DETECT_ENCODING'  => true,
        'MAGPIE_DEBUG'            => 0,
    );
    foreach ( $defaults as $name => $value ) {
        if ( !defined($name) ) {
            define($name, $value);
        }
    }

    // the default user agent advertises whether the cache is in use
    if ( !defined('MAGPIE_USER_AGENT') ) {
        $tail = MAGPIE_CACHE_ON ? ')' : '; No cache)';
        define('MAGPIE_USER_AGENT', 'MagpieRSS/'. MAGPIE_VERSION . ' (+http://magpierss.sf.net' . $tail);
    }

    if ( !defined('MAGPIE_FETCH_TIME_OUT') ) {
        define('MAGPIE_FETCH_TIME_OUT', 5);    // 5 second timeout
    }

    // use gzip encoding to fetch rss files if supported?
    if ( !defined('MAGPIE_USE_GZIP') ) {
        define('MAGPIE_USE_GZIP', true);
    }
}
395 | ||
396 | // NOTE: the following code should really be in Snoopy, or at least | |
397 | // somewhere other than rss_fetch! | |
398 | ||
399 | /*=======================================================================*\ | |
400 | HTTP STATUS CODE PREDICATES | |
401 | These functions attempt to classify an HTTP status code | |
402 | based on RFC 2616 and RFC 2518. | |
403 | ||
404 | All of them take an HTTP status code as input, and return true or false | |
405 | ||
406 | All this code is adapted from LWP's HTTP::Status. | |
407 | \*=======================================================================*/ | |
408 | ||
409 | ||
410 | /*=======================================================================*\ | |
411 | Function: is_info | |
412 | Purpose: return true if Informational status code | |
413 | \*=======================================================================*/ | |
/**
 * Is $sc an Informational (1xx) HTTP status code?
 *
 * @param mixed $sc HTTP status code
 * @return bool true when 100 <= $sc < 200
 */
function is_info ($sc) {
    return (100 <= $sc) && (200 > $sc);
}
417 | ||
418 | /*=======================================================================*\ | |
419 | Function: is_success | |
420 | Purpose: return true if Successful status code | |
421 | \*=======================================================================*/ | |
/**
 * Is $sc a Successful (2xx) HTTP status code?
 *
 * @param mixed $sc HTTP status code
 * @return bool true when 200 <= $sc < 300
 */
function is_success ($sc) {
    return (200 <= $sc) && (300 > $sc);
}
425 | ||
426 | /*=======================================================================*\ | |
427 | Function: is_redirect | |
428 | Purpose: return true if Redirection status code | |
429 | \*=======================================================================*/ | |
/**
 * Is $sc a Redirection (3xx) HTTP status code?
 *
 * @param mixed $sc HTTP status code
 * @return bool true when 300 <= $sc < 400
 */
function is_redirect ($sc) {
    return (300 <= $sc) && (400 > $sc);
}
433 | ||
434 | /*=======================================================================*\ | |
435 | Function: is_error | |
436 | Purpose: return true if Error status code | |
437 | \*=======================================================================*/ | |
/**
 * Is $sc an Error (4xx or 5xx) HTTP status code?
 *
 * @param mixed $sc HTTP status code
 * @return bool true when 400 <= $sc < 600
 */
function is_error ($sc) {
    return (400 <= $sc) && (600 > $sc);
}
441 | ||
442 | /*=======================================================================*\ | |
443 | Function: is_client_error | |
444 | Purpose: return true if Error status code, and its a client error | |
445 | \*=======================================================================*/ | |
/**
 * Is $sc a client Error (4xx) HTTP status code?
 *
 * @param mixed $sc HTTP status code
 * @return bool true when 400 <= $sc < 500
 */
function is_client_error ($sc) {
    return (400 <= $sc) && (500 > $sc);
}
449 | ||
450 | /*=======================================================================*\ | |
451 | Function: is_server_error | |
452 | Purpose: return true if Error status code, and its a server error | |
453 | \*=======================================================================*/ | |
/**
 * Is $sc a server Error (5xx) HTTP status code?
 *
 * @param mixed $sc HTTP status code
 * @return bool true when 500 <= $sc < 600
 */
function is_server_error ($sc) {
    return (500 <= $sc) && (600 > $sc);
}
457 | ||
458 | ?> |