]> git.wh0rd.org - tt-rss.git/blobdiff - include/rssfuncs.php
implement ttrss_feeds.cache_content
[tt-rss.git] / include / rssfuncs.php
index 925983657ff394125c759abbc8bb56a5e4668c56..b26495f67623671833c48804328a4b0f21337d4e 100644 (file)
@@ -1,7 +1,7 @@
 <?php
        define('DAEMON_UPDATE_LOGIN_LIMIT', 30);
        define('DAEMON_FEED_LIMIT', 100);
-       define('DAEMON_SLEEP_INTERVAL', 120);
+       define('DAEMON_SLEEP_INTERVAL', 60);
 
        function update_feedbrowser_cache($link) {
 
 
        } // function update_daemon_common
 
+       // NOTE: the $ignore_daemon parameter is no longer used by update_rss_feed()
        function update_rss_feed($link, $feed, $ignore_daemon = false, $no_cache = false,
                $override_url = false) {
 
 
                $debug_enabled = defined('DAEMON_EXTENDED_DEBUG') || $_REQUEST['xdebug'];
 
-               if (!$_REQUEST["daemon"] && !$ignore_daemon) {
-                       return false;
-               }
-
                if ($debug_enabled) {
                        _debug("update_rss_feed: start");
                }
 
-               if (!$ignore_daemon) {
-
-                       if (DB_TYPE == "pgsql") {
-                                       $updstart_thresh_qpart = "(ttrss_feeds.last_update_started IS NULL OR ttrss_feeds.last_update_started < NOW() - INTERVAL '120 seconds')";
-                               } else {
-                                       $updstart_thresh_qpart = "(ttrss_feeds.last_update_started IS NULL OR ttrss_feeds.last_update_started < DATE_SUB(NOW(), INTERVAL 120 SECOND))";
-                               }
-
-                       $result = db_query($link, "SELECT id,update_interval,auth_login,
-                               auth_pass,cache_images,update_method,last_updated
-                               FROM ttrss_feeds WHERE id = '$feed' AND $updstart_thresh_qpart");
-
-               } else {
-
-                       $result = db_query($link, "SELECT id,update_interval,auth_login,
-                               feed_url,auth_pass,cache_images,update_method,last_updated,
-                               mark_unread_on_update, owner_uid, update_on_checksum_change,
-                               pubsub_state
-                               FROM ttrss_feeds WHERE id = '$feed'");
-
-               }
+               $result = db_query($link, "SELECT id,update_interval,auth_login,
+                       feed_url,auth_pass,cache_images,update_method,last_updated,cache_content,
+                       mark_unread_on_update, owner_uid, update_on_checksum_change,
+                       pubsub_state
+                       FROM ttrss_feeds WHERE id = '$feed'");
 
                if (db_num_rows($result) == 0) {
                        if ($debug_enabled) {
                }
 
                $cache_images = sql_bool_to_bool(db_fetch_result($result, 0, "cache_images"));
+               $cache_content = sql_bool_to_bool(db_fetch_result($result, 0, "cache_content"));
                $fetch_url = db_fetch_result($result, 0, "feed_url");
 
                $feed = db_escape_string($feed);
 
                        define('MAGPIE_CACHE_AGE', $cache_age);
                        define('MAGPIE_CACHE_ON', !$no_cache);
-                       define('MAGPIE_FETCH_TIME_OUT', 60);
+                       define('MAGPIE_FETCH_TIME_OUT', $no_cache ? 15 : 60);
                        define('MAGPIE_CACHE_DIR', CACHE_DIR . "/magpie");
 
                        $rss = @fetch_rss($fetch_url);
 
                        $rss = new SimplePie();
                        $rss->set_useragent(SELF_USER_AGENT);
-#                      $rss->set_timeout(10);
+                       $rss->set_timeout($no_cache ? 15 : 60);
                        $rss->set_feed_url($fetch_url);
                        $rss->set_output_encoding('UTF-8');
                        //$rss->force_feed(true);
                        }
 
                        if ($debug_enabled) {
-                               _debug("update_rss_feed: loading filters...");
+                               _debug("update_rss_feed: loading filters & labels...");
                        }
 
                        $filters = load_filters($link, $feed, $owner_uid);
+                       $labels = get_all_labels($link, $owner_uid);
 
-//                     if ($debug_enabled) {
-//                             print_r($filters);
-//                     }
+                       if ($debug_enabled) {
+                               //print_r($filters);
+                               _debug("update_rss_feed: " . count($filters) . " filters loaded.");
+                       }
 
                        if ($use_simplepie) {
                                $iterator = $rss->get_items();
                                }
 
                                $entry_content_unescaped = $entry_content;
+                               $entry_cached_content = "";
 
                                if ($use_simplepie) {
                                        $entry_comments = strip_tags($item->data["comments"]);
                                        //print_r($entry_tags);
                                }
 
-                               # sanitize content
-
-                               $entry_content = sanitize_article_content($entry_content);
-                               $entry_title = sanitize_article_content($entry_title);
-
                                if ($debug_enabled) {
                                        _debug("update_rss_feed: done collecting data [TITLE:$entry_title]");
                                }
                                                _debug("update_rss_feed: base guid not found");
                                        }
 
+                                       if ($cache_content) {
+                                               if ($debug_enabled) {
+                                                       _debug("update_rss_feed: caching content...");
+                                               }
+
+                                               $entry_cached_content = cache_content($link, $entry_link, $auth_login, $auth_pass);
+
+                                               if ($cache_images && is_writable(CACHE_DIR . '/images'))
+                                                       $entry_cached_content = cache_images($entry_cached_content, $site_url, $debug_enabled);
+
+                                               $entry_cached_content = db_escape_string($entry_cached_content, false);
+
+                                       }
+
                                        // base post entry does not exist, create it
 
                                        $result = db_query($link,
                                                        updated,
                                                        content,
                                                        content_hash,
+                                                       cached_content,
                                                        no_orig_date,
                                                        date_updated,
                                                        date_entered,
                                                        '$entry_link',
                                                        '$entry_timestamp_fmt',
                                                        '$entry_content',
+                                                       '$entry_cached_content',
                                                        '$content_hash',
                                                        $no_orig_date,
                                                        NOW(),
                                                        '$entry_comments',
                                                        '$num_comments',
                                                        '$entry_author')");
+
+                                       $article_labels = array();
+
                                } else {
                                        // we keep encountering the entry in feeds, so we need to
                                        // update date_updated column so that we don't get horrible
 
                                        db_query($link, "UPDATE ttrss_entries SET date_updated = NOW()
                                                WHERE id = '$base_entry_id'");
+
+                                       $article_labels = get_article_labels($link, $base_entry_id, $owner_uid);
                                }
 
                                // now it should exist, if not - bad luck then
                                        if ($content_hash != $orig_content_hash) {
                                                $post_needs_update = true;
                                                $update_insignificant = false;
+
+                                               if ($cache_content) {
+                                                       if ($debug_enabled) {
+                                                               _debug("update_rss_feed: caching content because original checksum changed...");
+                                                       }
+
+                                                       $entry_cached_content = cache_content($link, $entry_link, $auth_login, $auth_pass);
+
+                                                       if ($cache_images && is_writable(CACHE_DIR . '/images'))
+                                                               $entry_cached_content = cache_images($entry_cached_content, $site_url, $debug_enabled);
+
+                                                       $entry_cached_content = db_escape_string($entry_cached_content, false);
+                                               }
                                        }
 
                                        if (db_escape_string($orig_title) != $entry_title) {
                                                db_query($link, "UPDATE ttrss_entries
                                                        SET title = '$entry_title', content = '$entry_content',
                                                                content_hash = '$content_hash',
+                                                               cached_content = '$entry_cached_content',
                                                                updated = '$entry_timestamp_fmt',
                                                                num_comments = '$num_comments'
                                                        WHERE id = '$ref_id'");
                                        _debug("update_rss_feed: assigning labels...");
                                }
 
-                               assign_article_to_labels($link, $entry_ref_id, $article_filters,
-                                       $owner_uid);
+                               assign_article_to_label_filters($link, $entry_ref_id, $article_filters,
+                                       $owner_uid, $article_labels);
 
                                if ($debug_enabled) {
                                        _debug("update_rss_feed: looking for enclosures...");
                                        db_query($link, "COMMIT");
                                }
 
+                               if (get_pref($link, "AUTO_ASSIGN_LABELS", $owner_uid, false)) {
+                                       if ($debug_enabled) {
+                                               _debug("update_rss_feed: auto-assigning labels...");
+                                       }
+
+                                       foreach ($labels as $label) {
+                                               $caption = $label["caption"];
+
+                                               if (preg_match("/\b$caption\b/i", "$tags_str " . strip_tags($entry_content) . " $entry_title")) {
+                                                       if (!labels_contains_caption($article_labels, $caption)) {
+                                                               label_add_article($link, $entry_ref_id, $caption, $owner_uid);
+                                                       }
+                                               }
+                                       }
+                               }
+
                                if ($debug_enabled) {
                                        _debug("update_rss_feed: article processed");
                                }
 
                return $params;
        }
+
+       /**
+        * Collect the actions of every filter whose rules match an article.
+        *
+        * Each filter is an array with the keys "match_any_rule", "rules" and
+        * "actions". Each rule has a "type" (title, content, both, link, author
+        * or tag) and a "reg_exp" that is matched case-insensitively against the
+        * corresponding article field. Rules with an empty reg_exp are skipped.
+        *
+        * When match_any_rule is set, the filter matches as soon as one rule
+        * matches; otherwise the first failing rule rejects the filter.
+        *
+        * @param array $filters filters to evaluate
+        * @param string $title article title
+        * @param string $content article content
+        * @param string $link article link
+        * @param mixed $timestamp not used by this function
+        * @param string $author article author
+        * @param array $tags article tags, joined with "," for "tag" rules
+        * @return array actions of all matching filters, in filter order
+        */
+       function get_article_filters($filters, $title, $content, $link, $timestamp, $author, $tags) {
+               $matches = array();
+
+               // content with line breaks and tabs removed, so that we don't need to
+               // deal with multiline regexps; built once, on first use
+               $flat_content = false;
+
+               foreach ($filters as $filter) {
+                       $match_any_rule = $filter["match_any_rule"];
+                       $filter_match = false;
+
+                       foreach ($filter["rules"] as $rule) {
+                               $match = false;
+                               $reg_exp = $rule["reg_exp"];
+
+                               if (!$reg_exp)
+                                       continue;
+
+                               // "/" is the pattern delimiter: a bare "/" in the rule (e.g. in a
+                               // URL) would make the pattern invalid and, with warnings
+                               // suppressed, the rule would silently never match. Escape every
+                               // "/" that is not already escaped; "\x" pairs are skipped as a unit.
+                               $escaped = preg_replace('#\G((?:[^\\\\/]|\\\\.)*)/#s', '$1\\/', $reg_exp);
+
+                               if ($escaped !== null)
+                                       $reg_exp = $escaped;
+
+                               $pattern = "/$reg_exp/i";
+                               $type = $rule["type"];
+
+                               if ($flat_content === false && ($type == "content" || $type == "both")) {
+                                       $flat_content = preg_replace("/[\r\n\t]/", "", $content);
+                               }
+
+                               // warnings stay suppressed: an invalid user-supplied expression
+                               // simply counts as "no match"
+                               switch ($type) {
+                               case "title":
+                                       $match = @preg_match($pattern, $title);
+                                       break;
+                               case "content":
+                                       $match = @preg_match($pattern, $flat_content);
+                                       break;
+                               case "both":
+                                       $match = (@preg_match($pattern, $title) || @preg_match($pattern, $flat_content));
+                                       break;
+                               case "link":
+                                       $match = @preg_match($pattern, $link);
+                                       break;
+                               case "author":
+                                       $match = @preg_match($pattern, $author);
+                                       break;
+                               case "tag":
+                                       $tag_string = join(",", $tags);
+                                       $match = @preg_match($pattern, $tag_string);
+                                       break;
+                               }
+
+                               if ($match_any_rule) {
+                                       // one matching rule is enough
+                                       if ($match) {
+                                               $filter_match = true;
+                                               break;
+                                       }
+                               } else {
+                                       // every rule has to match; stop at the first failure
+                                       $filter_match = $match;
+                                       if (!$match) {
+                                               break;
+                                       }
+                               }
+                       }
+
+                       if ($filter_match) {
+                               foreach ($filter["actions"] AS $action) {
+                                       array_push($matches, $action);
+                               }
+                       }
+               }
+
+               return $matches;
+       }
+
+       /**
+        * Return the first filter action whose "type" equals $filter_name.
+        *
+        * @param array $filters filter actions, each with a "type" key
+        * @param string $filter_name action type to look for
+        * @return mixed the first matching action, or false when there is none
+        */
+       function find_article_filter($filters, $filter_name) {
+               $found = false;
+
+               foreach ($filters as $candidate) {
+                       if ($candidate["type"] == $filter_name) {
+                               $found = $candidate;
+                               break;
+                       }
+               }
+
+               return $found;
+       }
+
+       /**
+        * Return every filter action whose "type" equals $filter_name.
+        *
+        * @param array $filters filter actions, each with a "type" key
+        * @param string $filter_name action type to look for
+        * @return array matching actions in their original order (may be empty)
+        */
+       function find_article_filters($filters, $filter_name) {
+               $selected = array();
+
+               foreach ($filters as $candidate) {
+                       if ($candidate["type"] != $filter_name)
+                               continue;
+
+                       $selected[] = $candidate;
+               }
+
+               return $selected;
+       }
+
+       /**
+        * Sum the "param" values of all filter actions of type "score".
+        *
+        * @param array $filters filter actions, each with "type" and "param" keys
+        * @return mixed total score; 0 when there are no "score" actions
+        */
+       function calculate_article_score($filters) {
+               $total = 0;
+
+               foreach ($filters as $action) {
+                       if ($action["type"] != "score")
+                               continue;
+
+                       $total = $total + $action["param"];
+               }
+
+               return $total;
+       }
+
+       /**
+        * Check whether a list of labels already contains the given caption.
+        *
+        * The caption is read from index 1 of each label entry. Captions are
+        * compared as strings: a loose == would compare numeric-looking captions
+        * by value ("1e3" == "1000", "0" == "0.0") and report a label as present
+        * when only a different, numerically equal caption is.
+        *
+        * @param array $labels label entries; index 1 of each entry is its caption
+        * @param string $caption caption to look for
+        * @return bool true when some entry has exactly this caption
+        */
+       function labels_contains_caption($labels, $caption) {
+               $wanted = (string) $caption;
+
+               foreach ($labels as $label) {
+                       if ((string) $label[1] === $wanted) {
+                               return true;
+                       }
+               }
+
+               return false;
+       }
+
+       /**
+        * Apply the "label" actions of matched filters to an article.
+        *
+        * For every action of type "label", its "param" is used as the label
+        * caption; label_add_article() is called unless $article_labels already
+        * contains that caption.
+        *
+        * @param mixed $link database link, passed through to label_add_article()
+        * @param mixed $id article id, passed through to label_add_article()
+        * @param array $filters filter actions, each with "type" and "param" keys
+        * @param mixed $owner_uid owner id, passed through to label_add_article()
+        * @param array $article_labels labels the article already has
+        */
+       function assign_article_to_label_filters($link, $id, $filters, $owner_uid, $article_labels) {
+               foreach ($filters as $action) {
+                       if ($action["type"] != "label")
+                               continue;
+
+                       if (labels_contains_caption($article_labels, $action["param"]))
+                               continue;
+
+                       label_add_article($link, $id, $action["param"], $owner_uid);
+               }
+       }
+
+       /**
+        * Fetch the page at $url and return its <body> element as markup.
+        *
+        * The page is retrieved with fetch_file_contents(), parsed as HTML and
+        * the first <body> element is serialized with saveXML() using
+        * LIBXML_NOEMPTYTAG.
+        *
+        * @param mixed $link not used by this function
+        * @param string $url address of the page to fetch
+        * @param string $login login passed to fetch_file_contents()
+        * @param string $pass password passed to fetch_file_contents()
+        * @return string serialized <body> element, or "" when nothing was
+        *         fetched or the document has no <body>
+        */
+       function cache_content($link, $url, $login, $pass) {
+
+               $content = fetch_file_contents($url, $login, $pass);
+
+               if (!$content)
+                       return "";
+
+               $doc = new DOMDocument();
+
+               // real-world pages are rarely valid markup; parser warnings are suppressed
+               @$doc->loadHTML($content);
+
+               $node = $doc->getElementsByTagName('body')->item(0);
+
+               if (!$node)
+                       return "";
+
+               return $doc->saveXML($node, LIBXML_NOEMPTYTAG);
+       }
 ?>