X-Git-Url: https://git.wh0rd.org/?a=blobdiff_plain;f=include%2Frssfuncs.php;h=3e0decb4193670b583cd731906959ae629f04a38;hb=e8291805dd99e0f6047edbb1ea162c1f3af3d0ee;hp=51b7d7c33117ee33b09f5fd1db977e5a1a3b9744;hpb=61c1812f29dc76e93107b65dea80fde260269e8d;p=tt-rss.git diff --git a/include/rssfuncs.php b/include/rssfuncs.php index 51b7d7c3..3e0decb4 100644 --- a/include/rssfuncs.php +++ b/include/rssfuncs.php @@ -1,7 +1,7 @@ 0 AND ttrss_feeds.last_updated < NOW() - CAST((ttrss_feeds.update_interval || ' minutes') AS INTERVAL) - ) OR ttrss_feeds.last_updated IS NULL)"; + ) OR ttrss_feeds.last_updated IS NULL + OR last_updated = '1970-01-01 00:00:00')"; } else { $update_limit_qpart = "AND (( ttrss_feeds.update_interval = 0 @@ -89,7 +90,8 @@ ) OR ( ttrss_feeds.update_interval > 0 AND ttrss_feeds.last_updated < DATE_SUB(NOW(), INTERVAL ttrss_feeds.update_interval MINUTE) - ) OR ttrss_feeds.last_updated IS NULL)"; + ) OR ttrss_feeds.last_updated IS NULL + OR last_updated = '1970-01-01 00:00:00')"; } // Test if feed is currently being updated by another process. @@ -151,90 +153,17 @@ } // Send feed digests by email if needed. - send_headlines_digests($link, 100, $debug); + send_headlines_digests($link, $debug); } // function update_daemon_common - function fetch_twitter_rss($link, $url, $owner_uid) { - - require_once 'lib/tmhoauth/tmhOAuth.php'; - require_once "lib/magpierss/rss_fetch.inc"; - require_once 'lib/magpierss/rss_utils.inc'; - - $result = db_query($link, "SELECT twitter_oauth FROM ttrss_users - WHERE id = $owner_uid"); - - $access_token = json_decode(db_fetch_result($result, 0, 'twitter_oauth'), true); - $url_escaped = db_escape_string($url); - - if ($access_token) { - - $tmhOAuth = new tmhOAuth(array( - 'consumer_key' => CONSUMER_KEY, - 'consumer_secret' => CONSUMER_SECRET, - 'user_token' => $access_token['oauth_token'], - 'user_secret' => $access_token['oauth_token_secret'], - )); - - $code = $tmhOAuth->request('GET', $url, - convertUrlQuery(parse_url($url, PHP_URL_QUERY))); - - if ($code == 200) { - - $content = $tmhOAuth->response['response']; - - define('MAGPIE_CACHE_ON', false); - - $rss = new MagpieRSS($content, MAGPIE_OUTPUT_ENCODING, - MAGPIE_INPUT_ENCODING, MAGPIE_DETECT_ENCODING ); - - return $rss; - - } else { - - db_query($link, "UPDATE ttrss_feeds - SET last_error = 'OAuth authorization failed ($code).' - WHERE feed_url = '$url_escaped' AND owner_uid = $owner_uid"); - } - - } else { - - db_query($link, "UPDATE ttrss_feeds - SET last_error = 'OAuth information not found.' - WHERE feed_url = '$url_escaped' AND owner_uid = $owner_uid"); - - return false; - } - } - - function update_rss_feed($link, $feed, $ignore_daemon = false, $no_cache = false) { - - global $memcache; - - /* Update all feeds with the same URL to utilize memcache */ - - if ($memcache) { - $result = db_query($link, "SELECT f1.id - FROM ttrss_feeds AS f1, ttrss_feeds AS f2 - WHERE f2.feed_url = f1.feed_url AND f2.id = '$feed'"); - - while ($line = db_fetch_assoc($result)) { - update_rss_feed_real($link, $line["id"], $ignore_daemon, $no_cache); - } - } else { - update_rss_feed_real($link, $feed, $ignore_daemon, $no_cache); - } - } - - function update_rss_feed_real($link, $feed, $ignore_daemon = false, $no_cache = false, + function update_rss_feed($link, $feed, $ignore_daemon = false, $no_cache = false, $override_url = false) { require_once "lib/simplepie/simplepie.inc"; require_once "lib/magpierss/rss_fetch.inc"; require_once 'lib/magpierss/rss_utils.inc'; - global $memcache; - $debug_enabled = defined('DAEMON_EXTENDED_DEBUG') || $_REQUEST['xdebug']; if (!$_REQUEST["daemon"] && !$ignore_daemon) { @@ -254,7 +183,7 @@ } $result = db_query($link, "SELECT id,update_interval,auth_login, - auth_pass,cache_images,update_method + auth_pass,cache_images,update_method,last_updated FROM ttrss_feeds WHERE id = '$feed' AND $updstart_thresh_qpart"); } else { @@ -310,12 +239,9 @@ $auth_pass = urlencode($auth_pass); } - $update_interval = db_fetch_result($result, 0, "update_interval"); $cache_images = sql_bool_to_bool(db_fetch_result($result, 0, "cache_images")); $fetch_url = db_fetch_result($result, 0, "feed_url"); - if ($update_interval < 0) { return false; } - $feed = db_escape_string($feed); if ($auth_login && $auth_pass ){ @@ -335,58 +261,45 @@ _debug("update_rss_feed: fetching [$fetch_url]..."); } - $obj_id = md5("FDATA:$use_simplepie:$fetch_url"); - - if ($memcache && $obj = $memcache->get($obj_id)) { + // Ignore cache if new feed or manual update. + $cache_age = (is_null($last_updated) || $last_updated == '1970-01-01 00:00:00') ? + -1 : get_feed_update_interval($link, $feed) * 60; - if ($debug_enabled) { - _debug("update_rss_feed: data found in memcache."); - } + if ($update_method == 1) { - $rss = $obj; + define('MAGPIE_CACHE_AGE', $cache_age); + define('MAGPIE_CACHE_ON', !$no_cache); + define('MAGPIE_FETCH_TIME_OUT', $no_cache ? 15 : 60); + define('MAGPIE_CACHE_DIR', CACHE_DIR . "/magpie"); + $rss = @fetch_rss($fetch_url); } else { + $simplepie_cache_dir = CACHE_DIR . "/simplepie"; - if ($update_method == 3) { - $rss = fetch_twitter_rss($link, $fetch_url, $owner_uid); - } else if ($update_method == 1) { - - define('MAGPIE_CACHE_AGE', get_feed_update_interval($link, $feed) * 60); - define('MAGPIE_CACHE_ON', !$no_cache); - define('MAGPIE_FETCH_TIME_OUT', 60); - define('MAGPIE_CACHE_DIR', CACHE_DIR . "/magpie"); - - $rss = @fetch_rss($fetch_url); - } else { - $simplepie_cache_dir = CACHE_DIR . "/simplepie"; - - if (!is_dir($simplepie_cache_dir)) { - mkdir($simplepie_cache_dir); - } - - $rss = new SimplePie(); - $rss->set_useragent(SELF_USER_AGENT); - # $rss->set_timeout(10); - $rss->set_feed_url($fetch_url); - $rss->set_output_encoding('UTF-8'); - //$rss->force_feed(true); + if (!is_dir($simplepie_cache_dir)) { + mkdir($simplepie_cache_dir); + } - if ($debug_enabled) { - _debug("feed update interval (sec): " . - get_feed_update_interval($link, $feed)*60); - } + $rss = new SimplePie(); + $rss->set_useragent(SELF_USER_AGENT); + $rss->set_timeout($no_cache ? 15 : 60); + $rss->set_feed_url($fetch_url); + $rss->set_output_encoding('UTF-8'); + //$rss->force_feed(true); - $rss->enable_cache(!$no_cache); + if ($debug_enabled) { + _debug("feed update interval (sec): " . + get_feed_update_interval($link, $feed)*60); + } - if (!$no_cache) { - $rss->set_cache_location($simplepie_cache_dir); - $rss->set_cache_duration(get_feed_update_interval($link, $feed) * 60); - } + $rss->enable_cache(!$no_cache); - $rss->init(); + if (!$no_cache) { + $rss->set_cache_location($simplepie_cache_dir); + $rss->set_cache_duration($cache_age); } - if ($memcache && $rss) $memcache->add($obj_id, $rss, 0, 300); + $rss->init(); } // print_r($rss); @@ -411,12 +324,22 @@ // db_query($link, "BEGIN"); - $result = db_query($link, "SELECT title,icon_url,site_url,owner_uid + if (DB_TYPE == "pgsql") { + $favicon_interval_qpart = "favicon_last_checked < NOW() - INTERVAL '12 hour'"; + } else { + $favicon_interval_qpart = "favicon_last_checked < DATE_SUB(NOW(), INTERVAL 12 HOUR)"; + } + + $result = db_query($link, "SELECT title,icon_url,site_url,owner_uid, + (favicon_last_checked IS NULL OR $favicon_interval_qpart) AS + favicon_needs_check FROM ttrss_feeds WHERE id = '$feed'"); $registered_title = db_fetch_result($result, 0, "title"); $orig_icon_url = db_fetch_result($result, 0, "icon_url"); $orig_site_url = db_fetch_result($result, 0, "site_url"); + $favicon_needs_check = sql_bool_to_bool(db_fetch_result($result, 0, + "favicon_needs_check")); $owner_uid = db_fetch_result($result, 0, "owner_uid"); @@ -438,7 +361,12 @@ _debug("update_rss_feed: checking favicon..."); } - check_feed_favicon($site_url, $feed, $link); + if ($favicon_needs_check) { + check_feed_favicon($site_url, $feed, $link); + + db_query($link, "UPDATE ttrss_feeds SET favicon_last_checked = NOW() + WHERE id = '$feed'"); + } if (!$registered_title || $registered_title == "[Unknown]") { @@ -477,14 +405,16 @@ } if ($debug_enabled) { - _debug("update_rss_feed: loading filters..."); + _debug("update_rss_feed: loading filters & labels..."); } $filters = load_filters($link, $feed, $owner_uid); + $labels = get_all_labels($link, $owner_uid); -// if ($debug_enabled) { -// print_r($filters); -// } + if ($debug_enabled) { + //print_r($filters); + _debug("update_rss_feed: " . count($filters) . " filters loaded."); + } if ($use_simplepie) { $iterator = $rss->get_items(); @@ -575,7 +505,6 @@ } foreach ($iterator as $item) { - if ($_REQUEST['xdebug'] == 2) { print_r($item); } @@ -662,7 +591,7 @@ $entry_content = $item["content:escaped"]; if (!$entry_content) $entry_content = $item["content:encoded"]; - if (!$entry_content) $entry_content = $item["content"]["encoded"]; + if (!$entry_content && is_array($entry_content)) $entry_content = $item["content"]["encoded"]; if (!$entry_content) $entry_content = $item["content"]; if (is_array($entry_content)) $entry_content = $entry_content[0]; @@ -837,15 +766,10 @@ $entry_tags[$i] = mb_strtolower($entry_tags[$i], 'utf-8'); if ($debug_enabled) { - _debug("update_rss_feed: unfiltered tags found:"); - print_r($entry_tags); + //_debug("update_rss_feed: unfiltered tags found:"); + //print_r($entry_tags); } - # sanitize content - - $entry_content = sanitize_article_content($entry_content); - $entry_title = sanitize_article_content($entry_title); - if ($debug_enabled) { _debug("update_rss_feed: done collecting data [TITLE:$entry_title]"); } @@ -887,6 +811,9 @@ '$entry_comments', '$num_comments', '$entry_author')"); + + $article_labels = array(); + } else { // we keep encountering the entry in feeds, so we need to // update date_updated column so that we don't get horrible @@ -897,6 +824,8 @@ db_query($link, "UPDATE ttrss_entries SET date_updated = NOW() WHERE id = '$base_entry_id'"); + + $article_labels = get_article_labels($link, $base_entry_id, $owner_uid); } // now it should exist, if not - bad luck then @@ -941,7 +870,7 @@ /* Collect article tags here so we could filter by them: */ $article_filters = get_article_filters($filters, $entry_title, - $entry_content, $entry_link, $entry_timestamp, $entry_author, + strip_tags($entry_content), $entry_link, $entry_timestamp, $entry_author, $entry_tags); if ($debug_enabled) { @@ -997,6 +926,27 @@ $published = 'false'; } + // N-grams + + if (DB_TYPE == "pgsql" and defined('_NGRAM_TITLE_DUPLICATE_THRESHOLD')) { + + $result = db_query($link, "SELECT COUNT(*) AS similar FROM + ttrss_entries,ttrss_user_entries + WHERE ref_id = id AND updated >= NOW() - INTERVAL '7 day' + AND similarity(title, '$entry_title') >= "._NGRAM_TITLE_DUPLICATE_THRESHOLD." + AND owner_uid = $owner_uid"); + + $ngram_similar = db_fetch_result($result, 0, "similar"); + + if ($debug_enabled) { + _debug("update_rss_feed: N-gram similar results: $ngram_similar"); + } + + if ($ngram_similar > 0) { + $unread = 'false'; + } + } + $result = db_query($link, "INSERT INTO ttrss_user_entries (ref_id, owner_uid, feed_id, unread, last_read, marked, @@ -1089,8 +1039,8 @@ _debug("update_rss_feed: assigning labels..."); } - assign_article_to_labels($link, $entry_ref_id, $article_filters, - $owner_uid); + assign_article_to_label_filters($link, $entry_ref_id, $article_filters, + $owner_uid, $article_labels); if ($debug_enabled) { _debug("update_rss_feed: looking for enclosures..."); @@ -1188,9 +1138,9 @@ // check for manual tags (we have to do it here since they're loaded from filters) foreach ($article_filters as $f) { - if ($f[0] == "tag") { + if ($f["type"] == "tag") { - $manual_tags = trim_array(explode(",", $f[1])); + $manual_tags = trim_array(explode(",", $f["param"])); foreach ($manual_tags as $tag) { if (tag_is_valid($tag)) { @@ -1263,6 +1213,22 @@ db_query($link, "COMMIT"); } + if (get_pref($link, "AUTO_ASSIGN_LABELS", $owner_uid, false)) { + if ($debug_enabled) { + _debug("update_rss_feed: auto-assigning labels..."); + } + + foreach ($labels as $label) { + $caption = $label["caption"]; + + if (preg_match("/\b$caption\b/i", "$tags_str " . strip_tags($entry_content) . " $entry_title")) { + if (!labels_contains_caption($article_labels, $caption)) { + label_add_article($link, $entry_ref_id, $caption, $owner_uid); + } + } + } + } + if ($debug_enabled) { _debug("update_rss_feed: article processed"); } @@ -1355,11 +1321,11 @@ $node = $doc->getElementsByTagName('body')->item(0); - return $doc->saveXML($node); + return $doc->saveXML($node, LIBXML_NOEMPTYTAG); } function expire_cached_files($debug) { - foreach (array("magpie", "simplepie", "images") as $dir) { + foreach (array("magpie", "simplepie", "images", "export") as $dir) { $cache_dir = CACHE_DIR . "/$dir"; if ($debug) _debug("Expiring $cache_dir"); @@ -1369,14 +1335,15 @@ if (is_writable($cache_dir)) { $files = glob("$cache_dir/*"); - foreach ($files as $file) { - if (time() - filemtime($file) > 86400*7) { - unlink($file); + if ($files) + foreach ($files as $file) { + if (time() - filemtime($file) > 86400*7) { + unlink($file); - ++$num_deleted; + ++$num_deleted; + } } } - } if ($debug) _debug("Removed $num_deleted files."); } @@ -1401,4 +1368,114 @@ return $params; } + + function get_article_filters($filters, $title, $content, $link, $timestamp, $author, $tags) { + $matches = array(); + + foreach ($filters as $filter) { + $match_any_rule = $filter["match_any_rule"]; + $filter_match = false; + + foreach ($filter["rules"] as $rule) { + $match = false; + $reg_exp = $rule["reg_exp"]; + + if (!$reg_exp) + continue; + + switch ($rule["type"]) { + case "title": + $match = @preg_match("/$reg_exp/i", $title); + break; + case "content": + $match = @preg_match("/$reg_exp/i", $content); + break; + case "both": + $match = (@preg_match("/$reg_exp/i", $title) || @preg_match("/$reg_exp/i", $title)); + break; + case "link": + $match = @preg_match("/$reg_exp/i", $link); + break; + case "author": + $match = @preg_match("/$reg_exp/i", $author); + break; + case "tag": + $tag_string = join(",", $tags); + $match = @preg_match("/$reg_exp/i", $tag_string); + break; + } + + if ($match_any_rule) { + if ($match) { + $filter_match = true; + break; + } + } else { + $filter_match = $match; + if (!$match) { + break; + } + } + } + + if ($filter_match) { + foreach ($filter["actions"] AS $action) { + array_push($matches, $action); + } + } + } + + return $matches; + } + + function find_article_filter($filters, $filter_name) { + foreach ($filters as $f) { + if ($f["type"] == $filter_name) { + return $f; + }; + } + return false; + } + + function find_article_filters($filters, $filter_name) { + $results = array(); + + foreach ($filters as $f) { + if ($f["type"] == $filter_name) { + array_push($results, $f); + }; + } + return $results; + } + + function calculate_article_score($filters) { + $score = 0; + + foreach ($filters as $f) { + if ($f["type"] == "score") { + $score += $f["param"]; + }; + } + return $score; + } + + function labels_contains_caption($labels, $caption) { + foreach ($labels as $label) { + if ($label[1] == $caption) { + return true; + } + } + + return false; + } + + function assign_article_to_label_filters($link, $id, $filters, $owner_uid, $article_labels) { + foreach ($filters as $f) { + if ($f["type"] == "label") { + if (!labels_contains_caption($article_labels, $f["param"])) { + label_add_article($link, $id, $f["param"], $owner_uid); + } + } + } + } ?>