<?php
define('DAEMON_UPDATE_LOGIN_LIMIT', 30);
define('DAEMON_FEED_LIMIT', 100);
- define('DAEMON_SLEEP_INTERVAL', 120);
+ define('DAEMON_SLEEP_INTERVAL', 60);
function update_feedbrowser_cache($link) {
} // function update_daemon_common
- function fetch_twitter_rss($link, $url, $owner_uid) {
-
- require_once 'lib/tmhoauth/tmhOAuth.php';
- require_once "lib/magpierss/rss_fetch.inc";
- require_once 'lib/magpierss/rss_utils.inc';
-
- $result = db_query($link, "SELECT twitter_oauth FROM ttrss_users
- WHERE id = $owner_uid");
-
- $access_token = json_decode(db_fetch_result($result, 0, 'twitter_oauth'), true);
- $url_escaped = db_escape_string($url);
-
- if ($access_token) {
-
- $tmhOAuth = new tmhOAuth(array(
- 'consumer_key' => CONSUMER_KEY,
- 'consumer_secret' => CONSUMER_SECRET,
- 'user_token' => $access_token['oauth_token'],
- 'user_secret' => $access_token['oauth_token_secret'],
- ));
-
- $code = $tmhOAuth->request('GET', $url,
- convertUrlQuery(parse_url($url, PHP_URL_QUERY)));
-
- if ($code == 200) {
-
- $content = $tmhOAuth->response['response'];
-
- define('MAGPIE_CACHE_ON', false);
-
- $rss = new MagpieRSS($content, MAGPIE_OUTPUT_ENCODING,
- MAGPIE_INPUT_ENCODING, MAGPIE_DETECT_ENCODING );
-
- return $rss;
-
- } else {
-
- db_query($link, "UPDATE ttrss_feeds
- SET last_error = 'OAuth authorization failed ($code).'
- WHERE feed_url = '$url_escaped' AND owner_uid = $owner_uid");
- }
-
- } else {
-
- db_query($link, "UPDATE ttrss_feeds
- SET last_error = 'OAuth information not found.'
- WHERE feed_url = '$url_escaped' AND owner_uid = $owner_uid");
-
- return false;
- }
- }
-
- function update_rss_feed($link, $feed, $ignore_daemon = false, $no_cache = false) {
-
- global $memcache;
-
- /* Update all feeds with the same URL to utilize memcache */
-
- if ($memcache) {
- $result = db_query($link, "SELECT f1.id
- FROM ttrss_feeds AS f1, ttrss_feeds AS f2
- WHERE f2.feed_url = f1.feed_url AND f2.id = '$feed'");
-
- while ($line = db_fetch_assoc($result)) {
- update_rss_feed_real($link, $line["id"], $ignore_daemon, $no_cache);
- }
- } else {
- update_rss_feed_real($link, $feed, $ignore_daemon, $no_cache);
- }
- }
-
- function update_rss_feed_real($link, $feed, $ignore_daemon = false, $no_cache = false,
+ // NOTE: the $ignore_daemon parameter is not used by this function
+ function update_rss_feed($link, $feed, $ignore_daemon = false, $no_cache = false,
$override_url = false) {
require_once "lib/simplepie/simplepie.inc";
require_once "lib/magpierss/rss_fetch.inc";
require_once 'lib/magpierss/rss_utils.inc';
- global $memcache;
-
$debug_enabled = defined('DAEMON_EXTENDED_DEBUG') || $_REQUEST['xdebug'];
- if (!$_REQUEST["daemon"] && !$ignore_daemon) {
- return false;
- }
-
if ($debug_enabled) {
_debug("update_rss_feed: start");
}
- if (!$ignore_daemon) {
-
- if (DB_TYPE == "pgsql") {
- $updstart_thresh_qpart = "(ttrss_feeds.last_update_started IS NULL OR ttrss_feeds.last_update_started < NOW() - INTERVAL '120 seconds')";
- } else {
- $updstart_thresh_qpart = "(ttrss_feeds.last_update_started IS NULL OR ttrss_feeds.last_update_started < DATE_SUB(NOW(), INTERVAL 120 SECOND))";
- }
-
- $result = db_query($link, "SELECT id,update_interval,auth_login,
- auth_pass,cache_images,update_method,last_updated
- FROM ttrss_feeds WHERE id = '$feed' AND $updstart_thresh_qpart");
-
- } else {
-
- $result = db_query($link, "SELECT id,update_interval,auth_login,
- feed_url,auth_pass,cache_images,update_method,last_updated,
- mark_unread_on_update, owner_uid, update_on_checksum_change,
- pubsub_state
- FROM ttrss_feeds WHERE id = '$feed'");
-
- }
+ $result = db_query($link, "SELECT id,update_interval,auth_login,
+ feed_url,auth_pass,cache_images,update_method,last_updated,cache_content,
+ mark_unread_on_update, owner_uid, update_on_checksum_change,
+ pubsub_state
+ FROM ttrss_feeds WHERE id = '$feed'");
if (db_num_rows($result) == 0) {
if ($debug_enabled) {
}
$cache_images = sql_bool_to_bool(db_fetch_result($result, 0, "cache_images"));
+ $cache_content = sql_bool_to_bool(db_fetch_result($result, 0, "cache_content"));
$fetch_url = db_fetch_result($result, 0, "feed_url");
$feed = db_escape_string($feed);
_debug("update_rss_feed: fetching [$fetch_url]...");
}
- $obj_id = md5("FDATA:$use_simplepie:$fetch_url");
+ // Ignore cache if new feed or manual update.
+ $cache_age = (is_null($last_updated) || $last_updated == '1970-01-01 00:00:00') ?
+ -1 : get_feed_update_interval($link, $feed) * 60;
- if ($memcache && $obj = $memcache->get($obj_id)) {
-
- if ($debug_enabled) {
- _debug("update_rss_feed: data found in memcache.");
- }
+ if ($update_method == 1) {
- $rss = $obj;
+ define('MAGPIE_CACHE_AGE', $cache_age);
+ define('MAGPIE_CACHE_ON', !$no_cache);
+ define('MAGPIE_FETCH_TIME_OUT', $no_cache ? 15 : 60);
+ define('MAGPIE_CACHE_DIR', CACHE_DIR . "/magpie");
+ $rss = @fetch_rss($fetch_url);
} else {
+ $simplepie_cache_dir = CACHE_DIR . "/simplepie";
- // Ignore cache if new feed or manual update.
- $cache_age = (is_null($last_updated) || $last_updated == '1970-01-01 00:00:00') ?
- -1 : get_feed_update_interval($link, $feed) * 60;
-
- if ($update_method == 3) {
- $rss = fetch_twitter_rss($link, $fetch_url, $owner_uid);
- } else if ($update_method == 1) {
-
- define('MAGPIE_CACHE_AGE', $cache_age);
- define('MAGPIE_CACHE_ON', !$no_cache);
- define('MAGPIE_FETCH_TIME_OUT', 60);
- define('MAGPIE_CACHE_DIR', CACHE_DIR . "/magpie");
-
- $rss = @fetch_rss($fetch_url);
- } else {
- $simplepie_cache_dir = CACHE_DIR . "/simplepie";
-
- if (!is_dir($simplepie_cache_dir)) {
- mkdir($simplepie_cache_dir);
- }
+ if (!is_dir($simplepie_cache_dir)) {
+ mkdir($simplepie_cache_dir);
+ }
- $rss = new SimplePie();
- $rss->set_useragent(SELF_USER_AGENT);
- # $rss->set_timeout(10);
- $rss->set_feed_url($fetch_url);
- $rss->set_output_encoding('UTF-8');
- //$rss->force_feed(true);
+ $rss = new SimplePie();
+ $rss->set_useragent(SELF_USER_AGENT);
+ $rss->set_timeout($no_cache ? 15 : 60);
+ $rss->set_feed_url($fetch_url);
+ $rss->set_output_encoding('UTF-8');
+ //$rss->force_feed(true);
- if ($debug_enabled) {
- _debug("feed update interval (sec): " .
- get_feed_update_interval($link, $feed)*60);
- }
-
- $rss->enable_cache(!$no_cache);
+ if ($debug_enabled) {
+ _debug("feed update interval (sec): " .
+ get_feed_update_interval($link, $feed)*60);
+ }
- if (!$no_cache) {
- $rss->set_cache_location($simplepie_cache_dir);
- $rss->set_cache_duration($cache_age);
- }
+ $rss->enable_cache(!$no_cache);
- $rss->init();
+ if (!$no_cache) {
+ $rss->set_cache_location($simplepie_cache_dir);
+ $rss->set_cache_duration($cache_age);
}
- if ($memcache && $rss) $memcache->add($obj_id, $rss, 0, 300);
+ $rss->init();
}
// print_r($rss);
// db_query($link, "BEGIN");
- $result = db_query($link, "SELECT title,icon_url,site_url,owner_uid
+ if (DB_TYPE == "pgsql") {
+ $favicon_interval_qpart = "favicon_last_checked < NOW() - INTERVAL '12 hour'";
+ } else {
+ $favicon_interval_qpart = "favicon_last_checked < DATE_SUB(NOW(), INTERVAL 12 HOUR)";
+ }
+
+ $result = db_query($link, "SELECT title,icon_url,site_url,owner_uid,
+ (favicon_last_checked IS NULL OR $favicon_interval_qpart) AS
+ favicon_needs_check
FROM ttrss_feeds WHERE id = '$feed'");
$registered_title = db_fetch_result($result, 0, "title");
$orig_icon_url = db_fetch_result($result, 0, "icon_url");
$orig_site_url = db_fetch_result($result, 0, "site_url");
+ $favicon_needs_check = sql_bool_to_bool(db_fetch_result($result, 0,
+ "favicon_needs_check"));
$owner_uid = db_fetch_result($result, 0, "owner_uid");
_debug("update_rss_feed: checking favicon...");
}
- check_feed_favicon($site_url, $feed, $link);
+ if ($favicon_needs_check) {
+ check_feed_favicon($site_url, $feed, $link);
+
+ db_query($link, "UPDATE ttrss_feeds SET favicon_last_checked = NOW()
+ WHERE id = '$feed'");
+ }
if (!$registered_title || $registered_title == "[Unknown]") {
}
if ($debug_enabled) {
- _debug("update_rss_feed: loading filters...");
+ _debug("update_rss_feed: loading filters & labels...");
}
$filters = load_filters($link, $feed, $owner_uid);
+ $labels = get_all_labels($link, $owner_uid);
+
+ if ($debug_enabled) {
+ //print_r($filters);
+ _debug("update_rss_feed: " . count($filters) . " filters loaded.");
+ }
+
+ $filter_plugins = array();
+
+ if (defined('_ARTICLE_FILTER_PLUGINS')) {
+ foreach (explode(",", _ARTICLE_FILTER_PLUGINS) as $p) {
+ $pclass = "filter_" . trim($p);
-// if ($debug_enabled) {
-// print_r($filters);
-// }
+ if (class_exists($pclass)) {
+ $plugin = new $pclass($link);
+ array_push($filter_plugins, $plugin);
+ }
+ }
+ }
+
+ if ($debug_enabled) {
+ _debug("update_rss_feed: " . count($filter_plugins) . " filter plugins loaded.");
+ }
if ($use_simplepie) {
$iterator = $rss->get_items();
}
foreach ($iterator as $item) {
-
if ($_REQUEST['xdebug'] == 2) {
print_r($item);
}
}
$entry_content_unescaped = $entry_content;
+ $entry_cached_content = "";
if ($use_simplepie) {
$entry_comments = strip_tags($item->data["comments"]);
$entry_content = db_escape_string($entry_content, false);
- $content_hash = "SHA1:" . sha1(strip_tags($entry_content));
-
$entry_title = db_escape_string($entry_title);
$entry_link = db_escape_string($entry_link);
$entry_comments = mb_substr(db_escape_string($entry_comments), 0, 250);
$entry_tags[$i] = mb_strtolower($entry_tags[$i], 'utf-8');
if ($debug_enabled) {
- _debug("update_rss_feed: unfiltered tags found:");
- print_r($entry_tags);
+ //_debug("update_rss_feed: unfiltered tags found:");
+ //print_r($entry_tags);
}
- # sanitize content
-
- $entry_content = sanitize_article_content($entry_content);
- $entry_title = sanitize_article_content($entry_title);
-
if ($debug_enabled) {
_debug("update_rss_feed: done collecting data [TITLE:$entry_title]");
}
+ // TODO: less memory-hungry implementation
+ if (count($filter_plugins) > 0) {
+ if ($debug_enabled) {
+ _debug("update_rss_feed: applying plugin filters...");
+ }
+
+ $article = array("owner_uid" => $owner_uid,
+ "title" => $entry_title,
+ "content" => $entry_content,
+ "link" => $entry_link,
+ "tags" => $entry_tags,
+ "author" => $entry_author);
+
+ foreach ($filter_plugins as $plugin) {
+ $article = $plugin->filter_article($article);
+ }
+
+ $entry_title = $article["title"];
+ $entry_content = $article["content"];
+ $entry_tags = $article["tags"];
+ $entry_author = $article["author"];
+ }
+
+ $content_hash = "SHA1:" . sha1(strip_tags($entry_content));
+
db_query($link, "BEGIN");
if (db_num_rows($result) == 0) {
_debug("update_rss_feed: base guid not found");
}
+ if ($cache_content) {
+ if ($debug_enabled) {
+ _debug("update_rss_feed: caching content...");
+ }
+
+ $entry_cached_content = cache_content($link, $entry_link, $auth_login, $auth_pass);
+
+ if ($cache_images && is_writable(CACHE_DIR . '/images'))
+ $entry_cached_content = cache_images($entry_cached_content, $site_url, $debug_enabled);
+
+ $entry_cached_content = db_escape_string($entry_cached_content, false);
+
+ }
+
// base post entry does not exist, create it
$result = db_query($link,
updated,
content,
content_hash,
+ cached_content,
no_orig_date,
date_updated,
date_entered,
'$entry_timestamp_fmt',
'$entry_content',
'$content_hash',
+ '$entry_cached_content',
$no_orig_date,
NOW(),
NOW(),
'$entry_comments',
'$num_comments',
'$entry_author')");
+
+ $article_labels = array();
+
} else {
// we keep encountering the entry in feeds, so we need to
// update date_updated column so that we don't get horrible
db_query($link, "UPDATE ttrss_entries SET date_updated = NOW()
WHERE id = '$base_entry_id'");
+
+ $article_labels = get_article_labels($link, $base_entry_id, $owner_uid);
}
// now it should exist, if not - bad luck then
$published = 'false';
}
+ // N-grams
+
+ if (DB_TYPE == "pgsql" and defined('_NGRAM_TITLE_DUPLICATE_THRESHOLD')) {
+
+ $result = db_query($link, "SELECT COUNT(*) AS similar FROM
+ ttrss_entries,ttrss_user_entries
+ WHERE ref_id = id AND updated >= NOW() - INTERVAL '7 day'
+ AND similarity(title, '$entry_title') >= "._NGRAM_TITLE_DUPLICATE_THRESHOLD."
+ AND owner_uid = $owner_uid");
+
+ $ngram_similar = db_fetch_result($result, 0, "similar");
+
+ if ($debug_enabled) {
+ _debug("update_rss_feed: N-gram similar results: $ngram_similar");
+ }
+
+ if ($ngram_similar > 0) {
+ $unread = 'false';
+ }
+ }
+
$result = db_query($link,
"INSERT INTO ttrss_user_entries
(ref_id, owner_uid, feed_id, unread, last_read, marked,
if ($content_hash != $orig_content_hash) {
$post_needs_update = true;
$update_insignificant = false;
+
+ if ($cache_content) {
+ if ($debug_enabled) {
+ _debug("update_rss_feed: caching content because original checksum changed...");
+ }
+
+ $entry_cached_content = cache_content($link, $entry_link, $auth_login, $auth_pass);
+
+ if ($cache_images && is_writable(CACHE_DIR . '/images'))
+ $entry_cached_content = cache_images($entry_cached_content, $site_url, $debug_enabled);
+
+ $entry_cached_content = db_escape_string($entry_cached_content, false);
+ }
}
if (db_escape_string($orig_title) != $entry_title) {
db_query($link, "UPDATE ttrss_entries
SET title = '$entry_title', content = '$entry_content',
content_hash = '$content_hash',
+ cached_content = '$entry_cached_content',
updated = '$entry_timestamp_fmt',
num_comments = '$num_comments'
WHERE id = '$ref_id'");
_debug("update_rss_feed: assigning labels...");
}
- assign_article_to_labels($link, $entry_ref_id, $article_filters,
- $owner_uid);
+ assign_article_to_label_filters($link, $entry_ref_id, $article_filters,
+ $owner_uid, $article_labels);
if ($debug_enabled) {
_debug("update_rss_feed: looking for enclosures...");
// check for manual tags (we have to do it here since they're loaded from filters)
foreach ($article_filters as $f) {
- if ($f[0] == "tag") {
+ if ($f["type"] == "tag") {
- $manual_tags = trim_array(explode(",", $f[1]));
+ $manual_tags = trim_array(explode(",", $f["param"]));
foreach ($manual_tags as $tag) {
if (tag_is_valid($tag)) {
db_query($link, "COMMIT");
}
+ if (get_pref($link, "AUTO_ASSIGN_LABELS", $owner_uid, false)) {
+ if ($debug_enabled) {
+ _debug("update_rss_feed: auto-assigning labels...");
+ }
+
+ foreach ($labels as $label) {
+ $caption = $label["caption"];
+
+ if (preg_match("/\b$caption\b/i", "$tags_str " . strip_tags($entry_content) . " $entry_title")) {
+ if (!labels_contains_caption($article_labels, $caption)) {
+ label_add_article($link, $entry_ref_id, $caption, $owner_uid);
+ }
+ }
+ }
+ }
+
if ($debug_enabled) {
_debug("update_rss_feed: article processed");
}
}
function expire_cached_files($debug) {
- foreach (array("magpie", "simplepie", "images") as $dir) {
+ foreach (array("magpie", "simplepie", "images", "export") as $dir) {
$cache_dir = CACHE_DIR . "/$dir";
if ($debug) _debug("Expiring $cache_dir");
return $params;
}
+
+ /**
+  * Collect the actions of every filter whose rules match an article.
+  *
+  * Each entry of $filters is read as an array with three keys:
+  *   "match_any_rule" - truthy: one matching rule is enough;
+  *                      falsy: every rule with a non-empty reg_exp must match
+  *   "rules"          - list of arrays with "type" and "reg_exp"
+  *   "actions"        - list of action entries, returned as-is on a match
+  *
+  * Rule types handled: title, content, both (title or content), link,
+  * author and tag. Rules with an empty reg_exp are skipped; a rule of any
+  * other type never matches. Matching is case-insensitive, and an invalid
+  * expression counts as "no match" (preg warnings are suppressed).
+  *
+  * @param array  $filters   filters in the shape described above
+  * @param string $title     text matched by "title" and "both" rules
+  * @param string $content   text matched by "content" and "both" rules
+  *                          (CR, LF and TAB are stripped before matching)
+  * @param string $link      text matched by "link" rules
+  * @param mixed  $timestamp accepted but not consulted by any rule type
+  * @param string $author    text matched by "author" rules
+  * @param array  $tags      tag strings, joined with "," for "tag" rules
+  * @return array flat list of the action entries of all matching filters,
+  *               in filter order
+  */
+ function get_article_filters($filters, $title, $content, $link, $timestamp, $author, $tags) {
+     $matches = array();
+
+     // Both values are the same for every rule, so build them at most once,
+     // and only when a rule actually needs them.
+     $flat_content = null;
+     $tag_string = null;
+
+     foreach ($filters as $filter) {
+         $match_any_rule = $filter["match_any_rule"];
+         $filter_match = false;
+
+         foreach ($filter["rules"] as $rule) {
+             $reg_exp = $rule["reg_exp"];
+
+             if (!$reg_exp)
+                 continue;
+
+             // "/" is the pattern delimiter: escape every slash that is not
+             // already escaped (i.e. preceded by an even number of
+             // backslashes), otherwise the pattern is cut short and the rule
+             // silently never matches.
+             $pattern = "/" .
+                 preg_replace('~(?<!\\\\)((?:\\\\\\\\)*)/~', '$1\\/', $reg_exp) .
+                 "/i";
+
+             $match = false;
+
+             switch ($rule["type"]) {
+             case "title":
+                 $match = @preg_match($pattern, $title);
+                 break;
+             case "content":
+             case "both":
+                 if ($flat_content === null) {
+                     // we don't need to deal with multiline regexps
+                     $flat_content = preg_replace("/[\r\n\t]/", "", $content);
+                 }
+
+                 $match = @preg_match($pattern, $flat_content);
+
+                 if (!$match && $rule["type"] == "both") {
+                     $match = @preg_match($pattern, $title);
+                 }
+                 break;
+             case "link":
+                 $match = @preg_match($pattern, $link);
+                 break;
+             case "author":
+                 $match = @preg_match($pattern, $author);
+                 break;
+             case "tag":
+                 if ($tag_string === null) {
+                     $tag_string = join(",", $tags);
+                 }
+
+                 $match = @preg_match($pattern, $tag_string);
+                 break;
+             }
+
+             // preg_match() yields 1, 0 or false; keep a plain boolean
+             $match = (bool) $match;
+
+             if ($match_any_rule) {
+                 if ($match) {
+                     $filter_match = true;
+                     break;
+                 }
+             } else {
+                 $filter_match = $match;
+                 if (!$match) {
+                     break;
+                 }
+             }
+         }
+
+         if ($filter_match) {
+             foreach ($filter["actions"] as $action) {
+                 array_push($matches, $action);
+             }
+         }
+     }
+
+     return $matches;
+ }
+
+ /**
+  * Return the first entry of $filters whose "type" equals $filter_name.
+  *
+  * @param array  $filters     list of arrays carrying a "type" key
+  * @param string $filter_name type to look for (loose comparison)
+  * @return array|false the first matching entry, or false when none matches
+  */
+ function find_article_filter($filters, $filter_name) {
+     $found = false;
+
+     foreach ($filters as $candidate) {
+         if ($candidate["type"] != $filter_name) {
+             continue;
+         }
+
+         // only the first hit is wanted
+         $found = $candidate;
+         break;
+     }
+
+     return $found;
+ }
+
+ /**
+  * Return every entry of $filters whose "type" equals $filter_name.
+  *
+  * @param array  $filters     list of arrays carrying a "type" key
+  * @param string $filter_name type to look for (loose comparison)
+  * @return array matching entries in their original order, re-indexed
+  *               from 0; empty when nothing matches
+  */
+ function find_article_filters($filters, $filter_name) {
+     $selected = array();
+
+     foreach ($filters as $candidate) {
+         if ($candidate["type"] != $filter_name) {
+             continue;
+         }
+
+         $selected[] = $candidate;
+     }
+
+     return $selected;
+ }
+
+ /**
+  * Sum the "param" values of all entries whose "type" is "score".
+  *
+  * @param array $filters list of arrays carrying "type" and "param" keys
+  * @return int|float total of the score params; 0 when there are none
+  */
+ function calculate_article_score($filters) {
+     $total = 0;
+
+     foreach ($filters as $entry) {
+         if ($entry["type"] != "score") {
+             continue;
+         }
+
+         $total = $total + $entry["param"];
+     }
+
+     return $total;
+ }
+
+ /**
+  * Tell whether any entry of $labels has $caption at index 1.
+  *
+  * @param array  $labels  list of arrays; index 1 of each entry is compared
+  * @param string $caption caption to look for (loose comparison)
+  * @return bool true on the first hit, false when no entry matches
+  */
+ function labels_contains_caption($labels, $caption) {
+     $present = false;
+
+     foreach ($labels as $entry) {
+         if ($entry[1] == $caption) {
+             $present = true;
+             break;
+         }
+     }
+
+     return $present;
+ }
+
+ /**
+  * Attach an article to the label named by each "label" filter action,
+  * skipping captions already present in $article_labels.
+  *
+  * @param mixed $link           passed through to label_add_article()
+  * @param mixed $id             article id passed to label_add_article()
+  * @param array $filters        list of arrays carrying "type" and "param"
+  * @param mixed $owner_uid      passed through to label_add_article()
+  * @param array $article_labels labels checked via labels_contains_caption()
+  */
+ function assign_article_to_label_filters($link, $id, $filters, $owner_uid, $article_labels) {
+     foreach ($filters as $action) {
+         if ($action["type"] != "label") {
+             continue;
+         }
+
+         $caption = $action["param"];
+
+         if (labels_contains_caption($article_labels, $caption)) {
+             continue;
+         }
+
+         label_add_article($link, $id, $caption, $owner_uid);
+     }
+ }
+
+ /**
+  * Fetch a page and return the serialised <body> element of its HTML.
+  *
+  * @param mixed  $link  not used by this function
+  * @param string $url   passed to fetch_file_contents()
+  * @param mixed  $login passed through to fetch_file_contents()
+  * @param mixed  $pass  passed through to fetch_file_contents()
+  * @return string the <body> node serialised with saveXML(), or "" when
+  *                nothing was fetched or the document has no <body>
+  */
+ function cache_content($link, $url, $login, $pass) {
+     $content = fetch_file_contents($url, $login, $pass);
+
+     if (!$content) {
+         return "";
+     }
+
+     $doc = new DOMDocument();
+     // parser warnings are suppressed; the document is used as far as it loads
+     @$doc->loadHTML($content);
+
+     $node = $doc->getElementsByTagName('body')->item(0);
+
+     if (!$node) {
+         return "";
+     }
+
+     // LIBXML_NOEMPTYTAG writes <tag></tag> instead of self-closing <tag/>
+     return $doc->saveXML($node, LIBXML_NOEMPTYTAG);
+ }
?>