extensive refactoring

This commit is contained in:
Matthew Exon 2019-10-02 07:19:59 +02:00 committed by Matthew Exon
parent dbfc24d51f
commit 2d8e13d53d
2 changed files with 164 additions and 173 deletions

View file

@ -177,14 +177,14 @@ function retriever_clean_up_completed_resources($max_items, $a) {
} }
$resource = DBA::selectFirst('retriever_resource', [], ['id' => intval($rr['resource'])]); $resource = DBA::selectFirst('retriever_resource', [], ['id' => intval($rr['resource'])]);
retriever_apply_completed_resource_to_item($retriever_rule, $item, $resource, $a); retriever_apply_completed_resource_to_item($retriever_rule, $item, $resource, $a);
Logger::info('@@@ retriever_clean_up_completed_resources tried to update id ' . $retriver_item['id'] . ' to finished, better check that it really worked!'); Logger::info('@@@ retriever_clean_up_completed_resources tried to update id ' . $retriever_item['id'] . ' to finished, better check that it really worked!');
DBA::update('retriever_item', ['finished' => 1], ['id' => intval($retriever_item['id'])], ['finished']); DBA::update('retriever_item', ['finished' => 1], ['id' => intval($retriever_item['id'])], ['finished']);
retriever_check_item_completed($item); retriever_check_item_completed($item);
} }
} }
function retriever_tidy() { function retriever_tidy() {
// TODO: figure out how to do this with DBA module // TODO: figure out how to do this with DBA module @@@ it is possible
q("DELETE FROM retriever_resource WHERE completed IS NOT NULL AND completed < DATE_SUB(now(), INTERVAL 1 WEEK)"); q("DELETE FROM retriever_resource WHERE completed IS NOT NULL AND completed < DATE_SUB(now(), INTERVAL 1 WEEK)");
q("DELETE FROM retriever_resource WHERE completed IS NULL AND created < DATE_SUB(now(), INTERVAL 3 MONTH)"); q("DELETE FROM retriever_resource WHERE completed IS NULL AND created < DATE_SUB(now(), INTERVAL 3 MONTH)");
@ -212,8 +212,6 @@ function retrieve_dataurl_resource($resource) {
} }
function retrieve_resource($resource) { function retrieve_resource($resource) {
Logger::info('@@@ retrieve_resource: url ' . $resource['url'] . ' uid ' . $resource['item-uid'] . ' cid ' . $resource['contact-id']);
if (substr($resource['url'], 0, 5) == "data:") { if (substr($resource['url'], 0, 5) == "data:") {
return retrieve_dataurl_resource($resource); return retrieve_dataurl_resource($resource);
} }
@ -221,24 +219,22 @@ function retrieve_resource($resource) {
$a = get_app(); $a = get_app();
$retriever_rule = get_retriever_rule($resource['contact-id'], $resource['item-uid']); $retriever_rule = get_retriever_rule($resource['contact-id'], $resource['item-uid']);
$rule_data = $retriever_rule['data'];
try { try {
Logger::debug('retrieve_resource: ' . ($resource['num-tries'] + 1) . ' attempt at resource ' . $resource['id'] . ' ' . $resource['url']); Logger::debug('retrieve_resource: ' . ($resource['num-tries'] + 1) . ' attempt at resource ' . $resource['id'] . ' ' . $resource['url']);
$redirects = 0; $redirects = 0;
$cookiejar = ''; $cookiejar = '';
Logger::debug('@@@ retrieve_resource storecookies ' . $retriever_rule['storecookies']); if (array_key_exists('storecookies', $rule_data) && $rule_data['storecookies']) {
if (array_key_exists('storecookies', $retriever_rule) && $retriever_rule['storecookies']) {
$cookiejar = tempnam(get_temppath(), 'cookiejar-retriever-'); $cookiejar = tempnam(get_temppath(), 'cookiejar-retriever-');
Logger::debug('@@@ retrieve_resource cookie file ' . $cookiejar . ' content ' . $retriever_rule['cookiedata']); file_put_contents($cookiejar, $rule_data['cookiedata']);
file_put_contents($cookiejar, $retriever_rule['cookiedata']);
} }
$fetch_result = Network::fetchUrlFull($resource['url'], $resource['binary'], $redirects, '', $cookiejar); $fetch_result = Network::fetchUrlFull($resource['url'], $resource['binary'], $redirects, '', $cookiejar);
if (array_key_exists('storecookies', $retriever_rule) && $retriever_rule['storecookies']) { if (array_key_exists('storecookies', $rule_data) && $rule_data['storecookies']) {
$retriever_rule['cookiedata'] = file_get_contents($cookiejar); $retriever_rule['data']['cookiedata'] = file_get_contents($cookiejar);
Logger::debug('@@@ retriever_resource update cookie ' . json_encode($retriever_rule['data'] . ' id ' . $retriever_rule['id'])); DBA::update('retriever_rule', ['data' => json_encode($retriever_rule['data'])], ['id' => intval($retriever_rule["id"])]);
q("UPDATE `retriever_rule` SET `data`='%s' WHERE `id` = %d", //@@@ check the update worked
DBA::escape(json_encode($retriever_rule['data'])), intval($retriever_rule["id"])); unlink($cookiejar);
/* unlink($cookiejar); */ //@@@
} }
$resource['data'] = $fetch_result->getBody(); $resource['data'] = $fetch_result->getBody();
$resource['http-code'] = $fetch_result->getReturnCode(); $resource['http-code'] = $fetch_result->getReturnCode();
@ -248,36 +244,33 @@ function retrieve_resource($resource) {
} catch (Exception $e) { } catch (Exception $e) {
Logger::info('retrieve_resource: unable to retrieve ' . $resource['url'] . ' - ' . $e->getMessage()); Logger::info('retrieve_resource: unable to retrieve ' . $resource['url'] . ' - ' . $e->getMessage());
} }
// TODO: figure out how to do this with DBA module
q("UPDATE `retriever_resource` SET `last-try` = now(), `num-tries` = `num-tries` + 1, `http-code` = %d, `redirect-url` = '%s' WHERE id = %d", q("UPDATE `retriever_resource` SET `last-try` = now(), `num-tries` = `num-tries` + 1, `http-code` = %d, `redirect-url` = '%s' WHERE id = %d",
intval($resource['http-code']), intval($resource['http-code']),
DBA::escape($resource['redirect-url']), DBA::escape($resource['redirect-url']),
intval($resource['id'])); intval($resource['id']));
if ($resource['data']) { if ($resource['data']) {
// TODO: figure out how to do this with DBA module
q("UPDATE `retriever_resource` SET `completed` = now(), `data` = '%s', `type` = '%s' WHERE id = %d", q("UPDATE `retriever_resource` SET `completed` = now(), `data` = '%s', `type` = '%s' WHERE id = %d",
DBA::escape($resource['data']), DBA::escape($resource['data']),
DBA::escape($resource['type']), DBA::escape($resource['type']),
intval($resource['id'])); intval($resource['id']));
retriever_resource_completed($resource, $a); retriever_resource_completed($resource, $a);
} }
Logger::info('@@@ retrieve_resource finished: ' . $resource['url']);
} }
function get_retriever_rule($contact_id, $uid, $create = false) { function get_retriever_rule($contact_id, $uid, $create = false) {
Logger::info('@@@ get_retriever_rule ' . "SELECT * FROM `retriever_rule` WHERE `contact-id` = " . intval($contact_id) . " AND `uid` = " . intval($uid)); $retriever_rule = DBA::selectFirst('retriever_rule', [], ['contact-id' => intval($contact_id), 'uid' => intval($uid)]);
$r = q("SELECT * FROM `retriever_rule` WHERE `contact-id` = %d AND `uid` = %d", //@@@ check that this worked
intval($contact_id), intval($uid)); if ($retriever_rule) {
Logger::info('@@@ get_retriever_rule count is ' . count($r)); $retriever_rule['data'] = json_decode($retriever_rule['data'], true);
if (count($r)) {
$r[0]['data'] = json_decode($r[0]['data'], true);
Logger::info('@@@ get_retriever_rule returning an actual thing'); Logger::info('@@@ get_retriever_rule returning an actual thing');
return $r[0]; return $retriever_rule;
} }
if ($create) { if ($create) {
q("INSERT INTO `retriever_rule` (`uid`, `contact-id`) VALUES (%d, %d)", DBA::insert('retriever_rule', ['uid' => intval($uid), 'contact-id' => intval($contact_id)]);
intval($uid), intval($contact_id)); //@@@ check that this worked
$r = q("SELECT * FROM `retriever_rule` WHERE `contact-id` = %d AND `uid` = %d", return DBA::selectFirst('retriever_rule', [], ['contact-id' => intval($contact_id), 'uid' => intval($uid)]);
intval($contact_id), intval($uid));
return $r[0];
} }
} }
@ -285,38 +278,13 @@ function retriever_get_retriever_item($id) {
return DBA::selectFirst('retriever_item', [], ['id' => intval($id)]); return DBA::selectFirst('retriever_item', [], ['id' => intval($id)]);
} }
/**
 * Debugging helper: guess what kind of row an array represents from the keys it carries.
 *
 * @param mixed $item Candidate row; any falsy value is reported as 'false'.
 * @return string 'false', 'retriever_item', 'friendica_item' or 'unknown'.
 */
function retriever_class_of_item($item) { //@@@
	$kind = 'unknown';
	if (!$item) {
		$kind = 'false';
	}
	elseif (array_key_exists('finished', $item)) {
		// A 'finished' key identifies a retriever_item row; the log line flags it as unexpected.
		Logger::info('@@@ oh no this is a bad thing');
		$kind = 'retriever_item';
	}
	elseif (array_key_exists('moderated', $item)) {
		$kind = 'friendica_item';
	}
	return $kind;
}
/**
 * Debugging stub: ignores its argument and returns a fixed marker string.
 *
 * @param mixed $item Unused.
 * @return string Always 'mat_test'.
 */
function mat_test($item) { //@@@
	$marker = 'mat_test';
	return $marker;
}
function retriever_get_item($retriever_item) { function retriever_get_item($retriever_item) {
// @@@ add contact id as a search term $item = Item::selectFirst([], ['uri' => $retriever_item['item-uri'], 'uid' => intval($retriever_item['item-uid']), 'contact-id' => intval($retriever_item['contact-id'])]);
Logger::info('@@@ retriever_get_item uri ' . $retriever_item['item-uri'] . ' uid ' . $retriever_item['item-uid'] . ' cid ' . $retriever_item['contact-id']);
try {//@@@ not necessary
$item = Item::selectFirst([], ['uri' => $retriever_item['item-uri'], 'uid' => intval($retriever_item['item-uid'])]);
if (!DBA::isResult($item)) { if (!DBA::isResult($item)) {
Logger::warning('retriever_get_item: no item found for uri ' . $retriever_item['item-uri']); Logger::warning('retriever_get_item: no item found for uri ' . $retriever_item['item-uri']);
return; return;
} }
Logger::info('@@@ retriever_get_item: yay item found for uri ' . $retriever_item['item-uri'] . ' guid ' . $item['guid'] . ' plink ' . $item['plink']);
return $item; return $item;
} catch (Exception $e) {
Logger::info('retriever_get_item: exception ' . $e->getMessage());
}
} }
function retriever_item_completed($retriever_item_id, $resource, $a) { function retriever_item_completed($retriever_item_id, $resource, $a) {
@ -328,7 +296,6 @@ function retriever_item_completed($retriever_item_id, $resource, $a) {
return; return;
} }
$item = retriever_get_item($retriever_item); $item = retriever_get_item($retriever_item);
Logger::info('@@@ 2 item class is ' . retriever_class_of_item($item) . ' ' . mat_test($item));
if (!$item) { if (!$item) {
Logger::warning('retriever_item_completed: no item ' . $retriever_item['item-uri']); Logger::warning('retriever_item_completed: no item ' . $retriever_item['item-uri']);
return; return;
@ -338,25 +305,23 @@ function retriever_item_completed($retriever_item_id, $resource, $a) {
retriever_apply_completed_resource_to_item($retriever_rule, $item, $resource, $a); retriever_apply_completed_resource_to_item($retriever_rule, $item, $resource, $a);
q("UPDATE `retriever_item` SET `finished` = 1 WHERE id = %d", DBA::update('retriever_item', ['finished' => 1], ['id' => intval($retriever_item['id'])], ['finished']);
intval($retriever_item['id']));
retriever_check_item_completed($item); retriever_check_item_completed($item);
} }
function retriever_resource_completed($resource, $a) { function retriever_resource_completed($resource, $a) {
Logger::debug('retriever_resource_completed: id ' . $resource['id'] . ' url ' . $resource['url']); Logger::debug('retriever_resource_completed: id ' . $resource['id'] . ' url ' . $resource['url']);
$r = q("SELECT `id` FROM `retriever_item` WHERE `resource` = %d", $resource['id']);
foreach (DBA::select('retriever_item', ['id'], ['resource' => intval($resource['id'])]) as $retriever_item) { foreach (DBA::select('retriever_item', ['id'], ['resource' => intval($resource['id'])]) as $retriever_item) {
retriever_item_completed($retriever_item['id'], $resource, $a); retriever_item_completed($retriever_item['id'], $resource, $a);
} }
} }
function apply_retrospective($a, $retriever, $num) { function apply_retrospective($a, $retriever, $num) {
$r = q("SELECT * FROM `item` WHERE `contact-id` = %d ORDER BY `received` DESC LIMIT %d", Logger::debug('@@@ apply_retrospective');
intval($retriever['contact-id']), intval($num)); foreach (Item::select([], ['contact-id' => intval($retriever['contact-id'])], ['order' => ['received' => true], 'limit' => $num]) as $item) {
foreach ($r as $item) { Logger::debug('@@@ apply_retrospective got item id ' . $item['id'] . ' uri ' . $item['uri']);
q('UPDATE `item` SET `visible` = 0 WHERE `id` = %d', $item['id']); Item::update(['visible' => 0], ['id' => intval($item['id'])]);
q('UPDATE `thread` SET `visible` = 0 WHERE `iid` = %d', $item['id']); //@@@ check that this works
foreach (DBA::select('retriever_item', [], ['item-uri' => $item['uri'], 'item-uid' => $item['uid'], 'contact-id' => $item['contact-id']]) as $retriever_item) { foreach (DBA::select('retriever_item', [], ['item-uri' => $item['uri'], 'item-uid' => $item['uid'], 'contact-id' => $item['contact-id']]) as $retriever_item) {
DBA::delete('retriever_resource', ['id' => $retriever_item['resource']]); DBA::delete('retriever_resource', ['id' => $retriever_item['resource']]);
DBA::delete('retriever_item', ['id' => $retriever_item['id']]); DBA::delete('retriever_item', ['id' => $retriever_item['id']]);
@ -368,13 +333,11 @@ function apply_retrospective($a, $retriever, $num) {
// TODO: Currently this waits until the next cron before actually downloading. Should do it immediately. // TODO: Currently this waits until the next cron before actually downloading. Should do it immediately.
// TODO: This queries then inserts. It should use some kind of lock to avoid requesting the same resource twice. // TODO: This queries then inserts. It should use some kind of lock to avoid requesting the same resource twice.
function retriever_on_item_insert($a, $retriever, &$item) { function retriever_on_item_insert($a, $retriever, &$item) {
Logger::info('@@@ retriever_on_item_insert start plink ' . $item['plink'] . ' id ' . $item['id']);
if (!$retriever || !$retriever['id']) { if (!$retriever || !$retriever['id']) {
Logger::info('retriever_on_item_insert: No retriever supplied'); Logger::info('retriever_on_item_insert: No retriever supplied');
return; return;
} }
if (!array_key_exists('enable', $retriever['data']) || !$retriever['data']['enable'] == "on") { if (!array_key_exists('enable', $retriever['data']) || !$retriever['data']['enable'] == "on") {
Logger::info('@@@ retriever_on_item_insert: Disabled');
return; return;
} }
if (array_key_exists('plink', $item) && strlen($item['plink'])) { if (array_key_exists('plink', $item) && strlen($item['plink'])) {
@ -389,13 +352,12 @@ function retriever_on_item_insert($a, $retriever, &$item) {
$url = $content['plink']; $url = $content['plink'];
} }
if (array_key_exists('pattern', $retriever['data']) && $retriever['data']['pattern']) { if ($retriever['data']['modurl']) {
$orig_url = $url; $orig_url = $url;
$url = preg_replace('/' . $retriever['data']['pattern'] . '/', $retriever['data']['replace'], $orig_url); $url = preg_replace('/' . $retriever['data']['pattern'] . '/', $retriever['data']['replace'], $orig_url);
Logger::debug('retriever_on_item_insert: Changed ' . $orig_url . ' to ' . $url); Logger::debug('retriever_on_item_insert: Changed ' . $orig_url . ' to ' . $url);
} }
Logger::debug('@@@ retriever_on_item_insert: about to add_retriever_resource uid ' . $item['uid'] . ' cid ' . $item['contact-id'] . ' url ' . $url);
$resource = add_retriever_resource($a, $url, $item['uid'], $item['contact-id']); $resource = add_retriever_resource($a, $url, $item['uid'], $item['contact-id']);
$retriever_item_id = add_retriever_item($item, $resource); $retriever_item_id = add_retriever_item($item, $resource);
} }
@ -412,16 +374,15 @@ function add_retriever_resource($a, $url, $uid, $cid, $binary = false) {
fclose($fp); fclose($fp);
$url = 'md5://' . hash('md5', $url); $url = 'md5://' . hash('md5', $url);
//@@@ fix this if (DBA::selectFirst('retriever_resource', [], ['url' => $url, 'item-uid' => intval($uid), 'contact-id' => intval($cid)])) {
$r = q("SELECT * FROM `retriever_resource` WHERE `url` = '%s' AND `item-uid` = %d AND `contact-id` = %d", DBA::escape($url), intval($uid), intval($cid)); //@@@ test that this really happens - it should sometimes
$resource = $r[0];
if (count($r)) {
Logger::debug('add_retriever_resource: Resource ' . $url . ' already requested'); Logger::debug('add_retriever_resource: Resource ' . $url . ' already requested');
return $resource; return $resource;
} }
Logger::debug('retrieve_resource: got data URL type ' . $resource['type']); Logger::debug('retrieve_resource: got data URL type ' . $resource['type']);
//@@@ fix this // TODO: figure out how to do this with DBA module
// @@@ DBA::update('workerqueue', ['executed' => DateTimeFormat::utcNow()], ['pid' => $mypid, 'done' => false]);
q("INSERT INTO `retriever_resource` (`item-uid`, `contact-id`, `type`, `binary`, `url`, `completed`, `data`) " . q("INSERT INTO `retriever_resource` (`item-uid`, `contact-id`, `type`, `binary`, `url`, `completed`, `data`) " .
"VALUES (%d, %d, '%s', %d, '%s', now(), '%s')", "VALUES (%d, %d, '%s', %d, '%s', now(), '%s')",
intval($uid), intval($uid),
@ -430,6 +391,7 @@ function add_retriever_resource($a, $url, $uid, $cid, $binary = false) {
intval($binary ? 1 : 0), intval($binary ? 1 : 0),
DBA::escape($url), DBA::escape($url),
DBA::escape($data)); DBA::escape($data));
//@@@ fix this
$r = q("SELECT * FROM `retriever_resource` WHERE `url` = '%s'", DBA::escape($url)); $r = q("SELECT * FROM `retriever_resource` WHERE `url` = '%s'", DBA::escape($url));
$resource = $r[0]; $resource = $r[0];
if (count($r)) { if (count($r)) {
@ -449,16 +411,18 @@ function add_retriever_resource($a, $url, $uid, $cid, $binary = false) {
return $r[0]; return $r[0];
} }
//@@@ fix this
q("INSERT INTO `retriever_resource` (`item-uid`, `contact-id`, `binary`, `url`) " . q("INSERT INTO `retriever_resource` (`item-uid`, `contact-id`, `binary`, `url`) " .
"VALUES (%d, %d, %d, '%s')", intval($uid), intval($cid), intval($binary ? 1 : 0), DBA::escape($url)); "VALUES (%d, %d, %d, '%s')", intval($uid), intval($cid), intval($binary ? 1 : 0), DBA::escape($url));
//@@@ fix this
$r = q("SELECT * FROM `retriever_resource` WHERE `url` = '%s'", DBA::escape($url)); $r = q("SELECT * FROM `retriever_resource` WHERE `url` = '%s'", DBA::escape($url));
return $r[0]; return $r[0];
} }
function add_retriever_item(&$item, $resource) { function add_retriever_item(&$item, $resource) {
Logger::debug('@@@ 5 item class is ' . retriever_class_of_item($item) . ' ' . mat_test($item));
Logger::debug('add_retriever_item: ' . $resource['url'] . ' for ' . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id']); Logger::debug('add_retriever_item: ' . $resource['url'] . ' for ' . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id']);
//@@@ can use selectFirst
$r = q("SELECT COUNT(*) FROM `retriever_item` WHERE " . $r = q("SELECT COUNT(*) FROM `retriever_item` WHERE " .
"`item-uri` = '%s' AND `item-uid` = %d AND `contact-id` = %d AND `resource` = %d", "`item-uri` = '%s' AND `item-uid` = %d AND `contact-id` = %d AND `resource` = %d",
DBA::escape($item['uri']), intval($item['uid']), intval($item['contact-id']), intval($resource['id'])); DBA::escape($item['uri']), intval($item['uid']), intval($item['contact-id']), intval($resource['id']));
@ -466,9 +430,11 @@ function add_retriever_item(&$item, $resource) {
Logger::info("add_retriever_item: retriever item already present for " . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id']); Logger::info("add_retriever_item: retriever item already present for " . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id']);
return; return;
} }
//@@@ fix this
q("INSERT INTO `retriever_item` (`item-uri`, `item-uid`, `contact-id`, `resource`) " . q("INSERT INTO `retriever_item` (`item-uri`, `item-uid`, `contact-id`, `resource`) " .
"VALUES ('%s', %d, %d, %d)", "VALUES ('%s', %d, %d, %d)",
DBA::escape($item['uri']), intval($item['uid']), intval($item['contact-id']), intval($resource["id"])); DBA::escape($item['uri']), intval($item['uid']), intval($item['contact-id']), intval($resource["id"]));
//@@@ fix this
$r = q("SELECT id FROM `retriever_item` WHERE " . $r = q("SELECT id FROM `retriever_item` WHERE " .
"`item-uri` = '%s' AND `item-uid` = %d AND `contact-id` = %d AND `resource` = %d ORDER BY id DESC", "`item-uri` = '%s' AND `item-uid` = %d AND `contact-id` = %d AND `resource` = %d ORDER BY id DESC",
DBA::escape($item['uri']), intval($item['uid']), intval($item['contact-id']), intval($resource['id'])); DBA::escape($item['uri']), intval($item['uid']), intval($item['contact-id']), intval($resource['id']));
@ -505,8 +471,10 @@ function retriever_apply_xslt_text($xslt_text, $doc) {
return $result; return $result;
} }
//@@@ I think this is supposed to update the $item, but it doesn't
function retriever_apply_dom_filter($retriever, &$item, $resource) { function retriever_apply_dom_filter($retriever, &$item, $resource) {
Logger::debug('retriever_apply_dom_filter: applying XSLT to ' . $item['id'] . ' ' . $item['uri'] . ' contact ' . $item['contact-id']); //@@@ check if id and uri-id are there //@@@ uri-id definitely is not
Logger::debug('retriever_apply_dom_filter: applying XSLT to ' . $item['id'] . ' ' . $item['uri'] . ' contact ' . $item['contact-id'] . ' uri-id ' . $item['uri-id']);
if (!array_key_exists('include', $retriever['data']) && !array_key_exists('customxslt', $retriever['data'])) { if (!array_key_exists('include', $retriever['data']) && !array_key_exists('customxslt', $retriever['data'])) {
Logger::info('retriever_apply_dom_filter: no include and no customxslt'); Logger::info('retriever_apply_dom_filter: no include and no customxslt');
@ -517,41 +485,15 @@ function retriever_apply_dom_filter($retriever, &$item, $resource) {
return; return;
} }
//@@@ break this bit into separate function $doc = retriever_load_into_dom($resource);
$encoding = retriever_get_encoding($resource);
$content = mb_convert_encoding($resource['data'], 'HTML-ENTITIES', $encoding);
$doc = new DOMDocument('1.0', 'UTF-8');
if (strpos($resource['type'], 'html') !== false) {
@$doc->loadHTML($content);
}
else {
$doc->loadXML($content);
}
$params = array('$spec' => $retriever['data']); $doc = retriever_extract($doc, $retriever);
$extract_template = Renderer::getMarkupTemplate('extract.tpl', 'addon/retriever/');
$extract_xslt = Renderer::replaceMacros($extract_template, $params);
if ($retriever['data']['include']) {
Logger::debug('retriever_apply_dom_filter: applying include/exclude template \"' . $extract_xslt . '\"');
$doc = retriever_apply_xslt_text($extract_xslt, $doc);
}
if (array_key_exists('customxslt', $retriever['data']) && $retriever['data']['customxslt']) {
Logger::debug('retriever_apply_dom_filter: applying custom XSLT \"' . $retriever['data']['customxslt'] . '\"');
$doc = retriever_apply_xslt_text($retriever['data']['customxslt'], $doc);
}
if (!$doc) { if (!$doc) {
Logger::info('retriever_apply_dom_filter: failed to apply extract XSLT template'); Logger::info('retriever_apply_dom_filter: failed to apply extract XSLT template');
return; return;
} }
//@@@ break this bit into separate function $doc = retriever_globalise_urls($doc, $resource);
$components = parse_url($resource['redirect-url']);
$rooturl = $components['scheme'] . "://" . $components['host'];
$dirurl = $rooturl . dirname($components['path']) . "/";
$params = array('$dirurl' => $dirurl, '$rooturl' => $rooturl);
$fix_urls_template = Renderer::getMarkupTemplate('fix-urls.tpl', 'addon/retriever/');
$fix_urls_xslt = Renderer::replaceMacros($fix_urls_template, $params);
$doc = retriever_apply_xslt_text($fix_urls_xslt, $doc);
if (!$doc) { if (!$doc) {
Logger::info('retriever_apply_dom_filter: failed to apply fix urls XSLT template'); Logger::info('retriever_apply_dom_filter: failed to apply fix urls XSLT template');
return; return;
@ -571,10 +513,56 @@ function retriever_apply_dom_filter($retriever, &$item, $resource) {
Item::update(['body' => $body], ['uri-id' => $uri_id]); Item::update(['body' => $body], ['uri-id' => $uri_id]);
} }
/**
 * Parse the payload of a retrieved resource into a DOM document.
 *
 * The payload is converted from the encoding reported by
 * retriever_get_encoding() and then loaded as HTML when the resource type
 * contains "html", otherwise as XML.
 *
 * @param array $resource Resource row; the 'data' and 'type' entries are read.
 * @return DOMDocument Always a document object; it is left empty when there
 *                     was nothing to parse or the parser rejected the input.
 */
function retriever_load_into_dom($resource) {
	Logger::info('@@@ retriever_load_into_dom start');
	$encoding = retriever_get_encoding($resource);
	// NOTE(review): the 'HTML-ENTITIES' target is deprecated in recent PHP
	// releases - confirm against the supported PHP versions and plan a replacement.
	$content = mb_convert_encoding($resource['data'], 'HTML-ENTITIES', $encoding);
	$doc = new DOMDocument('1.0', 'UTF-8');

	// The DOM loaders reject an empty source, so skip parsing instead of
	// tripping over an empty or failed conversion result.
	if (!is_string($content) || $content === '') {
		Logger::info('retriever_load_into_dom: no content to parse');
		Logger::info('@@@ retriever_load_into_dom end');
		return $doc;
	}

	// Collect libxml diagnostics internally rather than emitting PHP warnings.
	// Unlike a leading "@", this covers both loaders and leaves unrelated
	// errors visible.
	$previous = libxml_use_internal_errors(true);
	if (strpos((string) $resource['type'], 'html') !== false) {
		$loaded = $doc->loadHTML($content);
	}
	else {
		$loaded = $doc->loadXML($content);
	}
	libxml_clear_errors();
	libxml_use_internal_errors($previous);

	if (!$loaded) {
		Logger::info('retriever_load_into_dom: parser rejected the content');
	}
	Logger::info('@@@ retriever_load_into_dom end');
	return $doc;
}
/**
 * Reduce a document to the parts selected by a retriever rule.
 *
 * Two optional transformations are applied in order: the include/exclude
 * template rendered from 'extract.tpl', then the rule's custom XSLT.
 *
 * @param mixed $doc       Document to transform, as accepted by retriever_apply_xslt_text().
 * @param array $retriever Retriever rule; its 'data' entry holds the settings
 *                         ('include' and 'customxslt' are read here).
 * @return mixed The transformed document, or whatever falsy value
 *               retriever_apply_xslt_text() produced when a step failed.
 */
function retriever_extract($doc, $retriever) {
	Logger::info('@@@ retriever_extract start');
	$spec = $retriever['data'];

	// A rule may carry only 'customxslt', so 'include' has to be probed
	// without assuming the key exists.
	if (!empty($spec['include'])) {
		// Render the template only when it is actually going to be applied.
		$extract_template = Renderer::getMarkupTemplate('extract.tpl', 'addon/retriever/');
		$extract_xslt = Renderer::replaceMacros($extract_template, ['$spec' => $spec]);
		Logger::debug('retriever_extract: applying include/exclude template "' . $extract_xslt . '"');
		$doc = retriever_apply_xslt_text($extract_xslt, $doc);
	}

	// Do not feed an already-failed document into the second transformation;
	// the caller reports the failure when it sees a falsy result.
	if ($doc && !empty($spec['customxslt'])) {
		Logger::debug('retriever_extract: applying custom XSLT "' . $spec['customxslt'] . '"');
		$doc = retriever_apply_xslt_text($spec['customxslt'], $doc);
	}

	Logger::info('@@@ retriever_extract end');
	return $doc;
}
/**
 * Rewrite the URLs in a document using the 'fix-urls.tpl' XSLT template.
 *
 * The template receives the origin ("$rooturl") and the directory
 * ("$dirurl") of the URL the resource was finally fetched from.
 *
 * @param mixed $doc      Document to transform, as accepted by retriever_apply_xslt_text().
 * @param array $resource Resource row; the 'redirect-url' entry is read.
 * @return mixed The transformed document, the unchanged document when no base
 *               URL could be derived, or the falsy result of a failed transformation.
 */
function retriever_globalise_urls($doc, $resource) {
	Logger::info('@@@ retriever_globalise_urls start');
	$bases = retriever_url_bases(array_key_exists('redirect-url', $resource) ? $resource['redirect-url'] : '');
	if ($bases === null) {
		// Without a scheme and host there is nothing sensible to prefix, so
		// leave the URLs as they are rather than producing "://..." links.
		Logger::info('retriever_globalise_urls: no usable base URL, leaving URLs untouched');
		Logger::info('@@@ retriever_globalise_urls end');
		return $doc;
	}
	list($rooturl, $dirurl) = $bases;
	$params = ['$dirurl' => $dirurl, '$rooturl' => $rooturl];
	$fix_urls_template = Renderer::getMarkupTemplate('fix-urls.tpl', 'addon/retriever/');
	$fix_urls_xslt = Renderer::replaceMacros($fix_urls_template, $params);
	$doc = retriever_apply_xslt_text($fix_urls_xslt, $doc);
	Logger::info('@@@ retriever_globalise_urls end');
	return $doc;
}

/**
 * Derive the origin and the directory URL of an absolute URL.
 *
 * @param string $url Absolute URL.
 * @return array|null [root URL without trailing slash, directory URL with
 *                    trailing slash], or null when scheme or host is missing.
 */
function retriever_url_bases($url) {
	$components = parse_url((string) $url);
	if (!is_array($components) || empty($components['scheme']) || empty($components['host'])) {
		return null;
	}
	$rooturl = $components['scheme'] . '://' . $components['host'];
	if (isset($components['port'])) {
		// Keep a non-default port, otherwise rewritten links point at the wrong origin.
		$rooturl .= ':' . $components['port'];
	}
	$path = isset($components['path']) ? $components['path'] : '/';
	$last_slash = strrpos($path, '/');
	// Keep everything up to and including the last slash. Unlike dirname(),
	// this yields "/" for top-level paths and keeps "/a/b/" intact.
	$dirpath = ($last_slash === false) ? '/' : substr($path, 0, $last_slash + 1);
	return [$rooturl, $rooturl . $dirpath];
}
function retrieve_images(&$item, $a) { function retrieve_images(&$item, $a) {
// Note that $item doesn't necessarily contain all the fields you would expect, in particular 'id' // Note that $item doesn't necessarily contain all the fields you would expect, in particular 'id'
//@@@ doe sit contain uri-id? //@@@ it definitely does not
Logger::debug('@@@ retrieve_images start item '. $item['id'] . ' uri ' . $item['uri'] . ' uri id ' . $item['uri-id'] . ' plink ' . $item['plink'] . ' guid ' . $item['guid']); Logger::debug('@@@ retrieve_images start item id '. (array_key_exists('id', $item) ? $item['id'] : 'undef') . ' uri ' . $item['uri'] . ' uri id ' . $item['uri-id'] . ' plink ' . $item['plink'] . ' guid ' . $item['guid']);
$uri_id = ItemURI::getIdByURI($item['uri']); //@@@ why can't I get this from the item itself? $uri_id = ItemURI::getIdByURI($item['uri']); //@@@ why can't I get this from the item itself?
$content = DBA::selectFirst('item-content', ['body'], ['uri-id' => $uri_id]); $content = DBA::selectFirst('item-content', ['body'], ['uri-id' => $uri_id]);
@ -584,7 +572,6 @@ function retrieve_images(&$item, $a) {
return; return;
} }
Logger::info('@@@ retrieve_images looking in body "' . $body . '"');
// I suspect that the first two are not used any more? // I suspect that the first two are not used any more?
preg_match_all("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", $item["body"], $matches1); preg_match_all("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", $item["body"], $matches1);
preg_match_all("/\[img\](.*?)\[\/img\]/ism", $item["body"], $matches2); preg_match_all("/\[img\](.*?)\[\/img\]/ism", $item["body"], $matches2);
@ -592,9 +579,7 @@ function retrieve_images(&$item, $a) {
$matches = array_merge($matches1[3], $matches2[1], $matches3[1]); $matches = array_merge($matches1[3], $matches2[1], $matches3[1]);
Logger::debug('retrieve_images: found ' . count($matches) . ' images for item ' . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id']); Logger::debug('retrieve_images: found ' . count($matches) . ' images for item ' . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id']);
foreach ($matches as $url) { foreach ($matches as $url) {
Logger::debug('@@@ retrieve_images: url ' . $url);
if (strpos($url, get_app()->getBaseUrl()) === FALSE) { if (strpos($url, get_app()->getBaseUrl()) === FALSE) {
Logger::debug('@@@ retrieve_images: about to add_retriever_resource uid ' . $item['uid'] . ' cid ' . $item['contact-id']);
$resource = add_retriever_resource($a, $url, $item['uid'], $item['contact-id'], true); $resource = add_retriever_resource($a, $url, $item['uid'], $item['contact-id'], true);
if (!$resource['completed']) { if (!$resource['completed']) {
add_retriever_item($item, $resource); add_retriever_item($item, $resource);
@ -604,12 +589,11 @@ function retrieve_images(&$item, $a) {
} }
} }
} }
Logger::info('@@@ retrieve_images end');
} }
function retriever_check_item_completed(&$item) function retriever_check_item_completed(&$item)
{ {
Logger::debug('@@@ 9 item class is ' . retriever_class_of_item($item) . ' ' . mat_test($item)); // TODO: figure out how to do this with DBA module
$r = q('SELECT count(*) FROM retriever_item WHERE `item-uri` = "%s" ' . $r = q('SELECT count(*) FROM retriever_item WHERE `item-uri` = "%s" ' .
'AND `item-uid` = %d AND `contact-id` = %d AND `finished` = 0', 'AND `item-uid` = %d AND `contact-id` = %d AND `finished` = 0',
DBA::escape($item['uri']), intval($item['uid']), DBA::escape($item['uri']), intval($item['uid']),
@ -620,12 +604,7 @@ function retriever_check_item_completed(&$item)
$item['visible'] = $waiting ? 0 : 1; $item['visible'] = $waiting ? 0 : 1;
if (array_key_exists('id', $item) && ($item['id'] > 0) && ($old_visible != $item['visible'])) { if (array_key_exists('id', $item) && ($item['id'] > 0) && ($old_visible != $item['visible'])) {
Logger::debug('retriever_check_item_completed: changing visible flag to ' . $item['visible']); Logger::debug('retriever_check_item_completed: changing visible flag to ' . $item['visible']);
q("UPDATE `item` SET `visible` = %d WHERE `id` = %d", Item::update(['visible' => 0], ['id' => intval($item['id'])]);
intval($item['visible']),
intval($item['id']));
q("UPDATE `thread` SET `visible` = %d WHERE `iid` = %d",
intval($item['visible']),
intval($item['id']));
} }
} }
@ -647,11 +626,8 @@ function retriever_apply_completed_resource_to_item($retriever, &$item, $resourc
} }
} }
//@@@ todo: change all Logger::info t etc
//@@@ todo: what is this reference for? document if needed delete if not //@@@ todo: what is this reference for? document if needed delete if not
function retriever_transform_images($a, &$item, $resource) { function retriever_transform_images($a, &$item, $resource) {
Logger::debug('@@@ 11 item class is ' . retriever_class_of_item($item) . ' ' . mat_test($item));
Logger::info('@@@ retriever_transform_images');
if (!$resource['data']) { if (!$resource['data']) {
Logger::info('retriever_transform_images: no data available for ' . $resource['id'] . ' ' . $resource['url']); Logger::info('retriever_transform_images: no data available for ' . $resource['id'] . ' ' . $resource['url']);
return; return;
@ -659,7 +635,6 @@ function retriever_transform_images($a, &$item, $resource) {
$uri_id = ItemURI::getIdByURI($item['uri']); //@@@ why can't I get this from the item itself? $uri_id = ItemURI::getIdByURI($item['uri']); //@@@ why can't I get this from the item itself?
try { //@@@ probably can get rid of this try/catch
$data = $resource['data']; $data = $resource['data'];
$type = $resource['type']; $type = $resource['type'];
$uid = $item['uid']; $uid = $item['uid'];
@ -668,23 +643,17 @@ function retriever_transform_images($a, &$item, $resource) {
$path = parse_url($resource['url'], PHP_URL_PATH); $path = parse_url($resource['url'], PHP_URL_PATH);
$parts = pathinfo($path); $parts = pathinfo($path);
$filename = $parts['filename'] . (array_key_exists('extension', $parts) ? '.' . $parts['extension'] : ''); $filename = $parts['filename'] . (array_key_exists('extension', $parts) ? '.' . $parts['extension'] : '');
Logger::info('@@@ retriever_transform_images url ' . $resource['url'] . ' path ' . $path . ' filename ' . $parts['filename']);
$album = 'Wall Photos'; $album = 'Wall Photos';
$scale = 0; $scale = 0;
$desc = ''; // TODO: store alt text with resource when it's requested so we can fill this in $desc = ''; // TODO: store alt text with resource when it's requested so we can fill this in
Logger::debug('retriever_transform_images storing ' . strlen($data) . ' bytes type ' . $type . ': uid ' . $uid . ' cid ' . $cid . ' rid ' . $rid . ' filename ' . $filename . ' album ' . $album . ' scale ' . $scale . ' desc ' . $desc); Logger::debug('retriever_transform_images storing ' . strlen($data) . ' bytes type ' . $type . ': uid ' . $uid . ' cid ' . $cid . ' rid ' . $rid . ' filename ' . $filename . ' album ' . $album . ' scale ' . $scale . ' desc ' . $desc);
Logger::info('@@@ retriever_transform_images before new Image');
$image = new Image($data, $type); $image = new Image($data, $type);
Logger::info('@@@ retriever_transform_images after new Image');
if (!$image->isValid()) { if (!$image->isValid()) {
Logger::warning('retriever_transform_images: invalid image found at URL ' . $resource['url'] . ' for item ' . $item['id']); Logger::warning('retriever_transform_images: invalid image found at URL ' . $resource['url'] . ' for item ' . $item['id']);
return; return;
} }
Logger::info('@@@ retriever_transform_images before Photo::store');
$photo = Photo::store($image, $uid, $cid, $rid, $filename, $album, 0, 0, "", "", "", "", $desc); $photo = Photo::store($image, $uid, $cid, $rid, $filename, $album, 0, 0, "", "", "", "", $desc);
Logger::info('@@@ retriever_transform_images after Photo::store');
$new_url = System::baseUrl() . '/photo/' . $rid . '-0.' . $image->getExt(); $new_url = System::baseUrl() . '/photo/' . $rid . '-0.' . $image->getExt();
Logger::info('@@@ retriever_transform_images new url ' . $new_url . ' rid ' . $rid . ' ext ' . $image->getExt());
if (!strlen($new_url)) { if (!strlen($new_url)) {
Logger::warning('retriever_transform_images: no replacement URL for image ' . $resource['url']); Logger::warning('retriever_transform_images: no replacement URL for image ' . $resource['url']);
return; return;
@ -692,18 +661,11 @@ function retriever_transform_images($a, &$item, $resource) {
$content = DBA::selectFirst('item-content', ['body'], ['uri-id' => $uri_id]); $content = DBA::selectFirst('item-content', ['body'], ['uri-id' => $uri_id]);
$body = $content['body']; $body = $content['body'];
Logger::info('@@@ retriever_transform_images: found body for uri id ' . $uri_id . ': ' . $body);
Logger::debug('retriever_transform_images: replacing ' . $resource['url'] . ' with ' . $new_url . ' in item ' . $item['uri']); Logger::debug('retriever_transform_images: replacing ' . $resource['url'] . ' with ' . $new_url . ' in item ' . $item['uri']);
Logger::debug('@@@ retriever_transform_images: replacing ' . $resource['url'] . ' with ' . $new_url . ' in body ' . $body);
$body = str_replace($resource["url"], $new_url, $body); $body = str_replace($resource["url"], $new_url, $body);
Logger::info('@@@ retriever_transform_images: result \"' . $body . '\"');
Item::update(['body' => $body], ['uri-id' => $uri_id]); Item::update(['body' => $body], ['uri-id' => $uri_id]);
} catch (Exception $e) {
Logger::info('retriever_transform_images caught exception ' . $e->getMessage());
return;
}
} }
function retriever_content($a) { function retriever_content($a) {
@ -712,6 +674,7 @@ function retriever_content($a) {
return; return;
} }
if ($a->argv[1] === 'help') { if ($a->argv[1] === 'help') {
//@@@ fix me
$feeds = q("SELECT `id`, `name`, `thumb` FROM contact WHERE `uid` = %d AND `network` = 'feed'", $feeds = q("SELECT `id`, `name`, `thumb` FROM contact WHERE `uid` = %d AND `network` = 'feed'",
local_user()); local_user());
foreach ($feeds as $k=>$v) { foreach ($feeds as $k=>$v) {
@ -729,7 +692,7 @@ function retriever_content($a) {
if (!empty($_POST["id"])) { if (!empty($_POST["id"])) {
$retriever_rule = get_retriever_rule($a->argv[1], local_user(), true); $retriever_rule = get_retriever_rule($a->argv[1], local_user(), true);
$retriever_rule['data'] = array(); $retriever_rule['data'] = array();
foreach (array('pattern', 'replace', 'enable', 'images', 'customxslt', 'storecookies', 'cookiedata') as $setting) { foreach (array('modurl', 'pattern', 'replace', 'enable', 'images', 'customxslt', 'storecookies', 'cookiedata') as $setting) {
if (empty($_POST['retriever_' . $setting])) { if (empty($_POST['retriever_' . $setting])) {
$retriever_rule['data'][$setting] = NULL; $retriever_rule['data'][$setting] = NULL;
} }
@ -753,6 +716,7 @@ function retriever_content($a) {
unset($retriever_rule['data']['exclude'][$k]); unset($retriever_rule['data']['exclude'][$k]);
} }
} }
//@@@ fix me
q("UPDATE `retriever_rule` SET `data`='%s' WHERE `id` = %d", q("UPDATE `retriever_rule` SET `data`='%s' WHERE `id` = %d",
DBA::escape(json_encode($retriever_rule['data'])), intval($retriever_rule["id"])); DBA::escape(json_encode($retriever_rule['data'])), intval($retriever_rule["id"]));
$a->page['content'] .= "<p><b>Settings Updated"; $a->page['content'] .= "<p><b>Settings Updated";
@ -769,6 +733,11 @@ function retriever_content($a) {
'retriever_enable', 'retriever_enable',
L10n::t('Enabled'), L10n::t('Enabled'),
$retriever_rule['data']['enable']), $retriever_rule['data']['enable']),
'$modurl' => array(
'retriever_modurl',
L10n::t('Modify URL'),
$retriever_rule['data']['modurl'],
L10n::t("Modify each article's URL with regular expressions before retrieving.")),
'$pattern' => array( '$pattern' => array(
'retriever_pattern', 'retriever_pattern',
L10n::t('URL Pattern'), L10n::t('URL Pattern'),
@ -832,7 +801,8 @@ function retriever_contact_photo_menu($a, &$args) {
} }
function retriever_post_remote_hook(&$a, &$item) { function retriever_post_remote_hook(&$a, &$item) {
Logger::info('@@@ 12 item class is ' . retriever_class_of_item($item) . ' ' . mat_test($item)); // Note that $item doesn't necessarily contain all the fields you would expect, in particular 'id'
Logger::info('retriever_post_remote_hook: ' . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id']); Logger::info('retriever_post_remote_hook: ' . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id']);
$uri_id = ItemURI::getIdByURI($item['uri']); //@@@ why can't I get this from the item itself? $uri_id = ItemURI::getIdByURI($item['uri']); //@@@ why can't I get this from the item itself?
@ -845,14 +815,13 @@ function retriever_post_remote_hook(&$a, &$item) {
// Convert to HTML and back to take advantage of bbcode's resolution of oembeds. // Convert to HTML and back to take advantage of bbcode's resolution of oembeds.
$content = DBA::selectFirst('item-content', [], ['uri-id' => $uri_id]); $content = DBA::selectFirst('item-content', [], ['uri-id' => $uri_id]);
$body = HTML::toBBCode(BBCode::convert($content['body'])); $body = HTML::toBBCode(BBCode::convert($content['body']));
Logger::debug('@@@ retriever_post_remote_hook item uri-id ' . $uri_id . ' body "' . $item['body'] . '" item content body "' . $body . '"');
if ($body) { if ($body) {
$item['body'] = $body; $item['body'] = $body;
Item::update(['body' => $body], ['uri-id' => $uri_id]); Item::update(['body' => $body], ['uri-id' => $uri_id]);
} }
} }
if (PConfig::get($item["uid"], 'retriever', 'all_photos')) { if (PConfig::get($item["uid"], 'retriever', 'all_photos')) {
retrieve_images($item, $a); retrieve_images($item, $a); //@@@ backwards
} }
} }
retriever_check_item_completed($item); retriever_check_item_completed($item);

View file

@ -41,6 +41,25 @@ function retriever_remove_row(id, number)
tbody.removeChild(row); tbody.removeChild(row);
} }
/**
 * Show or hide the "URL Pattern" and "Replace" form fields depending on
 * whether the "Modify URL" checkbox (#id_retriever_modurl) is ticked.
 *
 * The display style is applied to the parent node of each input
 * (#id_retriever_pattern and #id_retriever_replace) — presumably the
 * wrapper element produced by the field template, so that the label is
 * hidden together with the input; confirm against field_input.tpl.
 *
 * Called once on DOMContentLoaded and again on every change of the checkbox.
 */
function retriever_toggle_url_block()
{
    // Read the checkbox once; both dependent fields share the same state.
    var display = document.querySelector("#id_retriever_modurl").checked ? "block" : "none";

    // Same order as before: pattern first, then replace.
    ["#id_retriever_pattern", "#id_retriever_replace"].forEach(function (selector) {
        document.querySelector(selector).parentNode.style.display = display;
    });
}
function retriever_toggle_cookiedata_block() function retriever_toggle_cookiedata_block()
{ {
var div = document.querySelector("#id_retriever_cookiedata").parentNode; var div = document.querySelector("#id_retriever_cookiedata").parentNode;
@ -53,6 +72,8 @@ function retriever_toggle_cookiedata_block()
} }
document.addEventListener('DOMContentLoaded', function() { document.addEventListener('DOMContentLoaded', function() {
retriever_toggle_url_block();
document.querySelector("#id_retriever_modurl").addEventListener('change', retriever_toggle_url_block, false);
retriever_toggle_cookiedata_block(); retriever_toggle_cookiedata_block();
document.querySelector("#id_retriever_storecookies").addEventListener('change', retriever_toggle_cookiedata_block, false); document.querySelector("#id_retriever_storecookies").addEventListener('change', retriever_toggle_cookiedata_block, false);
}, false); }, false);
@ -62,10 +83,6 @@ document.addEventListener('DOMContentLoaded', function() {
<form method="post"> <form method="post">
<input type="hidden" name="id" value="{{$id}}"> <input type="hidden" name="id" value="{{$id}}">
{{include file="field_checkbox.tpl" field=$enable}} {{include file="field_checkbox.tpl" field=$enable}}
{{include file="field_input.tpl" field=$pattern}}
{{include file="field_input.tpl" field=$replace}}
{{include file="field_checkbox.tpl" field=$images}}
{{include file="field_input.tpl" field=$retrospective}}
<h3>{{$include_t}}:</h3> <h3>{{$include_t}}:</h3>
<div> <div>
<table> <table>
@ -98,7 +115,7 @@ document.addEventListener('DOMContentLoaded', function() {
<div> <div>
<table> <table>
<thead> <thead>
<tr><th>Tag</th><th>Attribute</th><th>Value</th></tr> <tr><th>{{$tag_t}}</th><th>{{$attribute_t}}</th><th>{{$value_t}}</th></tr>
</thead> </thead>
<tbody id="retriever-exclude"> <tbody id="retriever-exclude">
{{if $exclude}} {{if $exclude}}
@ -122,9 +139,14 @@ document.addEventListener('DOMContentLoaded', function() {
</table> </table>
<input type="button" onclick="retriever_add_row('retriever-exclude')" value="{{$add_t}}"> <input type="button" onclick="retriever_add_row('retriever-exclude')" value="{{$add_t}}">
</div> </div>
{{include file="field_checkbox.tpl" field=$modurl}}
{{include file="field_input.tpl" field=$pattern}}
{{include file="field_input.tpl" field=$replace}}
{{include file="field_checkbox.tpl" field=$images}}
{{include file="field_textarea.tpl" field=$customxslt}} {{include file="field_textarea.tpl" field=$customxslt}}
{{include file="field_checkbox.tpl" field=$storecookies}} {{include file="field_checkbox.tpl" field=$storecookies}}
{{include file="field_textarea.tpl" field=$cookiedata}} {{include file="field_textarea.tpl" field=$cookiedata}}
{{include file="field_input.tpl" field=$retrospective}}
<input type="submit" size="70" value="{{$submit_t}}"> <input type="submit" size="70" value="{{$submit_t}}">
</form> </form>
</div> </div>