2019-07-07 15:45:23 +02:00
<?php
/**
2019-09-30 08:25:00 +02:00
* Name : Retriever
2019-07-07 15:45:23 +02:00
* Description: Follow the permalink of RSS/Atom feed items and replace the summary with the full content.
2019-09-30 08:25:00 +02:00
* Version : 1.0
2019-07-07 15:45:23 +02:00
* Author: Matthew Exon <http://mat.exon.name>
*/
use Friendica\Core\Addon ;
use Friendica\Core\Config ;
use Friendica\Core\PConfig ;
2019-07-20 15:37:57 +02:00
use Friendica\Core\Logger ;
2019-07-20 15:45:10 +02:00
use Friendica\Core\Renderer ;
2019-09-22 11:47:30 +02:00
use Friendica\Core\System ;
2019-07-07 15:45:23 +02:00
use Friendica\Content\Text\HTML ;
use Friendica\Content\Text\BBCode ;
2019-09-22 11:47:30 +02:00
use Friendica\Model\Photo ;
2019-07-07 15:45:23 +02:00
use Friendica\Object\Image ;
use Friendica\Util\Network ;
use Friendica\Core\L10n ;
use Friendica\Database\DBA ;
2019-07-21 19:27:14 +02:00
use Friendica\Model\ItemURI ;
2019-09-22 11:47:30 +02:00
use Friendica\Model\Item ;
2019-07-07 15:45:23 +02:00
/**
 * Installs the addon: registers its hooks and migrates the database schema.
 *
 * Migrations are keyed on the `retriever.dbversion` config value and applied
 * one after another, so an older installation is walked forward step by step.
 * Whenever the version is not '0.14' afterwards, every statement found in
 * database.sql is executed and the version is set to '0.14'.
 */
function retriever_install() {
    $file = 'addon/retriever/retriever.php';

    Addon::registerHook('plugin_settings', $file, 'retriever_plugin_settings');
    Addon::registerHook('plugin_settings_post', $file, 'retriever_plugin_settings_post');
    Addon::registerHook('post_remote', $file, 'retriever_post_remote_hook');
    Addon::registerHook('contact_photo_menu', $file, 'retriever_contact_photo_menu');
    Addon::registerHook('cron', $file, 'retriever_cron');

    if (Config::get('retriever', 'dbversion') == '0.10') {
        q('ALTER TABLE `retriever_resource` MODIFY COLUMN `type` char(255) NULL DEFAULT NULL');
        q('ALTER TABLE `retriever_resource` MODIFY COLUMN `data` mediumblob NULL DEFAULT NULL');
        q('ALTER TABLE `retriever_rule` MODIFY COLUMN `data` mediumtext NULL DEFAULT NULL');
        Config::set('retriever', 'dbversion', '0.11');
    }
    if (Config::get('retriever', 'dbversion') == '0.11') {
        q('ALTER TABLE `retriever_resource` ADD INDEX `url` (`url`)');
        q('ALTER TABLE `retriever_resource` ADD INDEX `completed` (`completed`)');
        q('ALTER TABLE `retriever_item` ADD INDEX `finished` (`finished`)');
        q('ALTER TABLE `retriever_item` ADD INDEX `item-uid` (`item-uid`)');
        Config::set('retriever', 'dbversion', '0.12');
    }
    if (Config::get('retriever', 'dbversion') == '0.12') {
        q(" ALTER TABLE `retriever_resource` ADD COLUMN `contact-id` int(10) unsigned NOT NULL DEFAULT '0' AFTER `id` ");
        q(" ALTER TABLE `retriever_resource` ADD COLUMN `item-uid` int(10) unsigned NOT NULL DEFAULT '0' AFTER `id` ");
        Config::set('retriever', 'dbversion', '0.13');
    }
    if (Config::get('retriever', 'dbversion') == '0.13') {
        // NOTE(review): dbversion is not advanced here, so the block below
        // also runs for 0.13 installations - confirm that is intended.
        Config::set('retriever', 'downloads_per_cron', '100');
    }
    if (Config::get('retriever', 'dbversion') != '0.14') {
        $schema = file_get_contents(__DIR__ . '/database.sql');
        if ($schema === false) {
            Logger::warning('Unable to read database schema file database.sql');
            return;
        }
        foreach (explode(';', $schema) as $statement) {
            // explode() yields a blank fragment after the final ';' (and for
            // any stray ';;'); executing it would be reported as a failure.
            if (trim($statement) === '') {
                continue;
            }
            if (!DBA::e($statement)) {
                Logger::warning('Unable to create database table: ' . DBA::errorMessage());
                return;
            }
        }
        Config::set('retriever', 'downloads_per_cron', '100');
        Config::set('retriever', 'dbversion', '0.14');
    }
}
/**
 * Uninstalls the addon by unregistering every hook it registered.
 *
 * Each hook is unregistered exactly once; the original listed
 * 'plugin_settings' and 'plugin_settings_post' twice.
 */
function retriever_uninstall() {
    $file = 'addon/retriever/retriever.php';

    // hook name => callback function
    $hooks = [
        'plugin_settings'      => 'retriever_plugin_settings',
        'plugin_settings_post' => 'retriever_plugin_settings_post',
        'post_remote'          => 'retriever_post_remote_hook',
        'contact_photo_menu'   => 'retriever_contact_photo_menu',
        'cron'                 => 'retriever_cron',
    ];

    foreach ($hooks as $hook => $callback) {
        Addon::unregisterHook($hook, $file, $callback);
    }
}
/**
 * Module entry point. It has no parameters, performs no work and returns
 * nothing.
 */
function retriever_module()
{
    return;
}
2019-09-29 20:59:14 +02:00
/**
 * Renders the admin settings form of the addon and appends it to the output.
 *
 * @param mixed  $a Passed by reference; not used in this function
 * @param string $o Output buffer the rendered admin.tpl template is appended to
 */
function retriever_addon_admin(&$a, &$o) {
    $current_value = Config::get('retriever', 'downloads_per_cron');
    $tpl = Renderer::getMarkupTemplate('admin.tpl', 'addon/retriever/');

    // Field descriptor handed to the template: name, label, value, help text
    $field = [
        'downloads_per_cron',
        L10n::t('Downloads per Cron'),
        $current_value,
        L10n::t('Maximum number of downloads to attempt during each run of the cron job.'),
    ];

    $macros = [
        '$downloads_per_cron' => $field,
        '$submit'             => L10n::t('Save Settings'),
    ];

    $o .= Renderer::replaceMacros($tpl, $macros);
}
/**
 * Stores the admin form's "downloads per cron" value.
 *
 * Only a positive integer is accepted. Anything else is ignored, because the
 * stored value is later used in arithmetic and as a SQL LIMIT by the cron code.
 *
 * @param mixed $a Not used in this function
 */
function retriever_addon_admin_post($a) {
    if (empty($_POST['downloads_per_cron'])) {
        return;
    }

    $downloads = filter_var(
        $_POST['downloads_per_cron'],
        FILTER_VALIDATE_INT,
        ['options' => ['min_range' => 1]]
    );
    if ($downloads === false) {
        Logger::warning('retriever_addon_admin_post: ignoring invalid downloads_per_cron value');
        return;
    }

    // Stored as a string, like the value written by retriever_install()
    Config::set('retriever', 'downloads_per_cron', (string) $downloads);
}
2019-07-07 15:45:23 +02:00
/**
 * Cron hook: cleans up, downloads pending resources and tidies old rows.
 *
 * @param mixed $a Passed on to the clean-up and retrieval steps
 * @param mixed $b Not used in this function
 */
function retriever_cron($a, $b) {
    $limit = Config::get('retriever', 'downloads_per_cron');

    // The clean-up must run before the retrieval, otherwise it can interfere
    // with retriever_retrieve_items
    retriever_clean_up_completed_resources($limit, $a);
    retriever_retrieve_items($limit, $a);

    retriever_tidy();
}
// Number of resources retrieved so far; incremented by
// retriever_retrieve_items(), which reads it via `global`.
// NOTE(review): if this file is included from inside a function, this
// assignment creates a local rather than a global variable - confirm.
$retriever_item_count = 0 ;
/**
 * Downloads pending resources within a budget, using a retry back-off schedule.
 *
 * A resource is eligible when it is not completed and either was never tried
 * or its last try is older than the delay assigned to its current `num-tries`
 * value. Resources whose `num-tries` is beyond the schedule are not retried.
 * The number of resources handled so far is kept in the global
 * $retriever_item_count.
 *
 * @param int   $max_items Maximum number of resources to retrieve
 * @param mixed $a         Not used in this function
 */
function retriever_retrieve_items($max_items, $a) {
    global $retriever_item_count;

    // The counter may be unset if the file-level initialisation did not run
    // in global scope; treat that as zero.
    $retriever_item_count = intval($retriever_item_count);

    // Entry N is the minimum wait after N previous tries
    $retriever_schedule = [
        [1, 'minute'],
        [10, 'minute'],
        [1, 'hour'],
        [1, 'day'],
        [2, 'day'],
        [1, 'week'],
        [1, 'month'],
    ];

    $schedule_clauses = [];
    foreach ($retriever_schedule as $tries => $delay) {
        list($num, $unit) = $delay;
        $schedule_clauses[] =
            '(`num-tries` = ' . $tries . ' AND TIMESTAMPADD(' . DBA::escape($unit) .
            ', ' . intval($num) . ', `last-try`) < now())';
    }
    // implode() needs the separator first: the legacy (array, separator)
    // order was removed in PHP 8. The fragment is built only from the
    // constants above, so it is not passed through DBA::escape().
    $schedule_sql = implode(' OR ', $schedule_clauses);

    $retrieve_items = intval($max_items) - $retriever_item_count;
    do {
        Logger::debug('retriever_retrieve_items: asked for maximum ' . $max_items . ', already retrieved ' . $retriever_item_count . ', retrieve ' . $retrieve_items);

        // Nothing left in the budget: do not send a LIMIT <= 0 query
        if ($retrieve_items <= 0) {
            break;
        }

        // TODO: figure out how to do this with DBA module
        $retriever_resources = q(" SELECT * FROM `retriever_resource` WHERE `completed` IS NULL AND (`last-try` IS NULL OR %s) ORDER BY `last-try` ASC LIMIT %d ",
            $schedule_sql,
            intval($retrieve_items));
        if (!is_array($retriever_resources) || count($retriever_resources) == 0) {
            break;
        }

        Logger::debug('retriever_retrieve_items: found ' . count($retriever_resources) . ' waiting resources in database');
        foreach ($retriever_resources as $retriever_resource) {
            retrieve_resource($retriever_resource);
            $retriever_item_count++;
        }
        $retrieve_items = intval($max_items) - $retriever_item_count;
    } while ($retrieve_items > 0);

    Logger::debug('retriever_retrieve_items: finished retrieving items');
}
2019-07-07 15:45:23 +02:00
2019-09-29 22:05:49 +02:00
// Look for items that are waiting even though the resource has completed. This shouldn't happen, but is worth cleaning up if it does.
2019-09-22 11:47:30 +02:00
/**
 * Finishes retriever items that are still waiting although their resource
 * has already completed.
 *
 * For every such item the completed resource is applied to the item, the
 * retriever item is marked as finished and the item's completion is checked.
 *
 * @param int   $max_items Maximum number of waiting items to process
 * @param mixed $a         Passed on to retriever_apply_completed_resource_to_item()
 */
function retriever_clean_up_completed_resources($max_items, $a) {
    // TODO: figure out how to do this with DBA module
    $r = q('SELECT retriever_resource.`id` as resource, retriever_item.`id` as item FROM retriever_resource, retriever_item, retriever_rule WHERE retriever_item.`finished` = 0 AND retriever_item.`resource` = retriever_resource.`id` AND retriever_resource.`completed` IS NOT NULL AND retriever_item.`contact-id` = retriever_rule.`contact-id` AND retriever_item.`item-uid` = retriever_rule.`uid` LIMIT %d',
        intval($max_items));
    if (!$r || !is_array($r)) {
        $r = [];
    }
    Logger::debug('retriever_clean_up_completed_resources: items waiting even though resource has completed: ' . count($r));

    foreach ($r as $rr) {
        $retriever_item = retriever_get_retriever_item($rr['item']);
        if (!DBA::isResult($retriever_item)) {
            Logger::warning('retriever_clean_up_completed_resources: no retriever item with id ' . $rr['item']);
            continue;
        }

        $item = retriever_get_item($retriever_item);
        if (!$item) {
            Logger::warning('retriever_clean_up_completed_resources: no item ' . $retriever_item['item-uri']);
            continue;
        }

        $retriever_rule = get_retriever_rule($retriever_item['contact-id'], $item['uid']);
        if (!$retriever_rule) {
            // The retriever_item column is `item-uid`; there is no `uid` key
            Logger::warning('retriever_clean_up_completed_resources: no retriever for uri ' . $retriever_item['item-uri'] . ' uid ' . $retriever_item['item-uid'] . ' ' . $retriever_item['contact-id']);
            continue;
        }

        $resource = DBA::selectFirst('retriever_resource', [], ['id' => intval($rr['resource'])]);
        if (!DBA::isResult($resource)) {
            // The row may have been removed since the query above ran
            Logger::warning('retriever_clean_up_completed_resources: no resource with id ' . $rr['resource']);
            continue;
        }

        retriever_apply_completed_resource_to_item($retriever_rule, $item, $resource, $a);
        Logger::info('@@@ retriever_clean_up_completed_resources tried to update id ' . $retriever_item['id'] . ' to finished, better check that it really worked!');
        DBA::update('retriever_item', ['finished' => 1], ['id' => intval($retriever_item['id'])], ['finished' => 0]);
        retriever_check_item_completed($item);
    }
}
/**
 * Removes stale rows: resources completed more than a week ago, resources
 * created more than three months ago that never completed, and retriever
 * items whose resource no longer exists.
 */
function retriever_tidy() {
    // TODO: figure out how to do this with DBA module @@@ it is possible
    q(" DELETE FROM retriever_resource WHERE completed IS NOT NULL AND completed < DATE_SUB(now(), INTERVAL 1 WEEK) ");
    q(" DELETE FROM retriever_resource WHERE completed IS NULL AND created < DATE_SUB(now(), INTERVAL 3 MONTH) ");

    $r = q(" SELECT retriever_item.id FROM retriever_item LEFT OUTER JOIN retriever_resource ON (retriever_item.resource = retriever_resource.id) WHERE retriever_resource.id is null ");
    // q() does not return an array when the query fails; count() and
    // foreach need one.
    if (!is_array($r)) {
        $r = [];
    }

    Logger::info('retriever_tidy: found ' . count($r) . ' retriever_items with no retriever_resource');
    foreach ($r as $rr) {
        q('DELETE FROM retriever_item WHERE id = %d', intval($rr['id']));
    }
}
/**
 * Completes a resource whose URL is a base64 "data:" URL by decoding it.
 *
 * The resource is marked as completed whether or not the URL could be parsed,
 * because retrying cannot change the outcome.
 *
 * @param array $resource Row of the retriever_resource table
 */
function retrieve_dataurl_resource($resource) {
    $matches = [];
    // The scheme is "data:"; the original pattern said "date:" and therefore
    // never matched.
    if (!preg_match('/data:(.*);base64,(.*)/', $resource['url'], $matches)) {
        Logger::info('retrieve_dataurl_resource: ' . $resource['id'] . ' does not match pattern');
    } else {
        $resource['type'] = $matches[1];
        $resource['data'] = base64url_decode($matches[2]);
    }

    // Succeed or fail, there's no point retrying
    q(" UPDATE `retriever_resource` SET `last-try` = now(), `num-tries` = `num-tries` + 1, `completed` = now(), `data` = '%s', `type` = '%s' WHERE id = %d ",
        DBA::escape($resource['data']),
        DBA::escape($resource['type']),
        intval($resource['id']));

    // $a was undefined here in the original; fetch the app the same way
    // retrieve_resource() does.
    retriever_resource_completed($resource, get_app());
}
/**
 * Attempts one download of a resource and records the outcome.
 *
 * "data:" URLs are delegated to retrieve_dataurl_resource(). Otherwise the
 * URL is fetched, optionally with the cookies stored in the matching
 * retriever rule. The try counter, HTTP code and redirect URL are always
 * written back; when a body was received the resource is marked completed
 * and retriever_resource_completed() is called.
 *
 * @param array $resource Row of the retriever_resource table
 */
function retrieve_resource($resource) {
    if (substr($resource['url'], 0, 5) == 'data:') {
        return retrieve_dataurl_resource($resource);
    }

    $a = get_app();

    // There may be no rule for this contact/user, and a rule may carry no data
    $retriever_rule = get_retriever_rule($resource['contact-id'], $resource['item-uid']);
    $rule_data = [];
    if (is_array($retriever_rule) && isset($retriever_rule['data']) && is_array($retriever_rule['data'])) {
        $rule_data = $retriever_rule['data'];
    }
    $store_cookies = !empty($rule_data['storecookies']);

    $cookiejar = '';
    try {
        Logger::debug('retrieve_resource: ' . ($resource['num-tries'] + 1) . ' attempt at resource ' . $resource['id'] . ' ' . $resource['url']);
        $redirects = 0;
        if ($store_cookies) {
            $cookiejar = tempnam(get_temppath(), 'cookiejar-retriever-');
            if ($cookiejar === false) {
                Logger::info('retrieve_resource: unable to create cookie jar, continuing without cookies');
                $cookiejar = '';
                $store_cookies = false;
            } else {
                file_put_contents($cookiejar, isset($rule_data['cookiedata']) ? $rule_data['cookiedata'] : '');
            }
        }
        $fetch_result = Network::fetchUrlFull($resource['url'], $resource['binary'], $redirects, '', $cookiejar);
        if ($store_cookies) {
            $retriever_rule['data']['cookiedata'] = file_get_contents($cookiejar);
            DBA::update('retriever_rule', ['data' => json_encode($retriever_rule['data'])], ['id' => intval($retriever_rule['id'])], $retriever_rule);
            //@@@ check the update worked
        }
        $resource['data'] = $fetch_result->getBody();
        $resource['http-code'] = $fetch_result->getReturnCode();
        $resource['type'] = $fetch_result->getContentType();
        $resource['redirect-url'] = $fetch_result->getRedirectUrl();
        Logger::debug('retrieve_resource: got code ' . $resource['http-code'] . ' retrieving resource ' . $resource['id'] . ' final url ' . $resource['redirect-url']);
    } catch (Exception $e) {
        Logger::info('retrieve_resource: unable to retrieve ' . $resource['url'] . ' - ' . $e->getMessage());
    } finally {
        // Remove the temporary cookie jar even when the fetch threw
        if ($cookiejar !== '' && file_exists($cookiejar)) {
            unlink($cookiejar);
        }
    }

    // TODO: figure out how to do this with DBA module
    q(" UPDATE `retriever_resource` SET `last-try` = now(), `num-tries` = `num-tries` + 1, `http-code` = %d, `redirect-url` = '%s' WHERE id = %d ",
        intval($resource['http-code']),
        DBA::escape($resource['redirect-url']),
        intval($resource['id']));

    if ($resource['data']) {
        // TODO: figure out how to do this with DBA module
        q(" UPDATE `retriever_resource` SET `completed` = now(), `data` = '%s', `type` = '%s' WHERE id = %d ",
            DBA::escape($resource['data']),
            DBA::escape($resource['type']),
            intval($resource['id']));
        retriever_resource_completed($resource, $a);
    }
}
2019-09-22 11:47:30 +02:00
/**
 * Loads the retriever rule of a contact/user pair, optionally creating it.
 *
 * The rule's `data` column is JSON; it is always returned decoded as an
 * array (empty when nothing could be decoded), both for existing and for
 * freshly created rules.
 *
 * @param int  $contact_id Contact id of the rule
 * @param int  $uid        User id of the rule
 * @param bool $create     Insert an empty rule when none exists
 * @return array|null The rule row with decoded `data`, or null when there is none
 */
function get_retriever_rule($contact_id, $uid, $create = false) {
    $condition = ['contact-id' => intval($contact_id), 'uid' => intval($uid)];

    $retriever_rule = DBA::selectFirst('retriever_rule', [], $condition);
    if (!DBA::isResult($retriever_rule) && $create) {
        DBA::insert('retriever_rule', ['uid' => intval($uid), 'contact-id' => intval($contact_id)]);
        //@@@ check that this worked
        $retriever_rule = DBA::selectFirst('retriever_rule', [], $condition);
    }
    if (!DBA::isResult($retriever_rule)) {
        return null;
    }

    // Callers index into `data` with array_key_exists(), so it must be an array
    $raw = isset($retriever_rule['data']) ? (string) $retriever_rule['data'] : '';
    $decoded = json_decode($raw, true);
    $retriever_rule['data'] = is_array($decoded) ? $decoded : [];

    return $retriever_rule;
}
/**
 * Fetches one row of the retriever_item table by its id.
 *
 * @param int $id Row id (cast with intval() before use)
 * @return mixed Whatever DBA::selectFirst() returns for that id
 */
function retriever_get_retriever_item($id) {
    $condition = ['id' => intval($id)];

    return DBA::selectFirst('retriever_item', [], $condition);
}
2019-07-07 15:45:23 +02:00
/**
 * Looks up the item a retriever item refers to, by URI, user and contact.
 *
 * @param array $retriever_item Row with 'item-uri', 'item-uid' and 'contact-id'
 * @return array|null The item, or null (after logging a warning) when not found
 */
function retriever_get_item($retriever_item) {
    $condition = [
        'uri'        => $retriever_item['item-uri'],
        'uid'        => intval($retriever_item['item-uid']),
        'contact-id' => intval($retriever_item['contact-id']),
    ];

    $found = Item::selectFirst([], $condition);
    if (DBA::isResult($found)) {
        return $found;
    }

    Logger::warning('retriever_get_item: no item found for uri ' . $retriever_item['item-uri']);
    return;
}
2019-10-08 18:55:34 +02:00
/**
 * Applies a completed resource to the item behind one retriever item and
 * marks that retriever item as finished.
 *
 * @param mixed $a                 Passed on to retriever_apply_completed_resource_to_item()
 * @param int   $retriever_item_id Id of the retriever_item row
 * @param array $resource          The completed retriever_resource row
 */
function retriever_item_completed($a, $retriever_item_id, $resource) {
    Logger::debug('retriever_item_completed: id ' . $retriever_item_id . ' url ' . $resource['url']);

    $row = retriever_get_retriever_item($retriever_item_id);
    if (!DBA::isResult($row)) {
        Logger::info('retriever_item_completed: no retriever item with id ' . $retriever_item_id);
        return;
    }

    $item = retriever_get_item($row);
    if (!$item) {
        Logger::warning('retriever_item_completed: no item ' . $row['item-uri']);
        return;
    }

    // Note: the retriever might be null. Doesn't matter.
    $rule = get_retriever_rule($row['contact-id'], $row['item-uid']);
    retriever_apply_completed_resource_to_item($rule, $item, $resource, $a);

    $row_id = intval($row['id']);
    DBA::update('retriever_item', ['finished' => 1], ['id' => $row_id], ['finished' => 0]);

    retriever_check_item_completed($item);
}
/**
 * Notifies every retriever item that references a resource that the
 * resource has completed.
 *
 * @param array $resource The completed retriever_resource row
 * @param mixed $a        Passed on to retriever_item_completed()
 */
function retriever_resource_completed($resource, $a) {
    Logger::debug('retriever_resource_completed: id ' . $resource['id'] . ' url ' . $resource['url']);

    $waiting = DBA::selectToArray('retriever_item', ['id'], ['resource' => intval($resource['id'])]);
    foreach ($waiting as $row) {
        retriever_item_completed($a, $row['id'], $resource);
    }
}
/**
 * Re-runs retrieval for items of the retriever's contact that already exist.
 *
 * Each selected item is made invisible, its retriever items and their
 * resources are deleted, and the item is then queued again through
 * retriever_on_item_insert().
 *
 * @param mixed $a         Passed on to retriever_on_item_insert()
 * @param array $retriever Retriever rule; its 'contact-id' selects the items
 * @param int   $num       Limit on the number of items selected
 */
function apply_retrospective($a, $retriever, $num) {
    $item_condition = ['contact-id' => intval($retriever['contact-id'])];
    $item_params = ['order' => ['received' => true], 'limit' => $num];

    $items = Item::selectToArray([], $item_condition, $item_params);
    foreach ($items as $item) {
        Item::update(['visible' => 0], ['id' => intval($item['id'])]);
        //@@@ check that this works

        $previous = DBA::selectToArray('retriever_item', [], [
            'item-uri'   => $item['uri'],
            'item-uid'   => $item['uid'],
            'contact-id' => $item['contact-id'],
        ]);
        foreach ($previous as $old) {
            DBA::delete('retriever_resource', ['id' => $old['resource']]);
            DBA::delete('retriever_item', ['id' => $old['id']]);
        }

        retriever_on_item_insert($a, $retriever, $item);
    }
}
2019-09-29 20:59:14 +02:00
// TODO: Currently this waits until the next cron before actually downloading. Should do it immediately.
// TODO: This queries then inserts. It should use some kind of lock to avoid requesting the same resource twice.
2019-07-07 15:45:23 +02:00
/**
 * Queues the download of an item's permalink when the retriever rule is enabled.
 *
 * The URL is taken from the item's 'plink', or from the item-content row of
 * its 'uri-id'. When the rule has 'modurl' set, the URL is rewritten with the
 * rule's 'pattern' and 'replace'. A resource and a retriever item are then
 * registered for the URL.
 *
 * @param mixed $a         Passed on to add_retriever_resource()
 * @param array $retriever Retriever rule with decoded 'data'
 * @param array $item      The item (by reference; not modified here)
 */
function retriever_on_item_insert($a, $retriever, &$item) {
    if (!$retriever || empty($retriever['id'])) {
        Logger::info('retriever_on_item_insert: No retriever supplied');
        return;
    }

    $data = (isset($retriever['data']) && is_array($retriever['data'])) ? $retriever['data'] : [];

    // The original tested `!$enable == "on"`, which parses as
    // `(!$enable) == "on"` and is effectively a truthiness test. That
    // behaviour is kept, but spelled out.
    if (empty($data['enable'])) {
        return;
    }

    if (array_key_exists('plink', $item) && strlen($item['plink'])) {
        $url = $item['plink'];
    } else {
        if (!array_key_exists('uri-id', $item)) {
            Logger::warning('retriever_on_item_insert: item ' . $item['id'] . ' has no plink and no uri-id');
            return;
        }
        $content = DBA::selectFirst('item-content', [], ['uri-id' => $item['uri-id']]);
        if (!DBA::isResult($content) || empty($content['plink'])) {
            Logger::warning('retriever_on_item_insert: no plink found for uri-id ' . $item['uri-id']);
            return;
        }
        $url = $content['plink'];
    }

    if (!empty($data['modurl']) && isset($data['pattern'], $data['replace'])) {
        $orig_url = $url;
        // preg_replace() returns null when the user-supplied pattern is invalid
        $modified = preg_replace('/' . $data['pattern'] . '/', $data['replace'], $orig_url);
        if ($modified === null) {
            Logger::warning('retriever_on_item_insert: invalid URL pattern, keeping ' . $orig_url);
        } else {
            $url = $modified;
            Logger::debug('retriever_on_item_insert: Changed ' . $orig_url . ' to ' . $url);
        }
    }

    $resource = add_retriever_resource($a, $url, $item['uid'], $item['contact-id']);
    if (!DBA::isResult($resource)) {
        Logger::warning('retriever_on_item_insert: could not register resource for ' . $url);
        return;
    }
    Logger::debug('@@@ check this makes sense: ' . $resource['id'] . ' url ' . $resource['url']);

    add_retriever_item($item, $resource);
}
2019-09-22 11:47:30 +02:00
/**
 * Registers a URL as a resource to retrieve and returns its database row.
 *
 * "data:" URLs are decoded immediately, stored as already completed under an
 * "md5://" pseudo URL, and reported through retriever_resource_completed().
 * Any other URL is stored as pending unless the same url/uid/contact
 * combination has already been requested.
 *
 * @param mixed  $a      Passed on to retriever_resource_completed()
 * @param string $url    URL to retrieve
 * @param int    $uid    User id the resource belongs to
 * @param int    $cid    Contact id the resource belongs to
 * @param bool   $binary Stored in the `binary` column as 1 or 0
 * @return mixed The retriever_resource row; a falsy value when it could not be created
 */
function add_retriever_resource($a, $url, $uid, $cid, $binary = false) {
    Logger::debug('add_retriever_resource: url ' . $url . ' uid ' . $uid . ' contact-id ' . $cid);

    $scheme = parse_url($url, PHP_URL_SCHEME);
    if ($scheme == 'data') {
        $fp = fopen($url, 'r');
        if ($fp === false) {
            Logger::warning('add_retriever_resource: unable to open data URL');
            return null;
        }
        $meta = stream_get_meta_data($fp);
        $type = $meta['mediatype'];
        $data = stream_get_contents($fp);
        fclose($fp);

        // The data URL itself may be very long; key the row on its hash
        $url = 'md5://' . hash('md5', $url);
        $condition = ['url' => $url, 'item-uid' => intval($uid), 'contact-id' => intval($cid)];

        $resource = DBA::selectFirst('retriever_resource', [], $condition);
        if (DBA::isResult($resource)) {
            Logger::debug('add_retriever_resource: Resource ' . $url . ' already requested');
            return $resource;
        }

        Logger::debug('add_retriever_resource: got data URL type ' . $type);
        // TODO: figure out how to do this with DBA module
        q(" INSERT INTO `retriever_resource` (`item-uid`, `contact-id`, `type`, `binary`, `url`, `completed`, `data`) " .
            " VALUES (%d, %d, '%s', %d, '%s', now(), '%s') ",
            intval($uid),
            intval($cid),
            DBA::escape($type),
            intval($binary ? 1 : 0),
            DBA::escape($url),
            DBA::escape($data));

        $resource = DBA::selectFirst('retriever_resource', [], $condition);
        if (DBA::isResult($resource)) {
            retriever_resource_completed($resource, $a);
        }
        return $resource;
    }

    if (strlen($url) > 800) {
        Logger::warning('add_retriever_resource: URL is longer than 800 characters');
    }

    $condition = ['url' => $url, 'item-uid' => intval($uid), 'contact-id' => intval($cid)];

    $resource = DBA::selectFirst('retriever_resource', [], $condition);
    if (DBA::isResult($resource)) {
        Logger::debug('add_retriever_resource: Resource ' . $url . ' uid ' . $uid . ' cid ' . $cid . ' already requested');
        return $resource;
    }

    Logger::debug('@@@ add_retriever_resource inserting resource ' . $url . ' uid ' . $uid . ' cid ' . $cid);
    // These columns belong to retriever_resource (the original inserted
    // into retriever_rule).
    DBA::insert('retriever_resource', [
        'item-uid'   => intval($uid),
        'contact-id' => intval($cid),
        'binary'     => ($binary ? 1 : 0),
        'url'        => $url,
    ]);
    //@@@ check the insert worked
    return DBA::selectFirst('retriever_resource', [], $condition);
}
/**
 * Links an item to a resource by creating a retriever_item row, unless the
 * same link already exists.
 *
 * @param array $item     The item (by reference; not modified here)
 * @param array $resource Row of the retriever_resource table
 * @return mixed Id of the created retriever_item row; nothing when the link
 *               already existed or could not be created
 */
function add_retriever_item(&$item, $resource) {
    if (!is_array($resource) || empty($resource['id'])) {
        Logger::warning('add_retriever_item: no resource supplied for ' . $item['uri']);
        return;
    }

    Logger::debug('add_retriever_item: ' . $resource['url'] . ' for ' . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id']);

    // The same four columns identify an existing link and make up the new row
    $link = [
        'item-uri'   => $item['uri'],
        'item-uid'   => intval($item['uid']),
        'contact-id' => intval($item['contact-id']),
        'resource'   => intval($resource['id']),
    ];

    if (DBA::isResult(DBA::selectFirst('retriever_item', ['id'], $link))) {
        Logger::info(" add_retriever_item: retriever item already present for " . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id']);
        return;
    }

    DBA::insert('retriever_item', $link);

    // Newest matching row first, in case a concurrent insert produced a duplicate
    $created = DBA::selectFirst('retriever_item', ['id'], $link, ['order' => ['id' => true]]);
    if (!DBA::isResult($created)) {
        Logger::info(" add_retriever_item: couldn't create retriever item for " . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id']);
        return;
    }

    Logger::debug('add_retriever_item: created retriever_item ' . $created['id'] . ' for item ' . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id']);
    return $created['id'];
}
/**
 * Extracts the character set from a resource's content type.
 *
 * Only the charset token itself is returned: surrounding quotes and any
 * parameters following it (e.g. "; boundary=...") are ignored, and the
 * "charset" keyword is matched case-insensitively.
 *
 * @param array $resource Resource whose 'type' holds the Content-Type value
 * @return string The charset, or 'utf-8' when none is given
 */
function retriever_get_encoding($resource) {
    // The type may be missing or NULL
    $type = isset($resource['type']) ? (string) $resource['type'] : '';

    $matches = [];
    if (preg_match('/charset\s*=\s*["\']?([^\s;"\']+)/i', $type, $matches)) {
        return trim($matches[1]);
    }
    return 'utf-8';
}
/**
 * Transforms a DOM document with an XSLT stylesheet given as text.
 *
 * @param string $xslt_text The stylesheet source
 * @param mixed  $doc       The document to transform
 * @return mixed The transformed document; $doc unchanged when the stylesheet
 *               is empty or is not well-formed XML; false when the stylesheet
 *               cannot be imported, $doc is not a DOM node, or the
 *               transformation fails
 */
function retriever_apply_xslt_text($xslt_text, $doc) {
    if (!$xslt_text) {
        Logger::info('retriever_apply_xslt_text: empty XSLT text');
        return $doc;
    }

    $xslt_doc = new DOMDocument();
    if (!$xslt_doc->loadXML($xslt_text)) {
        Logger::info('retriever_apply_xslt_text: could not load XML');
        return $doc;
    }
    Logger::debug('@@@ retriever_apply_xslt_text: ' . $xslt_text);

    // A previous transformation may have failed and handed us false
    if (!($doc instanceof DOMNode)) {
        Logger::info('retriever_apply_xslt_text: no document to transform');
        return false;
    }

    $xp = new XSLTProcessor();
    if (!$xp->importStylesheet($xslt_doc)) {
        Logger::info('retriever_apply_xslt_text: could not import stylesheet');
        return false;
    }

    return $xp->transformToDoc($doc);
}
2019-10-02 07:19:59 +02:00
//@@@ I think this is supposed to update the $item, but it doesn't
2019-07-07 15:45:23 +02:00
/**
 * Extracts the wanted content from a retrieved resource and stores it as the
 * body of the item.
 *
 * The resource is loaded into a DOM, filtered by the retriever's
 * include/exclude rules and custom XSLT, its URLs are made absolute, and the
 * result is converted to BBCode with a "Retrieved" footer linking to the
 * item's plink. The new body is written to the database; $item itself is not
 * modified here.
 *
 * @param array|null $retriever Retriever rule with decoded 'data'; may be null
 * @param array      $item      The item (by reference)
 * @param array      $resource  The completed retriever_resource row
 */
function retriever_apply_dom_filter($retriever, &$item, $resource) {
    // 'id' and 'uri-id' are not always present on the item
    $item_id = isset($item['id']) ? $item['id'] : '';
    $item_uri_id = isset($item['uri-id']) ? $item['uri-id'] : '';
    Logger::debug('retriever_apply_dom_filter: applying XSLT to ' . $item_id . ' ' . $item['uri'] . ' contact ' . $item['contact-id'] . ' uri-id ' . $item_uri_id);

    // The retriever may be null and its data may be undecodable
    $data = (is_array($retriever) && isset($retriever['data']) && is_array($retriever['data'])) ? $retriever['data'] : [];
    if (!array_key_exists('include', $data) && !array_key_exists('customxslt', $data)) {
        Logger::info('retriever_apply_dom_filter: no include and no customxslt');
        return;
    }
    if (!$resource['data']) {
        Logger::info('retriever_apply_dom_filter: no text to work with');
        return;
    }

    $doc = retriever_load_into_dom($resource);
    $doc = retriever_extract($doc, $retriever);
    if (!$doc) {
        Logger::info('retriever_apply_dom_filter: failed to apply extract XSLT template');
        return;
    }
    $doc = retriever_globalise_urls($doc, $resource);
    if (!$doc) {
        Logger::info('retriever_apply_dom_filter: failed to apply fix urls XSLT template');
        return;
    }

    $body = HTML::toBBCode($doc->saveHTML());
    if (!strlen($body)) {
        Logger::info('retriever_apply_dom_filter retriever ' . $retriever['id'] . ' item ' . $item_id . ': output was empty');
        return;
    }
    $body .= " \n \n " . L10n::t('Retrieved') . ' ' . date(" Y-m-d ") . ': [url=';
    $body .= $item['plink'];
    $body .= ']' . $item['plink'] . '[/url]';

    $uri_id = ItemURI::getIdByURI($item['uri']); //@@@ why can't I get this from the item itself? Consider using item['id'] instead
    Logger::debug('retriever_apply_dom_filter: XSLT result \"' . $body . '\"');
    Item::update(['body' => $body], ['uri-id' => $uri_id]);
}
/**
 * Parses the data of a resource into a DOM document.
 *
 * The data is converted from the charset announced in the resource's type to
 * HTML entities, then parsed as HTML when the type mentions "html" and as XML
 * otherwise. When the announced charset is not supported, utf-8 is assumed.
 *
 * @param array $resource Resource with 'data' and 'type'
 * @return DOMDocument The parsed document
 */
function retriever_load_into_dom($resource) {
    Logger::info('@@@ retriever_load_into_dom start');

    $encoding = retriever_get_encoding($resource);
    // Remote servers may announce charsets mbstring does not know:
    // PHP 8 throws for those, older versions return false.
    try {
        $content = mb_convert_encoding($resource['data'], 'HTML-ENTITIES', $encoding);
    } catch (Throwable $e) {
        $content = false;
    }
    if ($content === false) {
        Logger::info('retriever_load_into_dom: unsupported encoding ' . $encoding . ', falling back to utf-8');
        $content = mb_convert_encoding($resource['data'], 'HTML-ENTITIES', 'utf-8');
    }

    // The type may be NULL
    $type = isset($resource['type']) ? (string) $resource['type'] : '';

    $doc = new DOMDocument('1.0', 'UTF-8');
    if (strpos($type, 'html') !== false) {
        // Parser warnings are suppressed for HTML only, as in the original
        @$doc->loadHTML($content);
    } else {
        $doc->loadXML($content);
    }

    Logger::info('@@@ retriever_load_into_dom end');
    return $doc;
}
2019-07-07 15:45:23 +02:00
2019-10-02 07:19:59 +02:00
/**
 * Applies the retriever's extraction rules to a document.
 *
 * When the rule has 'include' set, the extract.tpl XSLT template is rendered
 * from the rule data and applied. When it has 'customxslt' set, that
 * stylesheet is applied afterwards. The custom step is skipped when the
 * first step did not yield a document.
 *
 * @param mixed $doc       The document to filter
 * @param array $retriever Retriever rule with decoded 'data'
 * @return mixed The filtered document, or a falsy value when a transformation failed
 */
function retriever_extract($doc, $retriever) {
    Logger::info('@@@ retriever_extract start');

    $data = (isset($retriever['data']) && is_array($retriever['data'])) ? $retriever['data'] : [];

    // 'include' is not necessarily present when only 'customxslt' is configured
    if (!empty($data['include'])) {
        $extract_template = Renderer::getMarkupTemplate('extract.tpl', 'addon/retriever/');
        $extract_xslt = Renderer::replaceMacros($extract_template, ['$spec' => $data]);
        Logger::debug('retriever_extract: applying include/exclude template \"' . $extract_xslt . '\"');
        $doc = retriever_apply_xslt_text($extract_xslt, $doc);
    }

    // Do not feed a failed transformation result into the next one
    if ($doc && !empty($data['customxslt'])) {
        Logger::debug('retriever_extract: applying custom XSLT \"' . $data['customxslt'] . '\"');
        $doc = retriever_apply_xslt_text($data['customxslt'], $doc);
    }

    Logger::info('@@@ retriever_extract end');
    return $doc;
}
2019-07-07 15:45:23 +02:00
2019-10-02 07:19:59 +02:00
/**
 * Rewrites the URLs in a document through the fix-urls.tpl XSLT template.
 *
 * The template receives two parameters derived from the address the resource
 * was finally fetched from: the root URL (scheme, host and port) and the URL
 * of the directory containing the fetched document.
 *
 * @param mixed $doc      The document to rewrite
 * @param array $resource Resource with 'redirect-url' (falls back to 'url')
 * @return mixed The rewritten document; $doc unchanged when no base URL can
 *               be determined; a falsy value when the transformation failed
 */
function retriever_globalise_urls($doc, $resource) {
    Logger::info('@@@ retriever_globalise_urls start');

    $base = !empty($resource['redirect-url'])
        ? $resource['redirect-url']
        : (isset($resource['url']) ? $resource['url'] : '');

    $components = parse_url($base);
    if (!is_array($components) || empty($components['scheme']) || empty($components['host'])) {
        Logger::info('retriever_globalise_urls: cannot determine base URL from ' . $base);
        return $doc;
    }

    $rooturl = $components['scheme'] . '://' . $components['host'];
    if (isset($components['port'])) {
        $rooturl .= ':' . $components['port'];
    }

    // dirname() yields "/" for top-level paths (and may use "\" on Windows);
    // normalise so that $dirurl ends in exactly one slash.
    $path = isset($components['path']) ? $components['path'] : '/';
    $dirurl = $rooturl . rtrim(str_replace('\\', '/', dirname($path)), '/') . '/';

    $params = ['$dirurl' => $dirurl, '$rooturl' => $rooturl];
    $fix_urls_template = Renderer::getMarkupTemplate('fix-urls.tpl', 'addon/retriever/');
    $fix_urls_xslt = Renderer::replaceMacros($fix_urls_template, $params);
    $doc = retriever_apply_xslt_text($fix_urls_xslt, $doc);

    Logger::info('@@@ retriever_globalise_urls end');
    return $doc;
}
2019-10-08 18:55:34 +02:00
/**
 * Arrange for every remote image referenced in an item's body to be retrieved.
 *
 * Image URLs are collected from the [img] BBCode tags in $item['body'].  Each
 * URL that does not contain this node's base URL is registered with
 * add_retriever_resource().  If that resource is already completed it is
 * applied immediately via retriever_transform_images(); otherwise the item
 * is attached to it with add_retriever_item().
 *
 * Note that $item doesn't necessarily contain all the fields you would
 * expect, in particular 'id' and 'uri-id'.
 *
 * @param mixed $a    Application object, passed through to the helpers
 * @param array $item Item record; reads 'uri', 'body', 'uid', 'contact-id'
 *                    (and 'plink', 'guid' for logging)
 * @return void
 */
function retrieve_images($a, &$item) {
	Logger::debug('@@@ retrieve_images start item id ' . (array_key_exists('id', $item) ? $item['id'] : 'undef') . ' uri ' . $item['uri'] . ' uri id ' . (array_key_exists('uri-id', $item) ? $item['uri-id'] : 'undef') . ' plink ' . $item['plink'] . ' guid ' . $item['guid']);

	// 'uri-id' is not present on $item, so it has to be looked up from the URI
	$uri_id = ItemURI::getIdByURI($item['uri']);
	$content = DBA::selectFirst('item-content', ['body'], ['uri-id' => $uri_id]);
	if (!is_array($content) || !isset($content['body']) || !strlen($content['body'])) {
		Logger::warning('retrieve_images: no body for uri-id ' . $uri_id);
		return;
	}

	// NOTE(review): the stored body is only used for the emptiness check above;
	// the tags are searched for in $item['body'].  Confirm that is intended.
	// I suspect that the first two are not used any more?
	preg_match_all('/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism', $item['body'], $matches1);
	preg_match_all('/\[img\](.*?)\[\/img\]/ism', $item['body'], $matches2);
	preg_match_all('/\[img\=([^\]]*)\]([^[]*)\[\/img\]/ism', $item['body'], $matches3);

	$matches = array();
	foreach (array_merge($matches1[3], $matches2[1], $matches3[1]) as $url) {
		// The third pattern also matches [img=WIDTHxHEIGHT] tags and then
		// captures the dimensions instead of a URL; those are not images.
		if (!strlen($url) || preg_match('/^[0-9]*x[0-9]*$/', $url)) {
			continue;
		}
		$matches[] = $url;
	}
	// An image used several times in one post only needs to be handled once
	$matches = array_values(array_unique($matches));
	Logger::debug('retrieve_images: found ' . count($matches) . ' images for item ' . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id']);

	$base_url = get_app()->getBaseUrl();
	foreach ($matches as $url) {
		if (strpos($url, $base_url) !== false) {
			// Already hosted on this node
			continue;
		}
		$resource = add_retriever_resource($a, $url, $item['uid'], $item['contact-id'], true);
		Logger::debug('@@@ check this makes sense 2: ' . $resource['id'] . ' url ' . $resource['url']);
		if (!$resource['completed']) {
			add_retriever_item($item, $resource);
		}
		else {
			retriever_transform_images($a, $item, $resource);
		}
	}
}
/**
 * Show or hide an item depending on whether it is still waiting for resources.
 *
 * Counts the unfinished retriever_item rows for the item's uri / uid /
 * contact-id and sets $item['visible'] to 0 while any remain, 1 otherwise.
 * If the item has a positive 'id' and the flag changed, the change is also
 * written through Item::update().
 *
 * @param array $item Item record, modified in place; reads 'uri', 'uid',
 *                    'contact-id' and optionally 'id' and 'visible'
 * @return void
 */
function retriever_check_item_completed(&$item)
{
	$waiting = DBA::count('retriever_item', ['item-uri' => $item['uri'], 'item-uid' => intval($item['uid']), 'contact-id' => intval($item['contact-id']), 'finished' => 0]);
	Logger::debug('retriever_check_item_completed: item ' . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id'] . ' waiting for ' . $waiting . ' resources');

	// 'visible' is not guaranteed to be present on the incoming item
	$old_visible = array_key_exists('visible', $item) ? $item['visible'] : null;
	$item['visible'] = $waiting ? 0 : 1;

	// Loose comparison on purpose: the previous value may be a numeric string
	if (array_key_exists('id', $item) && ($item['id'] > 0) && ($old_visible != $item['visible'])) {
		Logger::debug('retriever_check_item_completed: changing visible flag to ' . $item['visible']);
		Item::update(['visible' => $item['visible']], ['id' => intval($item['id'])]);
	}
}
/**
 * Apply a downloaded resource to an item according to the resource's type.
 *
 * Resources whose type contains "image" are handed to
 * retriever_transform_images().  Resources whose type contains "html" or
 * "xml" are run through retriever_apply_dom_filter() and, when the rule's
 * 'images' setting is on, retrieve_images() - but only if a rule was given.
 *
 * @param array|mixed $retriever Retriever rule, or a falsy value if there is none
 * @param array       $item      Item record, passed by reference to the helpers
 * @param array       $resource  Resource record; reads 'type' and 'url'
 * @param mixed       $a         Application object, passed through to the helpers
 * @return void
 */
function retriever_apply_completed_resource_to_item($retriever, &$item, $resource, $a) {
	$rule_label = $retriever ? $retriever['id'] : 'none';
	Logger::debug('retriever_apply_completed_resource_to_item: retriever ' . $rule_label . ' resource ' . $resource['url'] . ' plink ' . $item['plink']);

	// Images are handled whether or not a rule exists
	$is_image = strpos($resource['type'], 'image') !== false;
	if ($is_image) {
		retriever_transform_images($a, $item, $resource);
	}

	if (!$retriever) {
		Logger::warning('retriever_apply_completed_resource_to_item: no retriever');
		return;
	}

	$is_markup = (strpos($resource['type'], 'html') !== false) || (strpos($resource['type'], 'xml') !== false);
	if (!$is_markup) {
		return;
	}

	retriever_apply_dom_filter($retriever, $item, $resource);
	if ($retriever['data']['images']) {
		retrieve_images($a, $item);
	}
}
2019-09-22 11:47:30 +02:00
//@@@ TODO: what is the by-reference $item parameter for? Document it if it is needed, remove the reference if not.
2019-07-07 15:45:23 +02:00
/**
 * Store a downloaded image as a local photo and point the item's body at it.
 *
 * The resource's data is stored through Photo::store() under a new photo
 * resource id in the "Wall Photos" album.  Every occurrence of the
 * resource's original URL in the stored item body ('item-content' table) is
 * then replaced with the local photo URL and written back via Item::update().
 *
 * $item is received by reference but is not modified by this function.
 *
 * @param mixed $a        Application object (unused here)
 * @param array $item     Item record; reads 'uri', 'uid', 'contact-id' and,
 *                        for logging only, 'id' when present
 * @param array $resource Resource record; reads 'data', 'type', 'url', 'id'
 * @return void
 */
function retriever_transform_images($a, &$item, $resource) {
	if (!$resource['data']) {
		Logger::info('retriever_transform_images: no data available for ' . $resource['id'] . ' ' . $resource['url']);
		return;
	}

	// 'uri-id' is not present on $item, so it has to be looked up from the URI
	$uri_id = ItemURI::getIdByURI($item['uri']);

	$data = $resource['data'];
	$type = $resource['type'];
	$uid = $item['uid'];
	$cid = $item['contact-id'];
	$rid = Photo::newResource();
	// parse_url() yields null for a URL without a path component
	$path = (string) parse_url($resource['url'], PHP_URL_PATH);
	$parts = pathinfo($path);
	$filename = $parts['filename'] . (array_key_exists('extension', $parts) ? '.' . $parts['extension'] : '');
	$album = 'Wall Photos';
	$scale = 0;
	$desc = ''; // TODO: store alt text with resource when it's requested so we can fill this in
	Logger::debug('retriever_transform_images storing ' . strlen($data) . ' bytes type ' . $type . ': uid ' . $uid . ' cid ' . $cid . ' rid ' . $rid . ' filename ' . $filename . ' album ' . $album . ' scale ' . $scale . ' desc ' . $desc);

	$image = new Image($data, $type);
	if (!$image->isValid()) {
		// The item does not necessarily have an 'id' yet
		Logger::warning('retriever_transform_images: invalid image found at URL ' . $resource['url'] . ' for item ' . (array_key_exists('id', $item) ? $item['id'] : 'undef'));
		return;
	}

	// Do not rewrite the body to point at a photo that was never stored.
	// NOTE(review): assumes Photo::store() returns a falsy value on failure - confirm.
	$stored = Photo::store($image, $uid, $cid, $rid, $filename, $album, $scale, 0, '', '', '', '', $desc);
	if (!$stored) {
		Logger::warning('retriever_transform_images: unable to store image ' . $resource['url']);
		return;
	}
	$new_url = System::baseUrl() . '/photo/' . $rid . '-0.' . $image->getExt();

	$content = DBA::selectFirst('item-content', ['body'], ['uri-id' => $uri_id]);
	if (!is_array($content) || !isset($content['body'])) {
		Logger::warning('retriever_transform_images: no body for uri-id ' . $uri_id);
		return;
	}

	Logger::debug('retriever_transform_images: replacing ' . $resource['url'] . ' with ' . $new_url . ' in item ' . $item['uri']);
	$body = str_replace($resource['url'], $new_url, $content['body']);

	Item::update(['body' => $body], ['uri-id' => $uri_id]);
}
/**
 * Render the retriever module pages.
 *
 * - Not logged in: appends a "please log in" paragraph.
 * - /retriever/help: renders help.tpl with the user's feed contacts.
 * - /retriever/<contact id>: if the request carries a POST 'id', rebuilds the
 *   rule's settings from the form, saves them as JSON in `retriever_rule` and
 *   optionally re-applies the rule to existing posts; then renders
 *   rule-config.tpl for the rule.
 *
 * Output is appended to $a->page['content'].
 *
 * @param mixed $a Application object; reads argv and getBaseUrl(), writes page['content']
 * @return void
 */
function retriever_content($a) {
	if (!local_user()) {
		$a->page['content'] .= "<p>Please log in</p>";
		return;
	}

	// The module may be called without any further path argument
	$arg = isset($a->argv[1]) ? $a->argv[1] : null;

	if ($arg === 'help') {
		$feeds = DBA::selectToArray('contact', ['id', 'name', 'thumb'], ['uid' => local_user(), 'network' => 'feed']);
		foreach ($feeds as $i => $feed) {
			$feeds[$i]['url'] = $a->getBaseUrl() . '/retriever/' . $feed['id'];
		}
		//@@@ this is broken (author's note; what exactly is broken is not recorded)
		$template = Renderer::getMarkupTemplate('/help.tpl', 'addon/retriever/');
		$a->page['content'] .= Renderer::replaceMacros($template, [
			'$config' => $a->getBaseUrl() . '/settings/addon',
			'$feeds' => $feeds,
		]);
		return;
	}

	if (!$arg) {
		return;
	}

	$retriever_rule = get_retriever_rule($arg, local_user(), false);

	if (!empty($_POST['id'])) {
		$retriever_rule = get_retriever_rule($arg, local_user(), true);

		// The form is the single source of truth: start from scratch
		$retriever_rule['data'] = [];
		foreach (['modurl', 'pattern', 'replace', 'enable', 'images', 'customxslt', 'storecookies', 'cookiedata'] as $setting) {
			$field = 'retriever_' . $setting;
			$retriever_rule['data'][$setting] = empty($_POST[$field]) ? null : $_POST[$field];
		}

		// Collect the include/exclude clauses, e.g. retriever-include-0-element
		foreach ($_POST as $k => $v) {
			if (preg_match('/retriever-(include|exclude)-(\d+)-(element|attribute|value)/', $k, $matches)) {
				$retriever_rule['data'][$matches[1]][intval($matches[2])][$matches[3]] = $v;
			}
		}

		// You've gotta have an element, even if it's just "*"
		foreach (['include', 'exclude'] as $kind) {
			if (empty($retriever_rule['data'][$kind])) {
				// No clause of this kind was submitted
				continue;
			}
			foreach ($retriever_rule['data'][$kind] as $k => $clause) {
				if (empty($clause['element'])) {
					unset($retriever_rule['data'][$kind][$k]);
				}
			}
		}

		//@@@ check that this works (author's note, concerns the fourth argument)
		DBA::update('retriever_rule', ['data' => json_encode($retriever_rule['data'])], ['id' => intval($retriever_rule['id'])], ['data' => '']);

		$a->page['content'] .= "<p><b>Settings Updated";
		// Cast before use: the value is echoed into the page, so it must never
		// contain anything but a number
		$num_posts = empty($_POST['retriever_retrospective']) ? 0 : intval($_POST['retriever_retrospective']);
		if ($num_posts > 0) {
			apply_retrospective($a, $retriever_rule, $num_posts);
			$a->page['content'] .= " and retrospectively applied to " . $num_posts . " posts";
		}
		$a->page['content'] .= ".</b></p>";
	}

	// A rule may not carry every setting; read them without raising notices
	$data = !empty($retriever_rule['data']) ? $retriever_rule['data'] : [];
	$setting = function ($key) use ($data) {
		return isset($data[$key]) ? $data[$key] : null;
	};

	$template = Renderer::getMarkupTemplate('/rule-config.tpl', 'addon/retriever/');
	$a->page['content'] .= Renderer::replaceMacros($template, [
		'$enable' => [
			'retriever_enable',
			L10n::t('Enabled'),
			$setting('enable')],
		'$modurl' => [
			'retriever_modurl',
			L10n::t('Modify URL'),
			$setting('modurl'),
			L10n::t("Modify each article's URL with regular expressions before retrieving.")],
		'$pattern' => [
			'retriever_pattern',
			L10n::t('URL Pattern'),
			$setting('pattern'),
			L10n::t('Regular expression matching part of the URL to replace')],
		'$replace' => [
			'retriever_replace',
			L10n::t('URL Replace'),
			$setting('replace'),
			L10n::t('Text to replace matching part of above regular expression')],
		'$images' => [
			'retriever_images',
			L10n::t('Download Images'),
			$setting('images')],
		'$retrospective' => [
			'retriever_retrospective',
			L10n::t('Retrospectively Apply'),
			'0',
			L10n::t('Reapply the rules to this number of posts')],
		// Key now carries the '$' prefix like every other macro in this list
		'$storecookies' => [
			'retriever_storecookies',
			L10n::t('Store cookies'),
			$setting('storecookies'),
			L10n::t("Preserve cookie data across fetches.")],
		'$cookiedata' => [
			'retriever_cookiedata',
			L10n::t('Cookie Data'),
			$setting('cookiedata'),
			L10n::t("Latest cookie data for this feed. Netscape cookie file format.")],
		'$customxslt' => [
			'retriever_customxslt',
			L10n::t('Custom XSLT'),
			$setting('customxslt'),
			L10n::t("When standard rules aren't enough, apply custom XSLT to the article")],
		'$title' => L10n::t('Retrieve Feed Content'),
		'$help' => $a->getBaseUrl() . '/retriever/help',
		'$help_t' => L10n::t('Get Help'),
		'$submit_t' => L10n::t('Submit'),
		'$submit' => L10n::t('Save Settings'),
		'$id' => (!empty($retriever_rule['id']) ? $retriever_rule['id'] : "create"),
		'$tag_t' => L10n::t('Tag'),
		'$attribute_t' => L10n::t('Attribute'),
		'$value_t' => L10n::t('Value'),
		'$add_t' => L10n::t('Add'),
		'$remove_t' => L10n::t('Remove'),
		'$include_t' => L10n::t('Include'),
		'$include' => $setting('include'),
		'$exclude_t' => L10n::t('Exclude'),
		'$exclude' => $setting('exclude'),
	]);
	return;
}
/**
 * Hook: add a "Retriever" entry to the photo menu of feed contacts.
 *
 * Contacts whose 'network' is not "feed" (or that carry no network at all)
 * are left untouched.
 *
 * @param mixed $a    Application object; getBaseUrl() is used to build the link
 * @param array $args Hook arguments, modified in place; reads
 *                    $args['contact']['network'] and $args['contact']['id'],
 *                    writes $args['menu']['retriever']
 * @return void
 */
function retriever_contact_photo_menu($a, &$args) {
	if (!$args) {
		return;
	}
	// The hook data is not guaranteed to carry a contact with a network
	if (!isset($args['contact']['network']) || $args['contact']['network'] !== 'feed') {
		return;
	}
	$args['menu']['retriever'] = [L10n::t('Retriever'), $a->getBaseUrl() . '/retriever/' . $args['contact']['id']];
}
/**
 * Hook: process a newly received remote item.
 *
 * If a retriever rule exists for the item's contact, the item is handed to
 * retriever_on_item_insert().  Otherwise the per-user 'oembed' and
 * 'all_photos' settings decide whether the stored body is round-tripped
 * through HTML (to resolve oembeds) and whether its images are retrieved.
 * Finally the item's visibility is brought up to date.
 *
 * Note that $item doesn't necessarily contain all the fields you would
 * expect, in particular 'id'.
 *
 * @param mixed $a    Application object, passed through to the helpers
 * @param array $item Item record, modified in place; reads 'uri', 'uid' and
 *                    'contact-id', may write 'body'
 * @return void
 */
function retriever_post_remote_hook(&$a, &$item) {
	Logger::info('retriever_post_remote_hook: ' . $item['uri'] . ' ' . $item['uid'] . ' ' . $item['contact-id']);

	// 'uri-id' is not present on $item, so it has to be looked up from the URI
	$uri_id = ItemURI::getIdByURI($item['uri']);
	$retriever_rule = get_retriever_rule($item['contact-id'], $item['uid'], false);
	if ($retriever_rule) {
		retriever_on_item_insert($a, $retriever_rule, $item);
	}
	else {
		if (PConfig::get($item['uid'], 'retriever', 'oembed')) {
			// Convert to HTML and back to take advantage of bbcode's resolution of oembeds.
			$content = DBA::selectFirst('item-content', ['body'], ['uri-id' => $uri_id]);
			// There may be no stored content for this uri-id
			if (is_array($content) && isset($content['body'])) {
				$body = HTML::toBBCode(BBCode::convert($content['body']));
				if ($body) {
					$item['body'] = $body;
					Item::update(['body' => $body], ['uri-id' => $uri_id]);
				}
			}
		}
		if (PConfig::get($item['uid'], 'retriever', 'all_photos')) {
			retrieve_images($a, $item);
		}
	}
	retriever_check_item_completed($item);
}
/**
 * Hook: append the retriever section to the addon settings page.
 *
 * Reads the logged-in user's 'all_photos' and 'oembed' settings and renders
 * them through settings.tpl.
 *
 * @param mixed  $a Application object; getBaseUrl() is used for the help link
 * @param string $s Settings page markup, appended to in place
 * @return void
 */
function retriever_plugin_settings(&$a, &$s) {
	$photos_setting = PConfig::get(local_user(), 'retriever', 'all_photos');
	$oembed_setting = PConfig::get(local_user(), 'retriever', 'oembed');

	$markup = Renderer::getMarkupTemplate('/settings.tpl', 'addon/retriever/');

	// Each checkbox is described as [field name, label, current value, help text]
	$photos_field = [
		'retriever_all_photos',
		L10n::t('All Photos'),
		$photos_setting,
		L10n::t('Check this to retrieve photos for all posts'),
	];
	$oembed_field = [
		'retriever_oembed',
		L10n::t('Resolve OEmbed'),
		$oembed_setting,
		L10n::t('Check this to attempt to retrieve embedded content for all posts - useful e.g. for Facebook posts'),
	];

	$macros = [
		'$allphotos' => $photos_field,
		'$oembed' => $oembed_field,
		'$submit' => L10n::t('Save Settings'),
		'$title' => L10n::t('Retriever Settings'),
		'$help' => $a->getBaseUrl() . '/retriever/help',
	];

	$s .= Renderer::replaceMacros($markup, $macros);
}
/**
 * Hook: save the retriever section of the addon settings page.
 *
 * For each of the 'all_photos' and 'oembed' settings, a non-empty submitted
 * value is stored for the logged-in user and a missing or empty value
 * deletes the setting.
 *
 * @param mixed $a    Application object (unused)
 * @param mixed $post Hook data (unused; the values are read from $_POST)
 * @return void
 */
function retriever_plugin_settings_post($a, $post) {
	foreach (['all_photos', 'oembed'] as $setting) {
		$field = 'retriever_' . $setting;
		// An unchecked checkbox is not submitted at all, so the key may be absent
		if (!empty($_POST[$field])) {
			PConfig::set(local_user(), 'retriever', $setting, $_POST[$field]);
		}
		else {
			PConfig::del(local_user(), 'retriever', $setting);
		}
	}
}