From 5ea8dca4c429d95bd998be61b3158500a6f12e8d Mon Sep 17 00:00:00 2001 From: Matthew Exon Date: Thu, 20 Jun 2024 20:31:07 +0100 Subject: [PATCH 1/5] Fix broken images that have been broken for ages --- retriever/retriever.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/retriever/retriever.php b/retriever/retriever.php index 66a44fdb..275e84e7 100644 --- a/retriever/retriever.php +++ b/retriever/retriever.php @@ -829,7 +829,7 @@ function retriever_transform_images(array &$item, array $resource) { Logger::error('retriever_transform_images: unable to store photo ' . $resource['url'] . ' error: ' . $e->getMessage()); return; } - $new_url = DI::baseUrl() . '/photo/' . $rid . '-0.' . $image->getExt(); + $new_url = DI::baseUrl() . '/photo/' . $rid . '-0' . $image->getExt(); if (!strlen($new_url)) { Logger::warning('retriever_transform_images: no replacement URL for image ' . $resource['url']); return; From 941818ffb852ed8a5820cd9c45b732fbeaf4e91a Mon Sep 17 00:00:00 2001 From: Matthew Exon Date: Thu, 20 Jun 2024 20:32:05 +0100 Subject: [PATCH 2/5] fix whitespace --- retriever/retriever.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/retriever/retriever.php b/retriever/retriever.php index 275e84e7..4f87fe1b 100644 --- a/retriever/retriever.php +++ b/retriever/retriever.php @@ -661,9 +661,9 @@ function retriever_extract(DOMDocument $doc, array $retriever) { */ function retriever_globalise_urls(DOMDocument $doc, array $resource) { $components = parse_url($resource['redirect-url']); - if (!array_key_exists('scheme', $components) || !array_key_exists('host', $components) || !array_key_exists('path', $components)) { + if (!array_key_exists('scheme', $components) || !array_key_exists('host', $components) || !array_key_exists('path', $components)) { return $doc; - } + } $rooturl = $components['scheme'] . "://" . $components['host']; $dirurl = $rooturl . dirname($components['path']) . "/"; $params = array('$dirurl' => $dirurl, '$rooturl' => $rooturl); From 58dc1ecef1bb6b31552e1e75062f9befca2986c4 Mon Sep 17 00:00:00 2001 From: Matthew Exon Date: Thu, 20 Jun 2024 20:32:52 +0100 Subject: [PATCH 3/5] globalise_urls works better when retrospectively applying --- retriever/retriever.php | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/retriever/retriever.php b/retriever/retriever.php index 4f87fe1b..639d96c6 100644 --- a/retriever/retriever.php +++ b/retriever/retriever.php @@ -660,7 +660,11 @@ function retriever_extract(DOMDocument $doc, array $retriever) { * @return DOMDocument New DOM document with global URLs */ function retriever_globalise_urls(DOMDocument $doc, array $resource) { - $components = parse_url($resource['redirect-url']); + $url = $resource['redirect-url']; + if ($url == "") { + $url = $resource['url']; + } + $components = parse_url($url); if (!array_key_exists('scheme', $components) || !array_key_exists('host', $components) || !array_key_exists('path', $components)) { return $doc; } From 7224eac3a32fc3592aa527a55c404e2c5490a932 Mon Sep 17 00:00:00 2001 From: Matthew Exon Date: Thu, 20 Jun 2024 20:33:32 +0100 Subject: [PATCH 4/5] globalise urls now handles relative urls --- retriever/templates/fix-urls.tpl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/retriever/templates/fix-urls.tpl b/retriever/templates/fix-urls.tpl index 248d4770..1d59938c 100644 --- a/retriever/templates/fix-urls.tpl +++ b/retriever/templates/fix-urls.tpl @@ -22,5 +22,10 @@ + + + + + From 12a9e9472f3b3461ef1236b0fa6e37ab80df5025 Mon Sep 17 00:00:00 2001 From: Matthew Exon Date: Thu, 20 Jun 2024 20:34:14 +0100 Subject: [PATCH 5/5] handle failed image urls better --- mailstream/mailstream.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mailstream/mailstream.php b/mailstream/mailstream.php index cfc665f0..d814e28c 100644 --- a/mailstream/mailstream.php +++ b/mailstream/mailstream.php @@ -239,6 +239,11 @@ function mailstream_do_images(array &$item, array &$attachments) $cookiejar = tempnam(System::getTempPath(), 'cookiejar-mailstream-'); try { $curlResult = DI::httpClient()->fetchFull($url, HttpClientAccept::DEFAULT, 0, $cookiejar); + if (!$curlResult->isSuccess()) { + Logger::debug('mailstream: fetch image url failed', [ + 'url' => $url, 'item_id' => $item['id'], 'return_code' => $curlResult->getReturnCode()]); + continue; + } } catch (InvalidArgumentException $e) { Logger::error('mailstream_do_images exception fetching url', ['url' => $url, 'item_id' => $item['id']]); continue;