From e11ee98986b6d79301cc194c8e665f927bc76f34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20K=C3=BCchler?= Date: Thu, 9 Jan 2025 13:57:41 +0100 Subject: [PATCH 1/3] Add Html Content Validation and actuality --- src/ConfluencePageContentDownloader.php | 26 +++++++++++++++-- src/Endpoint/Content.php | 6 +++- src/Endpoint/Download.php | 35 ++++++++++++++++++----- src/Endpoint/Dto/ConfluenceAttachment.php | 10 +++++++ 4 files changed, 67 insertions(+), 10 deletions(-) diff --git a/src/ConfluencePageContentDownloader.php b/src/ConfluencePageContentDownloader.php index 6920545..ba4a67f 100755 --- a/src/ConfluencePageContentDownloader.php +++ b/src/ConfluencePageContentDownloader.php @@ -8,6 +8,7 @@ use Artemeon\Confluence\Endpoint\Download; use Artemeon\Confluence\Endpoint\Dto\ConfluencePage; use Artemeon\Confluence\MacroReplacer\MacroReplacerInterface; +use DOMDocument; use Exception; class ConfluencePageContentDownloader @@ -25,6 +26,8 @@ public function __construct(Content $contentEndpoint, Download $downloadEndpoint public function downloadPageContent(ConfluencePage $page, bool $withAttachments = true): void { + $page = $this->repairPageContent($page); + try { foreach ($this->macroReplacers as $macroReplacer) { if ($macroReplacer instanceof MacroReplacerInterface) { @@ -42,10 +45,29 @@ public function downloadPageContent(ConfluencePage $page, bool $withAttachments foreach ($attachments as $attachment) { $this->downloadEndpoint->downloadAttachment($attachment); } - } catch (Exception $e) { - echo 'An error has occurred: ' . $e->getMessage(); + echo 'An error has occurred: '.$e->getMessage(); } } + private function repairPageContent(ConfluencePage $page): ConfluencePage + { + $previousLibxmlState = libxml_use_internal_errors(true); + + $domDocument = new DOMDocument(); + $domDocument->loadHTML($page->getContent()); + if (!$domDocument->validate()) { + $pageContent = ''; + foreach ($domDocument->getElementsByTagName('body')->item(0)->childNodes as $child) { + $pageContent .= $domDocument->saveHTML($child); + } + + $page->setContent($pageContent); + } + + libxml_clear_errors(); + libxml_use_internal_errors($previousLibxmlState); + + return $page; + } } diff --git a/src/Endpoint/Content.php b/src/Endpoint/Content.php index c59ba1a..b8911c1 100755 --- a/src/Endpoint/Content.php +++ b/src/Endpoint/Content.php @@ -94,7 +94,11 @@ public function findChildAttachments(string $pageId): array { $response = $this->client->get( 'wiki/rest/api/content/' . $pageId . '/child/attachment', - array_merge([], $this->auth->getAuthenticationArray()) + array_merge([ + 'query' => [ + 'expand' => 'history,history.lastUpdated' + ] + ], $this->auth->getAuthenticationArray()) ); if ($response->getStatusCode() === 200) { diff --git a/src/Endpoint/Download.php b/src/Endpoint/Download.php index 0b0f1e2..33d21cf 100755 --- a/src/Endpoint/Download.php +++ b/src/Endpoint/Download.php @@ -38,7 +38,7 @@ public function downloadPageContent(ConfluencePage $confluencePage, string $file return; } - $htmlFile = $this->downloadFolder . '/' . $fileName; + $htmlFile = $this->downloadFolder.'/'.$fileName; file_put_contents($htmlFile, $confluencePage->getContent()); } @@ -50,12 +50,33 @@ public function downloadAttachment(ConfluenceAttachment $attachment): void return; } - // Verwende den relativen Pfad aus der API, um das Attachment herunterzuladen - $attachmentContent = $this->client->get( - '/wiki/' . $attachment->findDownloadPath(), - array_merge([], $this->auth->getAuthenticationArray()) - )->getBody()->getContents(); + if ($this->shouldAttachmentBeUpdated($attachment)) { + // Verwende den relativen Pfad aus der API, um das Attachment herunterzuladen + $attachmentContent = $this->client->get( + '/wiki/'.$attachment->findDownloadPath(), + array_merge([], $this->auth->getAuthenticationArray()) + )->getBody()->getContents(); - file_put_contents($this->downloadFolder . '/' . $attachment->getTitle(), $attachmentContent); + file_put_contents($this->getAttachmentFilePath($attachment), $attachmentContent); + } + } + + private function getAttachmentFilePath(ConfluenceAttachment $attachment): string + { + return $this->downloadFolder.'/'.$attachment->getTitle(); + } + + private function shouldAttachmentBeUpdated(ConfluenceAttachment $attachment): bool + { + $filepath = $this->getAttachmentFilePath($attachment); + + if (file_exists($filepath)) { + $filemtime = filemtime($filepath); + if (is_int($filemtime)) { + return $filemtime < $attachment->getLastUpdated()->getTimestamp(); + } + } + + return true; } } diff --git a/src/Endpoint/Dto/ConfluenceAttachment.php b/src/Endpoint/Dto/ConfluenceAttachment.php index 70fefaf..2841d1f 100755 --- a/src/Endpoint/Dto/ConfluenceAttachment.php +++ b/src/Endpoint/Dto/ConfluenceAttachment.php @@ -4,17 +4,22 @@ namespace Artemeon\Confluence\Endpoint\Dto; +use DateTime; + class ConfluenceAttachment { private array $rawData; private string $title; + private ?DateTime $lastUpdated; + public function __construct(array $rawData) { $this->rawData = $rawData; $this->title = $rawData['title']; + $this->lastUpdated = isset($rawData['history']['lastUpdated']['when']) ? new DateTime($rawData['history']['lastUpdated']['when']) : null; } public function findDownloadPath(): ?string @@ -26,4 +31,9 @@ public function getTitle(): string { return $this->title; } + + public function getLastUpdated(): ?DateTime + { + return $this->lastUpdated; + } } From 942c7ca3d1019d5d4bacc2b256153e2cabda6d08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20K=C3=BCchler?= Date: Thu, 9 Jan 2025 14:00:55 +0100 Subject: [PATCH 2/3] reformat code --- src/Endpoint/Download.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Endpoint/Download.php b/src/Endpoint/Download.php index 33d21cf..d2366e2 100755 --- a/src/Endpoint/Download.php +++ b/src/Endpoint/Download.php @@ -38,7 +38,7 @@ public function downloadPageContent(ConfluencePage $confluencePage, string $file return; } - $htmlFile = $this->downloadFolder.'/'.$fileName; + $htmlFile = $this->downloadFolder . '/' . $fileName; file_put_contents($htmlFile, $confluencePage->getContent()); } @@ -53,7 +53,7 @@ public function downloadAttachment(ConfluenceAttachment $attachment): void if ($this->shouldAttachmentBeUpdated($attachment)) { // Verwende den relativen Pfad aus der API, um das Attachment herunterzuladen $attachmentContent = $this->client->get( - '/wiki/'.$attachment->findDownloadPath(), + '/wiki/' . $attachment->findDownloadPath(), array_merge([], $this->auth->getAuthenticationArray()) )->getBody()->getContents(); @@ -63,7 +63,7 @@ public function downloadAttachment(ConfluenceAttachment $attachment): void private function getAttachmentFilePath(ConfluenceAttachment $attachment): string { - return $this->downloadFolder.'/'.$attachment->getTitle(); + return $this->downloadFolder . '/' . $attachment->getTitle(); } private function shouldAttachmentBeUpdated(ConfluenceAttachment $attachment): bool From e3ebd36590c76f938a5c0cbdb35b28a1c985a9ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20K=C3=BCchler?= Date: Thu, 9 Jan 2025 14:02:49 +0100 Subject: [PATCH 3/3] reformat code --- src/ConfluencePageContentDownloader.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ConfluencePageContentDownloader.php b/src/ConfluencePageContentDownloader.php index ba4a67f..5da8d73 100755 --- a/src/ConfluencePageContentDownloader.php +++ b/src/ConfluencePageContentDownloader.php @@ -46,7 +46,7 @@ public function downloadPageContent(ConfluencePage $page, bool $withAttachments $this->downloadEndpoint->downloadAttachment($attachment); } } catch (Exception $e) { - echo 'An error has occurred: '.$e->getMessage(); + echo 'An error has occurred: ' . $e->getMessage(); } }