Added:

- ContentSource handling in message
- ContentSource list, add/update UI
- nextPageSelector and imageSelector can be null
- Cleanup
2024-06-30 20:47:27 +02:00
parent ba30d3102d
commit 3012adfee7
24 changed files with 762 additions and 707 deletions
--- a/src/Service/MangaScraperService.php
+++ b/src/Service/MangaScraperService.php
@@ -166,6 +166,21 @@ class MangaScraperService
        return json_decode(implode("", $output), true);
    }

+    /**
+     * Dry-run scrape of one chapter: collects the page image URLs without saving any image.
+     *
+     * The reader type is chosen from the content source. A null next-page selector means the
+     * chapter page is parsed as a vertical reader; otherwise the horizontal (paginated)
+     * reader is used and pages are followed one by one.
+     *
+     * @return array list of entries with 'image_url' and 'page_number' keys
+     *
+     * @throws GuzzleException
+     */
+    public function testScrapingHtml(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
+    {
+        $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber);
+
+        if ($contentSource->getNextPageSelector() !== null) {
+            // The horizontal reader fetches every page itself, starting with the chapter URL,
+            // so fetching it here as well would only duplicate the first request.
+            return $this->scrapeHorizontalReader($chapterUrl, $contentSource);
+        }
+
+        return $this->scrapeVerticalReader($this->fetchHtml($chapterUrl), $contentSource);
+    }
+
    /**
     * @throws GuzzleException
     */
@@ -173,32 +188,32 @@ class MangaScraperService
    {
        $chapterUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapter->getNumber());

-        $pageData = [];
-        $currentPageUrl = $chapterUrl;
-
        $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
        mkdir($tempDir);

-        do {
-            $html = $this->fetchHtml($currentPageUrl);
-            $page = $this->extractMangaPageData($html, $mangaSource);
+        $pageData = [];

-            $imageName = sprintf('%03d.%s', count($pageData) + 1, pathinfo(parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION));
+        if ($mangaSource->getNextPageSelector() === null) {
+            // Lecteur vertical
+            $html = $this->fetchHtml($chapterUrl);
+            $pageData = $this->scrapeVerticalReader($html, $mangaSource);
+        } else {
+            // Lecteur horizontal (paginé)
+            $pageData = $this->scrapeHorizontalReader($chapterUrl, $mangaSource);
+        }
+
+        // Télécharger et sauvegarder les images
+        foreach ($pageData as $index => &$page) {
+            $imageName = sprintf('%03d.%s', $index + 1, pathinfo(parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION));
            $imagePath = $tempDir . '/' . $imageName;

            $this->downloadAndSaveImage($page['image_url'], $imagePath);

-            $event = new PageScrappingProgressEvent($chapter->getId(), count($pageData) + 1, 0);
+            $event = new PageScrappingProgressEvent($chapter->getId(), $index + 1, count($pageData));
            $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);

-            $pageData[] = [
-                'image_url' => $page['image_url'],
-                'local_image_url' => $imagePath,
-                'page_number' => count($pageData) + 1,
-            ];
-
-            $currentPageUrl = $page['next_page_url'];
-        } while ($currentPageUrl);
+            $page['local_image_url'] = $imagePath;
+        }

        $cbzFilePath = $this->generateCbzPath($manga, $chapter);
        $this->createCbzFile($tempDir, $pageData, $cbzFilePath);
@@ -210,7 +225,78 @@ class MangaScraperService
        // Nettoyage du répertoire temporaire
        $this->cleanupTempFiles($tempDir);

-        return true;
+        return $pageData;
+    }
+
+    /**
+     * Collects the image URLs of a vertical reader from a single HTML document.
+     *
+     * For each node matched by the content source's image selector, the `src` attribute is
+     * used, falling back to `data-src` when `src` is empty (presumably lazy-loaded images).
+     * Nodes for which neither attribute yields a URL are skipped, so no entry with an empty
+     * URL reaches the caller; page numbers stay consecutive.
+     *
+     * NOTE(review): getImageSelector() is passed straight to Crawler::filter(); confirm it
+     * cannot be null for sources that use this reader.
+     *
+     * @return array list of ['image_url' => string, 'page_number' => int], numbered from 1
+     */
+    private function scrapeVerticalReader(string $html, ContentSource $contentSource): array
+    {
+        $crawler = new Crawler($html);
+        $images = $crawler->filter($contentSource->getImageSelector());
+
+        $pageData = [];
+        foreach ($images as $image) {
+            $src = $image->getAttribute('src');
+            $imgUrl = $this->cleanImageUrl($src !== '' ? $src : $image->getAttribute('data-src'));
+
+            if ($imgUrl === '') {
+                // Neither attribute provided a usable URL: there is nothing to fetch for this node.
+                continue;
+            }
+
+            $pageData[] = [
+                'image_url' => $imgUrl,
+                // Derived from the number of kept entries so skipped nodes leave no gap.
+                'page_number' => count($pageData) + 1,
+            ];
+        }
+
+        return $pageData;
+    }
+
+    /**
+     * Collects the image URLs of a horizontal (paginated) reader by following next-page links.
+     *
+     * Starting at the chapter URL, each page is fetched and handed to extractMangaPageData(),
+     * whose 'image_url' is recorded and whose 'next_page_url' becomes the next page to fetch.
+     * The walk stops when there is no next URL, or when the next URL has already been
+     * visited, which prevents an endless loop on sites whose last page links back to itself
+     * or to an earlier page.
+     *
+     * @return array list of ['image_url' => string, 'page_number' => int], numbered from 1
+     *
+     * @throws GuzzleException
+     */
+    private function scrapeHorizontalReader(string $chapterUrl, ContentSource $contentSource): array
+    {
+        $pageData = [];
+        $visitedUrls = [];
+        $currentPageUrl = $chapterUrl;
+
+        do {
+            // Remember every fetched URL so a circular next-page link cannot loop forever.
+            $visitedUrls[$currentPageUrl] = true;
+
+            $html = $this->fetchHtml($currentPageUrl);
+            $page = $this->extractMangaPageData($html, $contentSource);
+
+            $pageData[] = [
+                'image_url' => $this->cleanImageUrl($page['image_url']),
+                'page_number' => count($pageData) + 1,
+            ];
+
+            $currentPageUrl = $page['next_page_url'];
+        } while ($currentPageUrl && !isset($visitedUrls[$currentPageUrl]));
+
+        return $pageData;
+    }
+
+    /**
+     * Handles a single page image: cleans its URL, saves it under the temporary directory
+     * via downloadAndSaveImage(), and appends a matching entry to $pageData.
+     *
+     * The local file name is the 1-based page number zero-padded to three digits, followed
+     * by the extension taken from the URL path. $chapter is currently unused: no progress
+     * event is dispatched from this method.
+     *
+     * @throws GuzzleException
+     */
+    private function processImage(string $imgUrl, string $tempDir, array &$pageData, int $index, Chapter $chapter): void
+    {
+        $pageNumber = $index + 1;
+        $cleanUrl = $this->cleanImageUrl($imgUrl);
+        $extension = pathinfo(parse_url($cleanUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
+        $localPath = $tempDir . '/' . sprintf('%03d.%s', $pageNumber, $extension);
+
+        $this->downloadAndSaveImage($cleanUrl, $localPath);
+
+        $pageData[] = [
+            'image_url' => $cleanUrl,
+            'local_image_url' => $localPath,
+            'page_number' => $pageNumber,
+        ];
+    }
+
+    /**
+     * Normalises a scraped image URL: removes ASCII control characters (0x00-0x1F and 0x7F),
+     * then trims surrounding whitespace.
+     *
+     * Control characters are removed before trimming so that a space sitting next to a
+     * stripped character at either end of the string does not survive.
+     */
+    private function cleanImageUrl(string $url): string
+    {
+        // preg_replace() returns null on a PCRE failure; fall back to the raw input so the
+        // declared string return type always holds.
+        $withoutControlChars = preg_replace('/[\x00-\x1F\x7F]/', '', $url) ?? $url;
+
+        return trim($withoutControlChars);
    }

    /**