Added:

- Messenger, Mercure - chapter download flow (lelscan only)
2024-06-13 18:11:11 +02:00
parent f88fa2c232
commit bc85649789
24 changed files with 744 additions and 78 deletions
--- a/src/Service/MangaScraperService.php
+++ b/src/Service/MangaScraperService.php
@@ -8,7 +8,11 @@ use App\Entity\ContentSource;
 use App\EventSubscriber\MangaScrapedEvent;
 use GuzzleHttp\Client;
 use GuzzleHttp\Exception\GuzzleException;
+use GuzzleHttp\Exception\RequestException;
 use Symfony\Component\DomCrawler\Crawler;
+use Symfony\Component\HttpKernel\Exception\BadRequestHttpException;
+use Symfony\Component\HttpKernel\Exception\HttpException;
+use Symfony\Component\HttpKernel\Exception\NotFoundHttpException;
 use Symfony\Component\Routing\Matcher\UrlMatcher;
 use Symfony\Component\Routing\RequestContext;
 use Symfony\Component\Routing\Route;
@@ -27,42 +31,31 @@ class MangaScraperService
        $this->eventDispatcher = $eventDispatcher;
    }

-    public function extractMangaPageData(string $html, ContentSource $mangaSource): array
+    /**
+     * Returns the page's absolute image URL and next-page href (or null); throws \Exception when no image is found.
+     */
+    private function extractMangaPageData(string $html, ContentSource $mangaSource): array
    {
        $crawler = new Crawler($html);
-        $imgUrls = [];
+        $imageNode = $crawler->filter($mangaSource->getImageSelector()); // attr() throws on an empty match

-        // Search for images with different extensions
-        foreach (['img[src$=".jpg"]', 'img[src$=".jpeg"]', 'img[src$=".png"]', 'img'] as $selector) {
-            $crawler->filter($selector)->each(function (Crawler $node) use (&$imgUrls) {
-                $src = $node->attr('src') ?? $node->attr('data-src');
-                if ($src) {
-                    $imgUrls[] = $src;
-                }
-            });
-        }
+        $imgUrl = $imageNode->count() > 0 ? ($imageNode->attr('src') ?? $imageNode->attr('data-src')) : null;

-        if (empty($imgUrls)) {
-            throw new \Exception('No valid image found on the page.');
-        }
+        if ($imgUrl === null || $imgUrl === '') {
+            throw new \Exception('No valid image found on the page.');
+        }

        $nextLink = $crawler->filter($mangaSource->getNextPageSelector());
        $nextUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;

        // Convert relative URLs to absolute URLs
-        $baseUrl = $mangaSource->getBaseUrl();
-        $imgUrls = array_map(function ($imgUrl) use ($baseUrl) {
-            if (!preg_match('/^https?:\/\//', $imgUrl)) {
-                $urlComponents = parse_url($baseUrl);
-                $scheme = $urlComponents['scheme'];
-                $host = $urlComponents['host'];
-                $imgUrl = $scheme . '://' . $host . '/' . ltrim($imgUrl, '/');
-            }
-            return $imgUrl;
-        }, $imgUrls);
+        if (!preg_match('/^https?:\/\//', $imgUrl)) {
+            $base = parse_url($mangaSource->getBaseUrl());
+            $imgUrl = $base['scheme'] . '://' . $base['host'] . '/' . ltrim($imgUrl, '/');
+        }

        return [
-            'image_urls' => $imgUrls,
+            'image_url' => $imgUrl,
            'next_page_url' => $nextUrl,
        ];
    }
@@ -75,7 +68,7 @@ class MangaScraperService
        $allChaptersData = [];

        foreach ($manga->getChapters() as $chapter) {
-            $chapterData = $this->scrapeChapter($manga, $chapter, $mangaSource);
+            $chapterData = $this->scrapeChapter($chapter, $mangaSource);
            if ($chapterData !== false) {
                $allChaptersData[$chapter->getNumber()] = $chapterData;
            }
@@ -84,13 +77,13 @@ class MangaScraperService
        return $allChaptersData;
    }

-    private function scrapeChapter(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
+    public function scrapeChapter(Chapter $chapter, ContentSource $mangaSource): array|bool
    {
        switch ($mangaSource->getScrapingType()) {
            case 'html':
-                return $this->scrapeChapterHtml($manga, $chapter, $mangaSource);
+                return $this->scrapeChapterHtml($chapter->getManga(), $chapter, $mangaSource);
            case 'javascript':
-                return $this->scrapeChapterJavaScript($manga, $chapter, $mangaSource);
+                return $this->scrapeChapterJavaScript($chapter->getManga(), $chapter, $mangaSource);
 //            case 'api':
 //                // Implémentez la méthode de scraping par API si nécessaire
 //                return $this->scrapeChapterApi($manga, $chapter, $mangaSource);
@@ -121,10 +114,10 @@ class MangaScraperService
        // Appeler le script Puppeteer avec les paramètres nécessaires
        $output = [];
        $command = sprintf('node puppeteer-script.js "%s" "%s" "%s" 2>&1', $url, $imageSelector, $nextButtonSelector); // Redirect stderr to stdout
-        dump($command);
+//        dump($command);
 //        exec($command, $output, $return_var);

-        dd($command, $output);
+//        dd($command, $output);

        // Convertir la sortie JSON en tableau PHP
        return json_decode(implode("", $output), true);
@@ -156,34 +149,25 @@ class MangaScraperService
            $html = $this->fetchHtml($currentPageUrl);
            $page = $this->extractMangaPageData($html, $mangaSource);

-            foreach ($page['image_urls'] as $imgUrl) {
-                dump($imgUrl);
-                dump(base64_decode($imgUrl));
-                // Déterminer l'extension de l'image
-                $imageExtension = pathinfo(parse_url($imgUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
+            // Déterminer l'extension de l'image
+            $imageExtension = pathinfo(parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION);

-                // Construire le nom de fichier de l'image
-                $imageName = sprintf('%03d.%s', count($pageData) + 1, $imageExtension);
-                $imagePath = sprintf('%s/%s', $chapterDir, $imageName);
+            // Construire le nom de fichier de l'image
+            $imageName = sprintf('%03d.%s', count($pageData) + 1, $imageExtension);
+            $imagePath = sprintf('%s/%s', $chapterDir, $imageName);

-                $this->downloadAndSaveImage($imgUrl, $imagePath);
+            $this->downloadAndSaveImage($page['image_url'], $imagePath);

-                $pageData[] = [
-                    'image_url' => $imgUrl,
-                    'local_image_url' => sprintf('/manga-images/%s/%s/%s', $mangaTitle, $chapterNumber, $imageName),
-                    'page_number' => count($pageData) + 1,
-                ];
-            }
-
-            // Si plus d'une image a été trouvée, ne pas chercher la page suivante
-            if (count($page['image_urls']) > 1) {
-                break;
-            }
+            $pageData[] = [
+                'image_url' => $page['image_url'],
+                'local_image_url' => sprintf('/manga-images/%s/%s/%s', $mangaTitle, $chapterNumber, $imageName),
+                'page_number' => count($pageData) + 1,
+            ];

            $currentPageUrl = $page['next_page_url'];
        } while ($currentPageUrl);

-        $event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData);
+        $event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData, $chapterDir);
        $this->eventDispatcher->dispatch($event, MangaScrapedEvent::NAME);

        return $pageData;
@@ -195,9 +179,25 @@ class MangaScraperService
    private function fetchHtml(string $url): string
    {
        $client = new Client();
-        $response = $client->get($url);

-        return (string)$response->getBody();
+        try {
+            // http_errors is off so 4xx/5xx statuses reach the checks below instead of raising Guzzle exceptions.
+            $response = $client->get($url, ['http_errors' => false, 'allow_redirects' => false]);
+        } catch (GuzzleException $e) {
+            // No response could be obtained at all; keep the original error as "previous".
+            throw new BadRequestHttpException('Bad Request: ' . $e->getMessage(), $e);
+        }
+
+        $statusCode = $response->getStatusCode();
+        // Redirects are not followed: a 3xx answer is reported as a missing chapter, exactly like a 404.
+        if (($statusCode >= 300 && $statusCode < 400) || $statusCode === 404) {
+            throw new NotFoundHttpException('Chapter Not Found at ' . $url);
+        }
+        if ($statusCode >= 400) {
+            throw new BadRequestHttpException('Bad Request: HTTP ' . $statusCode . ' at ' . $url);
+        }
+
+        return (string)$response->getBody();
    }

    /**
@@ -240,7 +240,7 @@ class MangaScraperService
            ];
        }

-        $event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData);
+        $event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData, $chapterDir);
        $this->eventDispatcher->dispatch($event, MangaScrapedEvent::NAME);

        return $pageData;