Added:

- Refactor MangaScraperService (not used everywhere now) - Added JavascriptScraper.php - Added alternative slugs in Manga.php - Improvement in manga edit form
2024-07-21 19:08:46 +02:00
parent ff59aa5d77
commit fafff5014c
21 changed files with 1180 additions and 28 deletions
--- a/src/Service/Scraper/HtmlScraper.php
+++ b/src/Service/Scraper/HtmlScraper.php
@@ -0,0 +1,197 @@
+<?php
+
+namespace App\Service\Scraper;
+
+use App\Entity\Chapter;
+use App\Entity\ContentSource;
+use Doctrine\ORM\EntityManagerInterface;
+use Exception;
+use GuzzleHttp\Client;
+use Symfony\Component\EventDispatcher\EventDispatcherInterface;
+use Symfony\Component\DomCrawler\Crawler;
+
+/**
+ * Scraper for content sources whose scraping type is "html".
+ *
+ * Pages are fetched with Guzzle and parsed with Symfony's DomCrawler. Two reader
+ * layouts are handled: a "vertical" reader (all images on a single page, used when
+ * the content source has no next-page selector) and a "horizontal" reader (one
+ * image per page, followed through the next-page link).
+ */
+class HtmlScraper extends AbstractScraper
+{
+    private Client $client;
+
+    public function __construct(
+        string $projectDir,
+        EventDispatcherInterface $eventDispatcher,
+        EntityManagerInterface $entityManager
+    ) {
+        parent::__construct($projectDir, $eventDispatcher, $entityManager);
+        $this->client = new Client();
+    }
+
+    /**
+     * Downloads every page image of a chapter, packs the images into a CBZ archive
+     * and stores the archive path on the chapter entity.
+     *
+     * Note: the `local_image_url` entries of the returned array point inside the
+     * temporary working directory, which is handed to cleanupTempFiles() before
+     * this method returns.
+     *
+     * @return array|bool list of page entries with the keys `image_url`,
+     *                    `page_number` and `local_image_url`
+     *
+     * @throws Exception when no valid chapter URL exists, no page image is found,
+     *                   or a page/image cannot be fetched or written
+     */
+    public function scrapeChapter(Chapter $chapter, ContentSource $contentSource): array|bool
+    {
+        $manga = $chapter->getManga();
+        $chapterUrl = $this->getValidChapterUrl($contentSource, $manga, $chapter->getNumber());
+
+        if (!$chapterUrl) {
+            throw new Exception("Aucune URL valide trouvée pour le chapitre {$chapter->getNumber()} du manga {$manga->getTitle()}");
+        }
+
+        $pageData = $this->scrapePages($chapterUrl, $contentSource);
+
+        // Refuse to build (and register on the chapter) an archive without any page.
+        if ($pageData === []) {
+            throw new Exception("Aucune image trouvée pour le chapitre {$chapter->getNumber()} du manga {$manga->getTitle()}");
+        }
+
+        $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
+        if (!mkdir($tempDir) && !is_dir($tempDir)) {
+            throw new Exception("Impossible de créer le répertoire temporaire {$tempDir}");
+        }
+
+        try {
+            $totalPages = count($pageData);
+
+            // Download and save the images. The array is updated by key rather than
+            // by reference so no dangling reference survives the loop.
+            foreach ($pageData as $index => $page) {
+                $extension = pathinfo((string) parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION);
+                // URLs without a file extension would otherwise yield names such as "001.".
+                $imageName = sprintf('%03d.%s', $index + 1, $extension !== '' ? $extension : 'jpg');
+                $imagePath = $tempDir . '/' . $imageName;
+
+                $this->downloadAndSaveImage($page['image_url'], $imagePath);
+
+                $this->dispatchProgressEvent($chapter, $index + 1, $totalPages);
+
+                $pageData[$index]['local_image_url'] = $imagePath;
+            }
+
+            $cbzFilePath = $this->generateCbzPath($manga, $chapter);
+            $this->createCbzFile($tempDir, $pageData, $cbzFilePath);
+
+            $chapter->setCbzPath($cbzFilePath);
+            $this->entityManager->persist($chapter);
+            $this->entityManager->flush();
+        } finally {
+            // Clean up the temporary directory even when a download or the archive step fails.
+            $this->cleanupTempFiles($tempDir);
+        }
+
+        return $pageData;
+    }
+
+    /**
+     * Runs the page discovery for a manga slug / chapter number without downloading
+     * any image, so a content source configuration can be checked.
+     *
+     * @return array list of page entries with the keys `image_url` and `page_number`
+     *
+     * @throws Exception when the chapter URL is invalid or a page cannot be fetched
+     */
+    public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
+    {
+        $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber);
+
+        if (!$this->isChapterUrlValid($chapterUrl)) {
+            throw new Exception("Invalid URL, check format and slug");
+        }
+
+        // The HTML is fetched by the selected reader only, so the first page of a
+        // paginated reader is no longer requested twice.
+        return $this->scrapePages($chapterUrl, $contentSource);
+    }
+
+    /**
+     * Tells whether this scraper handles the given scraping type.
+     */
+    public function supports(string $scrapingType): bool
+    {
+        return $scrapingType === 'html';
+    }
+
+    /**
+     * Collects the page entries of a chapter with the reader matching the content
+     * source: vertical when it has no next-page selector, horizontal otherwise.
+     *
+     * @throws Exception
+     */
+    private function scrapePages(string $chapterUrl, ContentSource $contentSource): array
+    {
+        if ($contentSource->getNextPageSelector() === null) {
+            // Vertical reader: every image is on the chapter page itself.
+            return $this->scrapeVerticalReader($this->fetchHtml($chapterUrl), $contentSource);
+        }
+
+        // Horizontal (paginated) reader.
+        return $this->scrapeHorizontalReader($chapterUrl, $contentSource);
+    }
+
+    /**
+     * Extracts one page entry per element matched by the image selector.
+     *
+     * The image URL is read from `src`, falling back to `data-src` when `src` is empty.
+     */
+    private function scrapeVerticalReader(string $html, ContentSource $contentSource): array
+    {
+        $crawler = new Crawler($html);
+        $images = $crawler->filter($contentSource->getImageSelector());
+
+        $pageData = [];
+        foreach ($images as $image) {
+            $imgUrl = $image->getAttribute('src') ?: $image->getAttribute('data-src');
+            $pageData[] = [
+                // Relative URLs are made absolute exactly like in the horizontal reader.
+                'image_url' => $this->cleanImageUrl($this->toAbsoluteUrl($imgUrl, $contentSource)),
+                'page_number' => count($pageData) + 1,
+            ];
+        }
+
+        return $pageData;
+    }
+
+    /**
+     * Walks a paginated reader from the chapter URL, following the next-page link
+     * until there is none left or it leads back to a page that was already visited.
+     *
+     * @throws Exception
+     */
+    private function scrapeHorizontalReader(string $chapterUrl, ContentSource $contentSource): array
+    {
+        $pageData = [];
+        $visitedUrls = [];
+        $currentPageUrl = $chapterUrl;
+
+        do {
+            $visitedUrls[$currentPageUrl] = true;
+
+            $html = $this->fetchHtml($currentPageUrl);
+            $page = $this->extractMangaPageData($html, $contentSource);
+
+            $pageData[] = [
+                'image_url' => $this->cleanImageUrl($page['image_url']),
+                'page_number' => count($pageData) + 1,
+            ];
+
+            $currentPageUrl = $page['next_page_url'];
+            // The visited check stops readers whose "next" link loops back to a
+            // previous page instead of disappearing on the last page.
+        } while ($currentPageUrl && !isset($visitedUrls[$currentPageUrl]));
+
+        return $pageData;
+    }
+
+    /**
+     * Fetches a page and returns its body.
+     *
+     * Redirects are not followed; a 3xx answer is reported as a missing chapter.
+     *
+     * @throws Exception on any transport/HTTP error or redirect response
+     */
+    private function fetchHtml(string $url): string
+    {
+        try {
+            $response = $this->client->get($url, [
+                // With http_errors enabled Guzzle throws for every status >= 400,
+                // so 404 needs no explicit check below.
+                'http_errors' => true,
+                'allow_redirects' => false,
+            ]);
+
+            $statusCode = $response->getStatusCode();
+            $body = (string) $response->getBody();
+        } catch (Exception $e) {
+            throw new Exception('Bad Request: ' . $e->getMessage(), previous: $e);
+        }
+
+        if ($statusCode >= 300 && $statusCode < 400) {
+            throw new Exception('Bad Request: Chapter Not Found at ' . $url);
+        }
+
+        return $body;
+    }
+
+    /**
+     * Downloads an image and writes it to the given path.
+     *
+     * @throws Exception when the request fails, the response is not an image,
+     *                   or the file cannot be written
+     */
+    private function downloadAndSaveImage(string $imageUrl, string $destinationPath): void
+    {
+        try {
+            $response = $this->client->get($imageUrl);
+            $contentType = $response->getHeaderLine('Content-Type');
+
+            // Media types are case-insensitive, so compare in lower case.
+            if (!str_starts_with(strtolower($contentType), 'image/')) {
+                throw new Exception('Le contenu récupéré n\'est pas une image. Type de contenu : ' . $contentType);
+            }
+
+            if (file_put_contents($destinationPath, $response->getBody()->getContents()) === false) {
+                throw new Exception('Impossible d\'écrire le fichier ' . $destinationPath);
+            }
+        } catch (Exception $e) {
+            throw new Exception('Erreur lors de la récupération de l\'image : ' . $e->getMessage(), previous: $e);
+        }
+    }
+
+    /**
+     * Extracts the image URL and the next-page URL from one page of a paginated reader.
+     *
+     * @return array{image_url: string, next_page_url: ?string} `next_page_url` is null
+     *         when the page has no usable next-page link
+     *
+     * @throws Exception when the image selector matches nothing or the matched
+     *                   element has neither a `src` nor a `data-src` value
+     */
+    private function extractMangaPageData(string $html, ContentSource $mangaSource): array
+    {
+        $crawler = new Crawler($html);
+
+        $imageNode = $crawler->filter($mangaSource->getImageSelector());
+        if ($imageNode->count() === 0) {
+            throw new Exception('No image found with selector "' . $mangaSource->getImageSelector() . '"');
+        }
+
+        // `?:` (not `??`) so an empty src="" also falls back to data-src,
+        // matching the vertical reader.
+        $imgUrl = $imageNode->attr('src') ?: $imageNode->attr('data-src');
+        if (!$imgUrl) {
+            throw new Exception('The matched image element has no "src" or "data-src" value');
+        }
+
+        $nextLink = $crawler->filter($mangaSource->getNextPageSelector());
+        $nextHref = $nextLink->count() > 0 ? trim((string) $nextLink->attr('href')) : '';
+
+        return [
+            'image_url' => $this->toAbsoluteUrl($imgUrl, $mangaSource),
+            // The next-page link may be relative as well; it must be absolute
+            // before it is handed back to fetchHtml().
+            'next_page_url' => $this->isNavigableHref($nextHref)
+                ? $this->toAbsoluteUrl($nextHref, $mangaSource)
+                : null,
+        ];
+    }
+
+    /**
+     * Tells whether an href designates another page, as opposed to an empty value,
+     * a fragment-only anchor or a `javascript:` pseudo link.
+     */
+    private function isNavigableHref(string $href): bool
+    {
+        return $href !== ''
+            && !str_starts_with($href, '#')
+            && stripos($href, 'javascript:') !== 0;
+    }
+
+    /**
+     * Converts a relative URL to an absolute one using the scheme and host (and
+     * port, if any) of the content source base URL.
+     *
+     * Absolute http(s) URLs and empty strings are returned unchanged, and
+     * protocol-relative URLs ("//host/path") only receive the scheme. Other
+     * relative URLs are resolved against the host root. When the base URL has no
+     * host, the URL is returned unchanged.
+     */
+    private function toAbsoluteUrl(string $url, ContentSource $contentSource): string
+    {
+        $url = trim($url);
+        if ($url === '' || preg_match('/^https?:\/\//i', $url) === 1) {
+            return $url;
+        }
+
+        $base = parse_url((string) $contentSource->getBaseUrl());
+        $scheme = is_array($base) && isset($base['scheme']) ? $base['scheme'] : 'https';
+
+        if (str_starts_with($url, '//')) {
+            return $scheme . ':' . $url;
+        }
+
+        if (!is_array($base) || !isset($base['host'])) {
+            return $url;
+        }
+
+        $origin = $scheme . '://' . $base['host'] . (isset($base['port']) ? ':' . $base['port'] : '');
+
+        return $origin . '/' . ltrim($url, '/');
+    }
+}