- trop de trucs d'un coup... je vais faire attention ensuite ^^'

2024-06-10 13:57:50 +02:00
parent 9595831aa3
commit c46e1a0a5c
69 changed files with 4004 additions and 385 deletions
--- a/src/Service/MangaScraperService.php
+++ b/src/Service/MangaScraperService.php
@@ -2,6 +2,9 @@

 namespace App\Service;

+use App\Entity\Chapter;
+use App\Entity\Manga;
+use App\Entity\ContentSource;
 use App\EventSubscriber\MangaScrapedEvent;
 use GuzzleHttp\Client;
 use GuzzleHttp\Exception\GuzzleException;
@@ -14,144 +17,256 @@ use Symfony\Contracts\EventDispatcher\EventDispatcherInterface;

 class MangaScraperService
 {
-	const string IMG_BASE_DIR = '/public/manga-images';
-	private string $projectDir;
-	private EventDispatcherInterface $eventDispatcher;
+    const IMG_BASE_DIR = '/public/manga-images';
+    private string $projectDir;
+    private EventDispatcherInterface $eventDispatcher;

-	public function __construct($projectDir, EventDispatcherInterface $eventDispatcher)
-	{
-		$this->projectDir = $projectDir;
-		$this->eventDispatcher = $eventDispatcher;
-	}
+    public function __construct($projectDir, EventDispatcherInterface $eventDispatcher)
+    {
+        $this->projectDir = $projectDir;
+        $this->eventDispatcher = $eventDispatcher;
+    }

-	public function extractMangaPageData(string $html): array
-	{
-		$baseUrl = 'https://lelscans.net';
-        //pour éviter à PhpStorm de gueuler...
-        $selector = 'img';
-		$crawler = new Crawler($html);
-		$imgUrl = $crawler->filter($selector)->attr('src');
-		$nextLink = $crawler->filter('a[title="Suivant"]');
+    /**
+     * Extracts the image URLs and the "next page" link from one reader page.
+     *
+     * Images are read from the `src` attribute of every <img> element (falling back to
+     * `data-src` when `src` is absent), in document order and without duplicates.
+     * URLs that are not already absolute http(s) URLs are resolved against the scheme
+     * and host of the source's base URL.
+     *
+     * @param string        $html        Raw HTML of the page to inspect.
+     * @param ContentSource $mangaSource Supplies the next-page CSS selector and the base URL.
+     *
+     * @return array{image_urls: list<string>, next_page_url: ?string}
+     *
+     * @throws \Exception When no image with a usable source is found on the page.
+     */
+    public function extractMangaPageData(string $html, ContentSource $mangaSource): array
+    {
+        $crawler = new Crawler($html);
+        $imgUrls = [];

-		if (!preg_match('/^https?:\/\//', $imgUrl)) {
-			$urlComponents = parse_url($baseUrl);
-			$scheme = $urlComponents['scheme'];
-			$host = $urlComponents['host'];
+        // One pass over `img` is enough: it matches every image the extension-specific
+        // selectors would, keeps document (reading) order, and does not collect an
+        // image once per selector it happens to match.
+        $crawler->filter('img')->each(function (Crawler $node) use (&$imgUrls) {
+            $src = $node->attr('src') ?? $node->attr('data-src');
+            if ($src) {
+                $imgUrls[] = $src;
+            }
+        });
+
+        // Callers use the number of URLs to tell single-image pages from full-chapter
+        // pages, so duplicates must not inflate the count.
+        $imgUrls = array_values(array_unique($imgUrls));

-			// Construit l'URL absolue de l'image
-			$imgUrl = $scheme . '://' . $host . '/' . ltrim($imgUrl, '/');
-		}
+        if (empty($imgUrls)) {
+            throw new \Exception('No valid image found on the page.');
+        }

-		if($nextLink->count() > 0){
-			$nextUrl = $nextLink->attr('href');
-		}else{
-			$nextUrl = null;
-		}
+        $nextLink = $crawler->filter($mangaSource->getNextPageSelector());
+        $nextUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;

-		return [
-			'image_url' => $imgUrl,
-			'next_page_url' => $nextUrl,
-		];
-	}
+        // Convert relative URLs to absolute URLs
+        $baseUrl = $mangaSource->getBaseUrl();
+        $imgUrls = array_map(function ($imgUrl) use ($baseUrl) {
+            if (preg_match('/^https?:\/\//', $imgUrl)) {
+                return $imgUrl;
+            }
+
+            $urlComponents = parse_url($baseUrl);
+            $scheme = $urlComponents['scheme'];
+
+            // A protocol-relative URL (//cdn.example/img.jpg) already names its host;
+            // it only needs the scheme.
+            if (str_starts_with($imgUrl, '//')) {
+                return $scheme . ':' . $imgUrl;
+            }
+
+            return $scheme . '://' . $urlComponents['host'] . '/' . ltrim($imgUrl, '/');
+        }, $imgUrls);
+
+        return [
+            'image_urls' => $imgUrls,
+            'next_page_url' => $nextUrl,
+        ];
+    }

    /**
     * @throws GuzzleException
     */
-    public function scrapeMangaChapter(string $chapterUrl, string $mangaTitle, float $chapterNumber): array|bool
-	{
-		if(!$this->isChapterAvailable($chapterUrl, $chapterNumber)){
-			return false;
-		}
+    public function scrapeManga(Manga $manga, ContentSource $mangaSource): array
+    {
+        // Scrapes every chapter of the manga through the given source and returns the
+        // per-chapter page data keyed by chapter number. Chapters for which the
+        // scraper reports `false` are left out. Besides GuzzleException, an
+        // \Exception propagates when the source has an unsupported scraping type.
+        $allChaptersData = [];

-		$pageData = [];
-		$currentPageUrl = $chapterUrl;
+        foreach ($manga->getChapters() as $chapter) {
+            $chapterData = $this->scrapeChapter($manga, $chapter, $mangaSource);
+            if ($chapterData === false) {
+                continue;
+            }
+
+            // Key by the string form of the number: PHP truncates float array keys, so
+            // if the number is a fractional float (e.g. 10.5) it would otherwise
+            // overwrite chapter 10. Whole numbers still become the same integer key.
+            $allChaptersData[(string) $chapter->getNumber()] = $chapterData;
+        }

-		$mangaDir = sprintf('%s/%s', $this->projectDir . self::IMG_BASE_DIR, $mangaTitle);
-		if (!is_dir($mangaDir)) {
-			mkdir($mangaDir, 0755, true);
-		}
+        return $allChaptersData;
+    }

-		// Créez le dossier du chapitre s'il n'existe pas
-		$chapterDir = sprintf('%s/%s', $mangaDir, $chapterNumber);
-		if (!is_dir($chapterDir)) {
-			mkdir($chapterDir, 0755, true);
-		}
+    /**
+     * Routes the scraping of one chapter to the strategy named by the source's
+     * scraping type ('html' or 'javascript').
+     *
+     * @return array|bool Whatever the selected strategy returns.
+     *
+     * @throws GuzzleException
+     * @throws \Exception When the scraping type is not one of the supported values.
+     */
+    private function scrapeChapter(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
+    {
+        $scrapingType = $mangaSource->getScrapingType();
+
+        // Loose comparison on purpose: it mirrors the semantics of a `switch`.
+        if ($scrapingType == 'html') {
+            return $this->scrapeChapterHtml($manga, $chapter, $mangaSource);
+        }
+
+        if ($scrapingType == 'javascript') {
+            return $this->scrapeChapterJavaScript($manga, $chapter, $mangaSource);
+        }
+
+        // An 'api' strategy (scrapeChapterApi) could be added here if ever needed.
+        throw new \Exception('Unsupported scraping type: ' . $mangaSource->getScrapingType());
+    }

-		do {
-			$html = $this->fetchHtml($currentPageUrl);
-			$page = $this->extractMangaPageData($html);
-			$pageData[] = $page;
-			$currentPageUrl = $page['next_page_url'];
+//    private function scrapeChapterHtml(Manga $manga, Chapter $chapter, MangaSource $mangaSource): array|bool
+//    {
+//        $chapterUrl = $mangaSource->getChapterUrl($manga->getTitle(), $chapter->getChapterNumber());
+//        $html = $this->fetchHtml($chapterUrl);
+//        $imgUrls = $this->extractMangaPageData($html);
+//
+//        return $this->saveChapterImages($manga, $chapter, $imgUrls);
+//    }

-			// Construisez le nom de fichier de l'image
-			$imageName = sprintf('%03d.jpg', count($pageData));
+    /**
+     * Scrapes a chapter through the Puppeteer helper: builds the chapter URL, collects
+     * the image URLs with fetchImagesUsingPuppeteer(), then passes them to
+     * saveChapterImages() and returns its result.
+     *
+     * NOTE(review): the chapter URL is built from the manga title here, whereas
+     * scrapeChapterHtml() passes the slug — confirm which one getChapterUrl() expects.
+     */
+    private function scrapeChapterJavaScript(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
+    {
+        $title = $manga->getTitle();
+        $number = $chapter->getNumber();
+        $chapterUrl = $mangaSource->getChapterUrl($title, $number);
+
+        $imageSelector = $mangaSource->getImageSelector();
+        $nextPageSelector = $mangaSource->getNextPageSelector();
+        $imgUrls = $this->fetchImagesUsingPuppeteer($chapterUrl, $imageSelector, $nextPageSelector);

-			// Construisez le chemin du fichier de l'image
-			$imagePath = sprintf('%s/%s', $chapterDir, $imageName);
+        return $this->saveChapterImages($manga, $chapter, $imgUrls);
+    }

-			// Téléchargez et enregistrez l'image
-			$this->downloadAndSaveImage($page['image_url'], $imagePath);
+    /**
+     * Runs the `puppeteer-script.js` Node script and returns the array it prints as JSON.
+     *
+     * @param string $url                Page to open.
+     * @param string $imageSelector      CSS selector passed to the script as its second argument.
+     * @param string $nextButtonSelector CSS selector passed to the script as its third argument.
+     *
+     * @return array Decoded JSON output of the script.
+     *
+     * @throws \RuntimeException When the script exits with a non-zero status or its
+     *                           output is not a JSON array/object.
+     */
+    private function fetchImagesUsingPuppeteer(string $url, string $imageSelector, string $nextButtonSelector): array
+    {
+        // Every argument is shell-quoted: URLs and CSS selectors routinely contain
+        // quotes, `$`, `&` or `>` that would otherwise be interpreted by the shell.
+        $command = sprintf(
+            'node puppeteer-script.js %s %s %s 2>&1', // Redirect stderr to stdout
+            escapeshellarg($url),
+            escapeshellarg($imageSelector),
+            escapeshellarg($nextButtonSelector)
+        );
+
+        $output = [];
+        $exitCode = 0;
+        exec($command, $output, $exitCode);

-			// Modifiez les données de la page pour inclure l'URL de l'image stockée localement
-			$pageData[count($pageData) - 1]['local_image_url'] = sprintf('/manga-images/%s/%s/%s', $mangaTitle, $chapterNumber, $imageName);
-			$pageData[count($pageData) - 1]['page_number'] = count($pageData);
+        if ($exitCode !== 0) {
+            throw new \RuntimeException(sprintf(
+                'Puppeteer script exited with status %d: %s',
+                $exitCode,
+                implode("\n", $output)
+            ));
+        }

-		} while ($currentPageUrl);
+        // Convert the JSON output into a PHP array; anything else would violate the
+        // declared return type, so report it explicitly.
+        $imgUrls = json_decode(implode('', $output), true);
+        if (!is_array($imgUrls)) {
+            throw new \RuntimeException('Puppeteer script did not return valid JSON: ' . implode("\n", $output));
+        }
+
+        return $imgUrls;
+    }

-		$event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData);
-		$this->eventDispatcher->dispatch($event, MangaScrapedEvent::NAME);
+    /**
+     * Scrapes a chapter by fetching its pages one after another.
+     *
+     * Starting from the chapter URL, each page is fetched, its images are downloaded
+     * into `<projectDir>/public/manga-images/<title>/<chapter>/` under sequential
+     * zero-padded names, and the "next page" link is followed. The walk stops when a
+     * page holds more than one image, when there is no next link, or when the next
+     * link points to a page already visited. A MangaScrapedEvent is dispatched with
+     * the collected page data before returning.
+     *
+     * @return array|bool List of pages, each with `image_url`, `local_image_url` and `page_number`.
+     *
+     * @throws GuzzleException
+     * @throws \Exception When a page has no image or the target directory cannot be created.
+     */
+    private function scrapeChapterHtml(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
+    {
+        $chapterUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapter->getNumber());

-		return $pageData;
-	}
+        $pageData = [];
+        $currentPageUrl = $chapterUrl;
+        $mangaTitle = $manga->getTitle();
+        $chapterNumber = $chapter->getNumber();
+
+        // The chapter directory lives inside the manga directory, so a single
+        // recursive mkdir creates both. The trailing is_dir() tolerates a concurrent
+        // process creating the directory first.
+        $chapterDir = sprintf('%s/%s/%s', $this->projectDir . self::IMG_BASE_DIR, $mangaTitle, $chapterNumber);
+        if (!is_dir($chapterDir) && !mkdir($chapterDir, 0755, true) && !is_dir($chapterDir)) {
+            throw new \RuntimeException(sprintf('Directory "%s" could not be created.', $chapterDir));
+        }
+
+        $visitedUrls = [];
+
+        do {
+            $visitedUrls[$currentPageUrl] = true;
+
+            $html = $this->fetchHtml($currentPageUrl);
+            $page = $this->extractMangaPageData($html, $mangaSource);
+
+            foreach ($page['image_urls'] as $imgUrl) {
+                $pageNumber = count($pageData) + 1;
+
+                // Take the extension from the URL path; fall back to jpg when the URL
+                // has none, so the file name never ends with a bare dot.
+                $imageExtension = pathinfo((string) parse_url($imgUrl, PHP_URL_PATH), PATHINFO_EXTENSION) ?: 'jpg';
+
+                $imageName = sprintf('%03d.%s', $pageNumber, $imageExtension);
+                $imagePath = sprintf('%s/%s', $chapterDir, $imageName);
+
+                $this->downloadAndSaveImage($imgUrl, $imagePath);
+
+                $pageData[] = [
+                    'image_url' => $imgUrl,
+                    'local_image_url' => sprintf('/manga-images/%s/%s/%s', $mangaTitle, $chapterNumber, $imageName),
+                    'page_number' => $pageNumber,
+                ];
+            }
+
+            // If more than one image was found, the page holds the whole chapter:
+            // do not look for a next page.
+            if (count($page['image_urls']) > 1) {
+                break;
+            }
+
+            $currentPageUrl = $page['next_page_url'];
+            // Stop on a link back to an already-fetched page instead of looping forever.
+        } while ($currentPageUrl && !isset($visitedUrls[$currentPageUrl]));
+
+        $event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData);
+        $this->eventDispatcher->dispatch($event, MangaScrapedEvent::NAME);
+
+        return $pageData;
+    }

    /**
     * @throws GuzzleException
     */
    private function fetchHtml(string $url): string
-	{
-		$client = new Client();
-		$response = $client->get($url);
+    {
+        // Issue a GET request for $url with a fresh Guzzle client.
+        $httpClient = new Client();
+        $httpResponse = $httpClient->get($url);

-		return (string) $response->getBody();
-	}
+        // Casting the body stream to string yields the full response content.
+        $body = $httpResponse->getBody();
+
+        return (string)$body;
+    }

    /**
     * @throws GuzzleException
     */
    private function downloadAndSaveImage(string $imageUrl, string $destinationPath): void
-	{
-		$client = new Client();
-		$response = $client->get($imageUrl);
+    {
+        // Downloads the resource at $imageUrl and writes its body to $destinationPath.
+        $client = new Client();
+        $response = $client->get($imageUrl);

-		file_put_contents($destinationPath, $response->getBody()->getContents());
-	}
+        // file_put_contents() signals failure by returning false. Surface it rather
+        // than letting callers record a local image that was never written.
+        $written = file_put_contents($destinationPath, $response->getBody()->getContents());
+        if ($written === false) {
+            throw new \RuntimeException(sprintf('Unable to write image "%s" to "%s".', $imageUrl, $destinationPath));
+        }
+    }
+
+    /**
+     * Downloads the given images into the chapter's directory and dispatches a
+     * MangaScrapedEvent describing the saved pages.
+     *
+     * Files are stored in `<projectDir>/public/manga-images/<title>/<chapter>/` under
+     * sequential zero-padded names (001, 002, …) following the order of $imgUrls.
+     *
+     * @param array $imgUrls Image URLs in reading order; array keys are ignored.
+     *
+     * @return array List of pages, each with `image_url`, `local_image_url` and `page_number`.
+     *
+     * @throws GuzzleException
+     * @throws \RuntimeException When the chapter directory cannot be created.
+     */
+    private function saveChapterImages(Manga $manga, Chapter $chapter, array $imgUrls): array
+    {
+        $mangaTitle = $manga->getTitle();
+        $chapterNumber = $chapter->getNumber();
+
+        // The chapter directory lives inside the manga directory, so a single
+        // recursive mkdir creates both. The trailing is_dir() tolerates a concurrent
+        // process creating the directory first.
+        $chapterDir = sprintf('%s/%s/%s', $this->projectDir . self::IMG_BASE_DIR, $mangaTitle, $chapterNumber);
+        if (!is_dir($chapterDir) && !mkdir($chapterDir, 0755, true) && !is_dir($chapterDir)) {
+            throw new \RuntimeException(sprintf('Directory "%s" could not be created.', $chapterDir));
+        }
+
+        $pageData = [];
+        // array_values() guarantees 0..n-1 integer keys, so page numbers stay
+        // sequential even if the input has string or sparse keys.
+        foreach (array_values($imgUrls) as $index => $imgUrl) {
+            $pageNumber = $index + 1;
+
+            // Fall back to jpg when the URL path has no extension, so the file name
+            // never ends with a bare dot.
+            $imageExtension = pathinfo((string) parse_url($imgUrl, PHP_URL_PATH), PATHINFO_EXTENSION) ?: 'jpg';
+            $imageName = sprintf('%03d.%s', $pageNumber, $imageExtension);
+            $imagePath = sprintf('%s/%s', $chapterDir, $imageName);
+
+            $this->downloadAndSaveImage($imgUrl, $imagePath);
+
+            $pageData[] = [
+                'image_url' => $imgUrl,
+                'local_image_url' => sprintf('/manga-images/%s/%s/%s', $mangaTitle, $chapterNumber, $imageName),
+                'page_number' => $pageNumber,
+            ];
+        }
+
+        $event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData);
+        $this->eventDispatcher->dispatch($event, MangaScrapedEvent::NAME);
+
+        return $pageData;
+    }

    /**
     * @throws GuzzleException
     */
-    private function isChapterAvailable(string $chapterUrl, float $chapterNumber): bool
-	{
-		$html = $this->fetchHtml($chapterUrl);
-		$crawler = new Crawler($html);
-		$nextLink = $crawler->filter('a[title="Suivant"]');
+    private function isChapterAvailable(string $chapterUrl, float $chapterNumber, ContentSource $mangaSource): bool
+    {
+        // Reports whether the page at $chapterUrl has a next-page link whose path
+        // matches /scan-{manga}/{chapter}/{page} with {chapter} equal to
+        // $chapterNumber. Any other situation (no link, no path, non-matching path)
+        // yields false rather than an exception.
+        $html = $this->fetchHtml($chapterUrl);
+        $crawler = new Crawler($html);
+        $nextLink = $crawler->filter($mangaSource->getNextPageSelector());

-		if($nextLink->count() === 0){
-			return false;
-		}else{
-			$nextUrl = $nextLink->attr('href');
-		}
+        if ($nextLink->count() === 0) {
+            return false;
+        }

-		$routeCollection = new RouteCollection();
-		$routeCollection->add('manga_chapter', new Route('/scan-{manga}/{chapter}/{page}'));
-		$context = new RequestContext('/');
-		$matcher = new UrlMatcher($routeCollection, $context);
-		$path = parse_url($nextUrl, PHP_URL_PATH);
-		$parameters = $matcher->match($path);
+        // A link without an href, or an href without a path component, gives the
+        // matcher nothing to work with.
+        $path = parse_url((string) $nextLink->attr('href'), PHP_URL_PATH);
+        if (!is_string($path)) {
+            return false;
+        }
+
+        $routeCollection = new RouteCollection();
+        $routeCollection->add('manga_chapter', new Route('/scan-{manga}/{chapter}/{page}'));
+        $matcher = new UrlMatcher($routeCollection, new RequestContext('/'));

-		if((float) $parameters['chapter'] !== $chapterNumber){
-			return false;
-		}
-
-		return true;
-	}
+        try {
+            $parameters = $matcher->match($path);
+        } catch (\Symfony\Component\Routing\Exception\ResourceNotFoundException) {
+            // The next link does not follow the expected URL pattern.
+            return false;
+        }
+
+        return (float)$parameters['chapter'] === $chapterNumber;
+    }
 }