filter($mangaSource->getImageSelector())->attr('src') ?? $crawler->filter($mangaSource->getImageSelector())->attr('data-src');
        // dd($imgUrl);
        // if (empty($imgUrl)) {
        //     throw new \Exception('No valid image found on the page.');
        // }

        $nextLink = $crawler->filter($mangaSource->getNextPageSelector());
        $nextUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;

        // Convert relative URLs to absolute URLs
        if (!preg_match('/^https?:\/\//', $imgUrl)) {
            $urlComponents = parse_url($mangaSource->getBaseUrl());
            $scheme = $urlComponents['scheme'];
            $host = $urlComponents['host'];
            $imgUrl = $scheme . '://' . $host . '/' . ltrim($imgUrl, '/');
        }

        return [
            'image_url' => $imgUrl,
            'next_page_url' => $nextUrl,
        ];
    }

    /**
     * Scrapes every chapter of the given manga from the given source.
     *
     * @return array scrape results keyed by chapter number; chapters whose scrape returned false are omitted
     *
     * @throws GuzzleException
     * @throws Exception
     */
    public function scrapeManga(Manga $manga, ContentSource $mangaSource): array
    {
        $allChaptersData = [];

        foreach ($manga->getChapters() as $chapter) {
            $chapterData = $this->scrapeChapter($chapter, $mangaSource);

            if ($chapterData !== false) {
                $allChaptersData[$chapter->getNumber()] = $chapterData;
            }
        }

        return $allChaptersData;
    }

    /**
     * Dispatches the scrape of one chapter to the strategy matching the source's scraping type.
     *
     * @return array|bool page data (html / javascript), true (mangadex) or false when the javascript strategy gave up
     *
     * @throws GuzzleException
     * @throws Exception when the scraping type is not supported
     */
    public function scrapeChapter(Chapter $chapter, ContentSource $mangaSource): array|bool
    {
        return match ($mangaSource->getScrapingType()) {
            'html' => $this->scrapeChapterHtml($chapter->getManga(), $chapter, $mangaSource),
            'javascript' => $this->scrapeChapterJavascript($chapter->getManga(), $chapter, $mangaSource),
            'mangadex' => $this->scrapeChapterMangadex($chapter, $mangaSource),
            default => throw new Exception('Unsupported scraping type: ' . $mangaSource->getScrapingType()),
        };
    }

    /**
     * Downloads the "data-saver" pages of a chapter from the Mangadex API, packs them into a CBZ
     * archive and stores the archive path on the chapter.
     *
     * @throws GuzzleException
     * @throws Exception when the API response is malformed, not "ok", or lists no pages
     */
    private function scrapeChapterMangadex(Chapter $chapter, ContentSource $mangaSource): bool
    {
        $client = new Client();
        $manga = $chapter->getManga();
        $chapterUrl = $mangaSource->getBaseUrl() . sprintf($mangaSource->getChapterUrlFormat(), $chapter->getExternalId());

        $response = $client->get($chapterUrl);
        $results = json_decode($response->getBody()->getContents(), true);

        // Validate the whole shape up front so a malformed payload yields the domain error
        // below rather than "undefined index" warnings or a TypeError further down.
        $pages = is_array($results) ? ($results['chapter']['dataSaver'] ?? null) : null;
        $isUsable = is_array($results)
            && ($results['result'] ?? null) === 'ok'
            && isset($results['baseUrl'], $results['chapter']['hash'])
            && is_array($pages)
            && count($pages) > 0;

        if (!$isUsable) {
            throw new Exception('Error while fetching chapter data from Mangadex ' . $manga->getTitle() . ' ' . $chapter->getNumber());
        }

        $pages = array_values($pages);
        $pageCount = count($pages);
        $pageData = [];

        $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
        mkdir($tempDir);

        try {
            foreach ($pages as $index => $page) {
                $pageNumber = $index + 1;
                $pageUrl = $results['baseUrl'] . '/data-saver/' . $results['chapter']['hash'] . '/' . $page;
                $imagePath = $tempDir . '/' . sprintf('%03d.%s', $pageNumber, pathinfo($page, PATHINFO_EXTENSION));

                $this->downloadAndSaveImage($pageUrl, $imagePath);

                $event = new PageScrappingProgressEvent($chapter->getId(), $pageNumber, $pageCount);
                $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);

                $pageData[] = [
                    'image_url' => $pageUrl,
                    'local_image_url' => $imagePath,
                    'page_number' => $pageNumber,
                ];
            }

            $cbzFilePath = $this->generateCbzPath($manga, $chapter);
            $this->createCbzFile($tempDir, $pageData, $cbzFilePath);

            $chapter->setCbzPath($cbzFilePath);
            $this->entityManager->persist($chapter);
            $this->entityManager->flush();
        } finally {
            // Remove the temporary directory even when a download or the archive creation fails.
            $this->cleanupTempFiles($tempDir);
        }

        return true;
    }

    /**
     * Scrapes a chapter from a JavaScript-driven reader using a headless Chrome client.
     *
     * When the page exposes a "#selectChapitres" drop-down, the option matching the chapter
     * number is selected first and the method waits for the lazy images to finish loading.
     *
     * @return array|bool list of ['image_url' => ..., 'page_number' => ...] entries, or false when
     *                    the chapter selection step failed
     *
     * @throws \Exception when reading the pages fails after the chapter was selected
     */
    private function scrapeChapterJavascript(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
    {
        $pantherClient = PantherClient::createChromeClient();
        $chapterUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapter->getNumber());
        $pantherClient->request('GET', $chapterUrl);

        // Select the chapter in the drop-down menu, when there is one.
        try {
            $crawler = $pantherClient->waitFor('body');
            $select = $crawler->filter('#selectChapitres');

            if ($select->count() > 0) {
                $chapterNumber = $chapter->getNumber();
                $options = $select->filter('option');
                $targetIndex = null;
                // Quote the number so a "." in e.g. "10.5" is matched literally, not as a wildcard.
                $chapterPattern = '/\b' . preg_quote((string) $chapterNumber, '/') . '\b/';

                /** @var RemoteWebElement $option */
                foreach ($options->getIterator() as $index => $option) {
                    if (preg_match($chapterPattern, $option->getText())) {
                        $targetIndex = $index;
                        break;
                    }
                }

                if ($targetIndex === null) {
                    throw new \Exception("Chapitre $chapterNumber non trouvé dans le menu déroulant");
                }

                $pantherClient->executeScript("
                    var select = document.querySelector('#selectChapitres');
                    select.selectedIndex = {$targetIndex};
                    select.dispatchEvent(new Event('change'));
                ");

                // Wait (up to 60 seconds; the timeout argument is expressed in seconds)
                // for the page to refresh after the selection.
                $pantherClient->wait(60)->until(
                    function ($driver) {
                        return $driver->executeScript("
                            var scansPlacement = document.querySelector('#scansPlacement');
                            if (!scansPlacement) return false;

                            var lazyImages = scansPlacement.querySelectorAll('img.lazy');
                            var loadingGif = scansPlacement.querySelector('img[src*=\"loading_scans.gif\"]');

                            // Vérifier que toutes les images lazy sont chargées et que le GIF de chargement n'est plus présent
                            var allImagesLoaded = Array.from(lazyImages).every(img => img.complete && img.naturalWidth > 0);

                            return lazyImages.length > 0 && allImagesLoaded && !loadingGif;
                        ");
                    }
                );
            }
        } catch (\Exception $e) {
            // Chapter selection is best-effort: release the browser and report failure to the caller.
            $pantherClient->close();

            return false;
        }

        try {
            if ($mangaSource->getNextPageSelector() === null) {
                // Vertical reader: every page is present on a single document.
                return $this->scrapeVerticalReaderJavascript($pantherClient, $mangaSource, $chapter);
            }

            // Horizontal reader: one page per view, navigated with a "next" control.
            return $this->scrapeHorizontalReaderJavascript($pantherClient, $mangaSource, $chapter);
        } finally {
            $pantherClient->close();
        }
    }

    /**
     * Collects the image URLs of a vertical (single document) reader rendered in the browser.
     *
     * @return array list of ['image_url' => ..., 'page_number' => ...] entries
     */
    private function scrapeVerticalReaderJavascript(PantherClient $pantherClient, ContentSource $mangaSource, Chapter $chapter): array
    {
        $pageData = [];
        $pageNumber = 1;

        $crawler = $pantherClient->waitFor($mangaSource->getImageSelector());
        $images = $crawler->filter($mangaSource->getImageSelector());
        $totalPages = $images->count();

        foreach ($images->getIterator() as $image) {
            // Lazy-loaded images keep their real URL in "data-src" until they are displayed.
            $imageUrl = $image->getAttribute('src') ?: $image->getAttribute('data-src');

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($imageUrl),
                'page_number' => $pageNumber,
            ];

            $event = new PageScrappingProgressEvent($chapter->getId(), $pageNumber, $totalPages);
            $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);

            $pageNumber++;
        }

        return $pageData;
    }

    /**
     * Collects the image URLs of a horizontal (paginated) reader by clicking the "next" control
     * until either the image or the control can no longer be found.
     *
     * NOTE(review): the loop only ends when the image or the next control disappears; a reader
     * that keeps showing its next control on the last page would never terminate - confirm
     * against the configured sources.
     *
     * @return array list of ['image_url' => ..., 'page_number' => ...] entries
     */
    private function scrapeHorizontalReaderJavascript(PantherClient $pantherClient, ContentSource $mangaSource, Chapter $chapter): array
    {
        $pageData = [];
        $pageNumber = 1;

        while (true) {
            $crawler = $pantherClient->waitFor($mangaSource->getImageSelector());
            $imageElement = $crawler->filter($mangaSource->getImageSelector())->first();

            if ($imageElement->count() === 0) {
                break; // End of the chapter
            }

            $imageUrl = $imageElement->attr('src') ?: $imageElement->attr('data-src');

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($imageUrl),
                'page_number' => $pageNumber,
            ];

            // The total page count is not known in advance for this reader, hence 0.
            $event = new PageScrappingProgressEvent($chapter->getId(), $pageNumber, 0);
            $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);

            // Move on to the next page, using the crawler of the page currently displayed.
            $nextButton = $crawler->filter($mangaSource->getNextPageSelector());

            if ($nextButton->count() === 0) {
                break; // No next control: end of the chapter
            }

            $nextButton->click();

            // Wait for the page to change, then resynchronise the client's crawler with it.
            $pantherClient->waitFor($mangaSource->getImageSelector(), 10);
            $pantherClient->refreshCrawler();

            $pageNumber++;
        }

        return $pageData;
    }

    /**
     * Builds the command line for the Puppeteer helper script and decodes its JSON output.
     *
     * NOTE(review): the exec() call is disabled in this file, so no process is started and the
     * method currently always returns an empty array.
     *
     * @return array decoded script output, or an empty array when there is no valid JSON output
     */
    private function fetchImagesUsingPuppeteer(string $url, string $imageSelector, string $nextButtonSelector): array
    {
        $output = [];

        // Arguments are shell-escaped so that URLs or selectors cannot inject shell syntax.
        // stderr is redirected to stdout.
        $command = sprintf(
            'node puppeteer-script.js %s %s %s 2>&1',
            escapeshellarg($url),
            escapeshellarg($imageSelector),
            escapeshellarg($nextButtonSelector)
        );
        // exec($command, $output, $return_var);

        // Convert the JSON output into a PHP array.
        $decoded = json_decode(implode('', $output), true);

        return is_array($decoded) ? $decoded : [];
    }

    /**
     * Runs a trial scrape of one chapter with the strategy matching the source's scraping type.
     *
     * @throws GuzzleException
     * @throws Exception when the scraping type is not supported or the scrape fails
     */
    public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
    {
        return match ($contentSource->getScrapingType()) {
            'html' => $this->testScrapingHtml($mangaSlug, $chapterNumber, $contentSource),
            'javascript' => $this->testScrapingJavascript($mangaSlug, $chapterNumber, $contentSource),
            default => throw new Exception('Unsupported scraping type: ' . $contentSource->getScrapingType()),
        };
    }

    /**
     * Trial scrape of a stored chapter through the JavaScript (browser) strategy.
     *
     * @throws Exception when the manga or chapter is unknown, or when the scrape fails
     */
    public function testScrapingJavascript(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
    {
        $manga = $this->mangaRepository->findOneBy(['slug' => $mangaSlug]);
        if ($manga === null) {
            throw new Exception('Manga not found: ' . $mangaSlug);
        }

        $chapter = $manga->getChapterByNumber($chapterNumber);
        if ($chapter === null) {
            throw new Exception('Chapter ' . $chapterNumber . ' not found for manga ' . $mangaSlug);
        }

        $pageData = $this->scrapeChapterJavascript($manga, $chapter, $contentSource);

        // scrapeChapterJavascript() reports a failed chapter selection with false, which this
        // method's array return type cannot carry.
        if (!is_array($pageData)) {
            throw new Exception('JavaScript scraping failed for chapter ' . $chapterNumber . ' of manga ' . $mangaSlug);
        }

        return $pageData;
    }

    /**
     * Trial scrape of a chapter through plain HTTP requests; nothing is downloaded or stored.
     *
     * @throws GuzzleException
     * @throws Exception
     */
    public function testScrapingHtml(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
    {
        $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber);
        // Fetched in both modes, as before: this fails early when the chapter URL is unreachable.
        $html = $this->fetchHtml($chapterUrl);

        if ($contentSource->getNextPageSelector() === null) {
            return $this->scrapeVerticalReader($html, $contentSource);
        }

        return $this->scrapeHorizontalReader($chapterUrl, $contentSource);
    }

    /**
     * Scrapes a chapter through plain HTTP requests, downloads its images, packs them into a CBZ
     * archive and stores the archive path on the chapter.
     *
     * @return array|bool page data, each entry extended with its 'local_image_url'
     *
     * @throws GuzzleException
     * @throws Exception
     */
    private function scrapeChapterHtml(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
    {
        $chapterUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapter->getNumber());

        $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
        mkdir($tempDir);

        try {
            if ($mangaSource->getNextPageSelector() === null) {
                // Vertical reader
                $html = $this->fetchHtml($chapterUrl);
                $pageData = $this->scrapeVerticalReader($html, $mangaSource);
            } else {
                // Horizontal (paginated) reader
                $pageData = $this->scrapeHorizontalReader($chapterUrl, $mangaSource);
            }

            // Download and save the images.
            $pageCount = count($pageData);
            foreach ($pageData as $index => &$page) {
                $imageName = sprintf('%03d.%s', $index + 1, pathinfo(parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION));
                $imagePath = $tempDir . '/' . $imageName;

                $this->downloadAndSaveImage($page['image_url'], $imagePath);

                $event = new PageScrappingProgressEvent($chapter->getId(), $index + 1, $pageCount);
                $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);

                $page['local_image_url'] = $imagePath;
            }
            // Break the reference so later use of $pageData cannot alias its last element.
            unset($page);

            $cbzFilePath = $this->generateCbzPath($manga, $chapter);
            $this->createCbzFile($tempDir, $pageData, $cbzFilePath);

            $chapter->setCbzPath($cbzFilePath);
            $this->entityManager->persist($chapter);
            $this->entityManager->flush();
        } finally {
            // Remove the temporary directory even when scraping or a download fails.
            $this->cleanupTempFiles($tempDir);
        }

        return $pageData;
    }

    /**
     * Extracts every image of a vertical reader from an already fetched HTML document.
     *
     * @return array list of ['image_url' => ..., 'page_number' => ...] entries
     */
    private function scrapeVerticalReader(string $html, ContentSource $contentSource): array
    {
        $crawler = new Crawler($html);
        $images = $crawler->filter($contentSource->getImageSelector());
        $pageData = [];

        foreach ($images as $index => $image) {
            // Fall back to "data-src" only when "src" is empty.
            $imgUrl = $image->getAttribute('src') === ''
                ? $image->getAttribute('data-src')
                : $image->getAttribute('src');

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($imgUrl),
                'page_number' => $index + 1,
            ];
        }

        return $pageData;
    }

    /**
     * Walks a paginated reader page by page, following each page's "next" link.
     *
     * @return array list of ['image_url' => ..., 'page_number' => ...] entries
     *
     * @throws GuzzleException
     * @throws Exception
     */
    private function scrapeHorizontalReader(string $chapterUrl, ContentSource $contentSource): array
    {
        $pageData = [];
        $visitedUrls = [];
        $currentPageUrl = $chapterUrl;

        do {
            $visitedUrls[$currentPageUrl] = true;

            $html = $this->fetchHtml($currentPageUrl);
            $page = $this->extractMangaPageData($html, $contentSource);

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($page['image_url']),
                'page_number' => count($pageData) + 1,
            ];

            $currentPageUrl = $page['next_page_url'];
            // Stop on a missing next link, and also on a link back to a page already scraped:
            // following it again would loop forever.
        } while ($currentPageUrl && !isset($visitedUrls[$currentPageUrl]));

        return $pageData;
    }

    /**
     * Downloads a single image into $tempDir and appends its entry to $pageData.
     *
     * $chapter is currently unused: the progress event that needed it is disabled.
     *
     * @throws GuzzleException
     */
    private function processImage(string $imgUrl, string $tempDir, array &$pageData, int $index, Chapter $chapter): void
    {
        $imgUrl = $this->cleanImageUrl($imgUrl);
        $pageNumber = $index + 1;
        $imageName = sprintf('%03d.%s', $pageNumber, pathinfo(parse_url($imgUrl, PHP_URL_PATH), PATHINFO_EXTENSION));
        $imagePath = $tempDir . '/' . $imageName;

        $this->downloadAndSaveImage($imgUrl, $imagePath);

        $pageData[] = [
            'image_url' => $imgUrl,
            'local_image_url' => $imagePath,
            'page_number' => $pageNumber,
        ];
    }

    /**
     * Trims the URL and strips ASCII control characters (0x00-0x1F and 0x7F) from it.
     */
    private function cleanImageUrl(string $url): string
    {
        return preg_replace('/[\x00-\x1F\x7F]/', '', trim($url));
    }

    /**
     * Fetches a page and returns its body. Redirects are not followed: a 3xx answer is treated
     * as a missing chapter.
     *
     * @throws GuzzleException
     * @throws Exception on any request failure, HTTP error status or redirect
     */
    private function fetchHtml(string $url): string
    {
        $client = new Client();

        try {
            // With 'http_errors' enabled Guzzle itself throws on 4xx / 5xx responses, so only
            // redirects (which are not followed here) need an explicit check.
            $response = $client->get($url, [
                'http_errors' => true,
                'allow_redirects' => false,
            ]);

            $statusCode = $response->getStatusCode();
            if ($statusCode >= 300 && $statusCode < 400) {
                throw new Exception('Chapter Not Found at ' . $url);
            }

            return (string) $response->getBody();
        } catch (Exception $e) {
            // Keep the original exception attached for debugging.
            throw new Exception('Bad Request: ' . $e->getMessage(), 0, $e);
        }
    }

    /**
     * Downloads an image and writes it to $destinationPath.
     *
     * @throws GuzzleException
     * @throws \Exception when the request fails, the response is not an image, or the file cannot be written
     */
    private function downloadAndSaveImage(string $imageUrl, string $destinationPath): void
    {
        $client = new Client();

        try {
            $response = $client->get($imageUrl);
        } catch (RequestException $e) {
            throw new \Exception('Erreur lors de la récupération de l\'image : ' . $e->getMessage(), 0, $e);
        }

        $contentType = $response->getHeaderLine('Content-Type');
        if (!str_starts_with($contentType, 'image/')) {
            throw new \Exception('Le contenu récupéré n\'est pas une image. Type de contenu : ' . $contentType);
        }

        // A failed write would otherwise go unnoticed until the archive is built.
        if (file_put_contents($destinationPath, $response->getBody()->getContents()) === false) {
            throw new \Exception('Erreur lors de l\'enregistrement de l\'image : ' . $destinationPath);
        }
    }

    /**
     * Tells whether the "next" link of the fetched page points to the expected chapter number.
     *
     * The link's path is matched against the route "/scan-{manga}/{chapter}/{page}"; the matcher
     * throws when the path does not fit that pattern.
     *
     * @throws GuzzleException
     * @throws Exception
     */
    private function isChapterAvailable(string $chapterUrl, float $chapterNumber, ContentSource $mangaSource): bool
    {
        $html = $this->fetchHtml($chapterUrl);
        $crawler = new Crawler($html);

        $nextLink = $crawler->filter($mangaSource->getNextPageSelector());
        if ($nextLink->count() === 0) {
            return false;
        }

        $nextUrl = $nextLink->attr('href');

        $routeCollection = new RouteCollection();
        $routeCollection->add('manga_chapter', new Route('/scan-{manga}/{chapter}/{page}'));

        $matcher = new UrlMatcher($routeCollection, new RequestContext('/'));
        $parameters = $matcher->match(parse_url($nextUrl, PHP_URL_PATH));

        return (float) $parameters['chapter'] === $chapterNumber;
    }

    /**
     * Posts a download report (url, success, cache status, size, duration) to the Mangadex
     * network report endpoint.
     *
     * @throws \Exception when the report request fails
     */
    private function sendReport(string $imageUrl, bool $success, bool $cached, int $bytes, float $duration): void
    {
        $client = new Client();

        try {
            $client->post('https://api.mangadex.network/report', [
                'headers' => [
                    'Content-Type' => 'application/json',
                ],
                'json' => [
                    'url' => $imageUrl,
                    'success' => $success,
                    'cached' => $cached,
                    'bytes' => $bytes,
                    'duration' => $duration,
                ],
            ]);
        } catch (RequestException $e) {
            throw new \Exception('Erreur lors de l\'envoi du rapport : ' . $e->getMessage(), 0, $e);
        }
    }

    /**
     * Packs the downloaded page images into a CBZ (zip) archive.
     *
     * $tempDir is not read here: each entry of $pageData carries its own 'local_image_url'.
     *
     * @throws \Exception when the archive cannot be created or written
     */
    private function createCbzFile(string $tempDir, array $pageData, string $cbzFilePath): void
    {
        $zip = new \ZipArchive();

        // OVERWRITE starts from an empty archive, so pages left over from an earlier scrape of
        // the same chapter cannot survive in the new file.
        if ($zip->open($cbzFilePath, \ZipArchive::CREATE | \ZipArchive::OVERWRITE) !== true) {
            throw new \Exception('Unable to create CBZ archive at ' . $cbzFilePath);
        }

        foreach ($pageData as $page) {
            $zip->addFile($page['local_image_url'], basename($page['local_image_url']));
        }

        // The archive is only written to disk on close, so this is where write errors surface.
        if (!$zip->close()) {
            throw new \Exception('Unable to write CBZ archive at ' . $cbzFilePath);
        }
    }

    /**
     * Returns the archive path "<volume dir>/<slug>_vol<volume>_ch<number>.cbz", creating the
     * volume directory when needed.
     */
    private function generateCbzPath(Manga $manga, Chapter $chapter): string
    {
        $volumeDir = $this->createDirectories($manga, $chapter->getVolume());
        $fileName = sprintf(
            '%s_vol%d_ch%s.cbz',
            $manga->getSlug(),
            $chapter->getVolume(),
            $chapter->getNumber()
        );

        return $volumeDir . '/' . $fileName;
    }

    /**
     * Ensures "<project dir><PUBLIC_CBZ>/<Slug> (<year>)/volume_<n>" exists and returns its path.
     *
     * @throws \RuntimeException when the directory cannot be created
     */
    private function createDirectories(Manga $manga, int $volume): string
    {
        $mangaYear = $manga->getPublicationYear() ?? 'unknown';
        $mangaDir = sprintf('%s/%s (%s)', $this->projectDir . self::PUBLIC_CBZ, ucfirst($manga->getSlug()), $mangaYear);
        // NOTE(review): the previous code zero-padded the volume and then re-formatted it with
        // %d, which dropped the padding again. The unpadded name is kept on purpose so existing
        // archive paths stay valid.
        $volumeDir = sprintf('%s/volume_%d', $mangaDir, $volume);

        // The second is_dir() covers a concurrent process creating the directory in between.
        if (!is_dir($volumeDir) && !mkdir($volumeDir, 0755, true) && !is_dir($volumeDir)) {
            throw new \RuntimeException('Unable to create directory ' . $volumeDir);
        }

        return $volumeDir;
    }

    /**
     * Deletes the regular files directly inside $directory, then the directory itself.
     */
    private function cleanupTempFiles(string $directory): void
    {
        if (!is_dir($directory)) {
            return;
        }

        // glob() returns false on error; treat that as "nothing to delete".
        foreach (glob($directory . '/*') ?: [] as $file) {
            if (is_file($file)) {
                unlink($file);
            }
        }

        rmdir($directory);
    }
}