getManga();
        $chapterUrl = $this->getValidChapterUrl($contentSource, $manga, $chapter->getNumber());

        if (!$chapterUrl) {
            throw new \Exception("Aucune URL valide trouvée pour le chapitre {$chapter->getNumber()} du manga {$manga->getTitle()}");
        }

        $tempDir = sys_get_temp_dir().'/'.uniqid('manga_scraper_');
        mkdir($tempDir);

        $pageData = [];
        if (null === $contentSource->getNextPageSelector()) {
            // Vertical reader
            $html = $this->fetchHtml($chapterUrl);
            $pageData = $this->scrapeVerticalReader($html, $contentSource);
        } else {
            // Horizontal (paginated) reader
            $pageData = $this->scrapeHorizontalReader($chapterUrl, $contentSource);
        }

        // Download and save the images
        foreach ($pageData as $index => &$page) {
            $imageName = sprintf('%03d.%s', $index + 1, pathinfo(parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION));
            $imagePath = $tempDir.'/'.$imageName;
            $destinationPath = $this->downloadAndSaveImage($page['image_url'], $imagePath);
            $this->dispatchProgressEvent($chapter, $index + 1, count($pageData));
            $page['local_image_url'] = $destinationPath;
        }

        $cbzFilePath = $this->generateCbzPath($manga, $chapter);
        $this->createCbzFile($pageData, $cbzFilePath);

        $chapter->setCbzPath($cbzFilePath);
        $this->entityManager->persist($chapter);
        $this->entityManager->flush();

        $this->cleanupTempFiles($tempDir);

        return $pageData;
    }

    /**
     * Scrapes the page list of one chapter and returns it as-is.
     *
     * The chapter URL is built by the content source from the slug and chapter
     * number, then validated. The reader is chosen from the source
     * configuration: vertical (single document) when no next-page selector is
     * set, horizontal (paginated) otherwise.
     *
     * @return array page list produced by the selected reader
     *
     * @throws \Exception when the chapter URL is not valid or a page cannot be fetched
     */
    public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
    {
        $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber);

        if (!$this->isChapterUrlValid($chapterUrl)) {
            throw new \Exception('Invalid URL, check format and slug');
        }

        if (null !== $contentSource->getNextPageSelector()) {
            // The paginated reader fetches every page itself, starting at the
            // chapter URL, so fetching it here as well would be a wasted request.
            return $this->scrapeHorizontalReader($chapterUrl, $contentSource);
        }

        return $this->scrapeVerticalReader($this->fetchHtml($chapterUrl), $contentSource);
    }

    /**
     * Tells whether this scraper handles the given scraping type.
     *
     * @return bool true only for the 'html' type
     */
    public function supports(string $scrapingType): bool
    {
        return 'html' === $scrapingType;
    }

    /**
     * Collects the page images found in a single HTML document (vertical reader).
     *
     * @return array list of ['image_url' => ..., 'page_number' => int]
     */
    private function 
scrapeVerticalReader(string $html, ContentSource $contentSource): array
    {
        // Vertical reader: every page image lives in the same HTML document.
        $crawler = new Crawler($html);
        $pageData = [];

        foreach ($crawler->filter($contentSource->getImageSelector()) as $image) {
            // DOMElement::getAttribute() returns '' for a missing attribute, so
            // ?: falls back to data-src (lazy-loaded images) when src is absent or empty.
            $imgUrl = $image->getAttribute('src') ?: $image->getAttribute('data-src');

            if ('' === trim($imgUrl)) {
                // A matched node without any URL cannot be downloaded; skip it
                // rather than emitting an empty page entry.
                continue;
            }

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($imgUrl),
                // Numbered by accepted pages so skipped nodes leave no gap
                'page_number' => count($pageData) + 1,
            ];
        }

        return $pageData;
    }

    /**
     * Walks a paginated (horizontal) reader, collecting one image per page.
     *
     * Starts at the chapter URL and follows the "next page" link extracted from
     * each page. The walk stops when a page has no next link, or when the next
     * link leads to a URL that was already visited (which would loop forever).
     *
     * @return array list of ['image_url' => ..., 'page_number' => int]
     *
     * @throws \Exception when a page cannot be fetched or holds no image
     */
    private function scrapeHorizontalReader(string $chapterUrl, ContentSource $contentSource): array
    {
        $pageData = [];
        $visitedUrls = [];
        $currentPageUrl = $chapterUrl;

        do {
            $visitedUrls[$currentPageUrl] = true;

            $html = $this->fetchHtml($currentPageUrl);
            $page = $this->extractMangaPageData($html, $contentSource);

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($page['image_url']),
                'page_number' => count($pageData) + 1,
            ];

            $currentPageUrl = $page['next_page_url'];
        } while ($currentPageUrl && !isset($visitedUrls[$currentPageUrl]));

        return $pageData;
    }

    /**
     * Downloads the body of the given URL as a string.
     *
     * Redirects are not followed; a 3xx or 404 answer is reported as a missing
     * chapter. Every failure is rethrown as a generic exception whose message is
     * prefixed with "Bad Request: ", with the original exception chained as
     * "previous" so the root cause stays available.
     *
     * @throws \Exception on any HTTP or transport failure
     */
    private function fetchHtml(string $url): string
    {
        try {
            $response = $this->httpClient->get($url, [
                // NOTE(review): presumably a Guzzle client, where this option makes
                // 4xx/5xx responses throw; the 404 check below is kept as a safety net.
                'http_errors' => true,
                'allow_redirects' => false,
            ]);

            $statusCode = $response->getStatusCode();
            if (($statusCode >= 300 && $statusCode < 400) || 404 === $statusCode) {
                throw new \Exception('Chapter Not Found at '.$url);
            }

            return (string) $response->getBody();
        } catch (\Exception $e) {
            throw new \Exception('Bad Request: '.$e->getMessage(), 0, $e);
        }
    }

    /**
     * Extracts the image URL and the "next page" link from one paginated reader page.
     *
     * The image URL is read from src, falling back to data-src, and is made
     * absolute against the content source base URL when it is relative or
     * protocol-relative. A next link that is empty, fragment-only ("#...") or a
     * "javascript:" pseudo-link is reported as no next page.
     *
     * @return array ['image_url' => string, 'next_page_url' => string|null]
     *
     * @throws \Exception when no image URL is found or the base URL has no scheme/host
     */
    private function extractMangaPageData(string $html, ContentSource $mangaSource): array
    {
        $crawler = new Crawler($html);

        // Crawler::attr() throws on an empty node list, so check the match first
        $imageNode = $crawler->filter($mangaSource->getImageSelector());
        if (0 === $imageNode->count()) {
            throw new \Exception('No image found with selector "'.$mangaSource->getImageSelector().'"');
        }

        // ?: (not ??) so an empty src placeholder still falls back to data-src,
        // matching what the vertical reader does.
        $imgUrl = trim((string) ($imageNode->attr('src') ?: $imageNode->attr('data-src')));
        if ('' === $imgUrl) {
            throw new \Exception('Image matched by selector "'.$mangaSource->getImageSelector().'" has no src or data-src');
        }

        $nextUrl = null;
        $nextLink = $crawler->filter($mangaSource->getNextPageSelector());
        if ($nextLink->count() > 0) {
            $href = trim((string) $nextLink->attr('href'));
            if ('' !== $href && '#' !== $href[0] && 0 !== stripos($href, 'javascript:')) {
                $nextUrl = $href;
            }
        }

        // Convert relative URLs to absolute URLs
        if (!preg_match('/^https?:\/\//i', $imgUrl)) {
            $urlComponents = parse_url((string) $mangaSource->getBaseUrl());
            if (!is_array($urlComponents) || !isset($urlComponents['scheme'], $urlComponents['host'])) {
                throw new \Exception('Content source base URL must contain a scheme and a host');
            }

            if (0 === strpos($imgUrl, '//')) {
                // Protocol-relative URL: it already names its host, only the scheme is missing
                $imgUrl = $urlComponents['scheme'].':'.$imgUrl;
            } else {
                $origin = $urlComponents['scheme'].'://'.$urlComponents['host'];
                if (isset($urlComponents['port'])) {
                    $origin .= ':'.$urlComponents['port'];
                }
                $imgUrl = $origin.'/'.ltrim($imgUrl, '/');
            }
        }

        return [
            'image_url' => $imgUrl,
            'next_page_url' => $nextUrl,
        ];
    }
}