client = new Client(); }

    /**
     * Scrapes every page of a chapter, downloads the page images into a
     * temporary directory, packs them into a CBZ archive and stores the
     * archive path on the chapter entity.
     *
     * The temporary directory is always removed before returning or throwing,
     * so the 'local_image_url' entries in the returned data point to files
     * that no longer exist once this method returns.
     *
     * @param Chapter       $chapter       Chapter to scrape; its CBZ path is updated and flushed.
     * @param ContentSource $contentSource Source describing the selectors to use.
     *
     * @return array|bool List of pages, each with 'image_url', 'page_number' and
     *                    'local_image_url'. The code below only ever returns an
     *                    array; `bool` is kept in the signature for compatibility.
     *
     * @throws Exception When no valid chapter URL exists, or when fetching,
     *                   downloading or writing fails.
     */
    public function scrapeChapter(Chapter $chapter, ContentSource $contentSource): array|bool
    {
        $manga = $chapter->getManga();
        $chapterUrl = $this->getValidChapterUrl($contentSource, $manga, $chapter->getNumber());

        if (!$chapterUrl) {
            throw new Exception("Aucune URL valide trouvée pour le chapitre {$chapter->getNumber()} du manga {$manga->getTitle()}");
        }

        if ($contentSource->getNextPageSelector() === null) {
            // Vertical reader: every image is on a single HTML page.
            $pageData = $this->scrapeVerticalReader($this->fetchHtml($chapterUrl), $contentSource);
        } else {
            // Horizontal (paginated) reader: one image per HTML page.
            $pageData = $this->scrapeHorizontalReader($chapterUrl, $contentSource);
        }

        // Create the working directory only once scraping succeeded, so a
        // scraping failure cannot leave an empty directory behind.
        $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
        if (!mkdir($tempDir) && !is_dir($tempDir)) {
            throw new Exception('Unable to create temporary directory ' . $tempDir);
        }

        try {
            $totalPages = count($pageData);

            // Download and save the images. Entries are updated through their
            // index rather than by reference, so no dangling reference to the
            // last element survives the loop.
            foreach ($pageData as $index => $page) {
                // parse_url() yields null/false when the URL has no usable path.
                $urlPath = (string) parse_url($page['image_url'], PHP_URL_PATH);
                $imageName = sprintf('%03d.%s', $index + 1, pathinfo($urlPath, PATHINFO_EXTENSION));
                $imagePath = $tempDir . '/' . $imageName;

                $this->downloadAndSaveImage($page['image_url'], $imagePath);
                $this->dispatchProgressEvent($chapter, $index + 1, $totalPages);

                $pageData[$index]['local_image_url'] = $imagePath;
            }

            $cbzFilePath = $this->generateCbzPath($manga, $chapter);
            $this->createCbzFile($tempDir, $pageData, $cbzFilePath);

            $chapter->setCbzPath($cbzFilePath);
            $this->entityManager->persist($chapter);
            $this->entityManager->flush();
        } finally {
            // Clean up the temporary directory on success and on failure alike.
            $this->cleanupTempFiles($tempDir);
        }

        return $pageData;
    }

    /**
     * Dry-run of the scraper: resolves the chapter URL from the source and
     * returns the scraped page list without downloading or persisting anything.
     *
     * @return array List of pages, each with 'image_url' and 'page_number'.
     *
     * @throws Exception When the chapter URL is rejected or a page cannot be fetched.
     */
    public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
    {
        $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber);

        if (!$this->isChapterUrlValid($chapterUrl)) {
            throw new \Exception("Invalid URL, check format and slug");
        }

        if ($contentSource->getNextPageSelector() === null) {
            return $this->scrapeVerticalReader($this->fetchHtml($chapterUrl), $contentSource);
        }

        // The paginated reader fetches the first page itself, so fetching it
        // here as well would only cost a redundant HTTP request.
        return $this->scrapeHorizontalReader($chapterUrl, $contentSource);
    }

    /**
     * Tells whether this scraper handles the given scraping type.
     *
     * @return bool True only for the 'html' type.
     */
    public function supports(string $scrapingType): bool
    {
        return $scrapingType === 'html';
    }

    /**
     * Extracts every image matched by the source's image selector from a
     * single HTML document (all pages of the chapter on one page).
     *
     * @return array List of pages, each with 'image_url' and a 1-based 'page_number'.
     */
    private function scrapeVerticalReader(string $html, ContentSource $contentSource): array
    {
        $crawler = new Crawler($html);
        $pageData = [];

        foreach ($crawler->filter($contentSource->getImageSelector()) as $image) {
            // Fall back to 'data-src' when 'src' is missing or empty.
            $imgUrl = $image->getAttribute('src') ?: $image->getAttribute('data-src');

            // An element carrying no source at all cannot be downloaded; skip it
            // instead of letting an empty URL abort the whole chapter later.
            if ($imgUrl === '') {
                continue;
            }

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($imgUrl),
                // Numbered from the collected pages so skipped nodes leave no gap.
                'page_number' => count($pageData) + 1,
            ];
        }

        return $pageData;
    }

    /**
     * Walks a paginated reader: fetches a page, records its image, then follows
     * the "next page" link until there is none.
     *
     * @return array List of pages, each with 'image_url' and a 1-based 'page_number'.
     *
     * @throws Exception When a page cannot be fetched or contains no image.
     */
    private function scrapeHorizontalReader(string $chapterUrl, ContentSource $contentSource): array
    {
        $pageData = [];
        $visitedUrls = [];
        $currentPageUrl = $chapterUrl;

        do {
            // Remember each fetched URL so a "next" link pointing back to an
            // already scraped page ends the walk instead of looping forever.
            $visitedUrls[$currentPageUrl] = true;

            $html = $this->fetchHtml($currentPageUrl);
            $page = $this->extractMangaPageData($html, $contentSource);

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($page['image_url']),
                'page_number' => count($pageData) + 1,
            ];

            $currentPageUrl = $page['next_page_url'];
        } while ($currentPageUrl && !isset($visitedUrls[$currentPageUrl]));

        return $pageData;
    }

    /**
     * Fetches the HTML body of a URL without following redirects.
     *
     * A 3xx answer is reported as a missing chapter. 4xx/5xx answers need no
     * explicit check: with 'http_errors' enabled the client throws for them.
     *
     * @throws Exception On any failure; the message is prefixed with
     *                   'Bad Request: ' and the cause is attached as previous.
     */
    private function fetchHtml(string $url): string
    {
        try {
            $response = $this->client->get($url, [
                'http_errors' => true,
                'allow_redirects' => false,
            ]);

            $statusCode = $response->getStatusCode();
            if ($statusCode >= 300 && $statusCode < 400) {
                throw new Exception('Chapter Not Found at ' . $url);
            }

            return (string) $response->getBody();
        } catch (Exception $e) {
            // Keep the original exception as "previous" so the cause stays traceable.
            throw new Exception('Bad Request: ' . $e->getMessage(), 0, $e);
        }
    }

    /**
     * Downloads an image and writes it to the given path.
     *
     * @throws Exception When the request fails, the response is not an image,
     *                   or the file cannot be written.
     */
    private function downloadAndSaveImage(string $imageUrl, string $destinationPath): void
    {
        try {
            $response = $this->client->get($imageUrl);
            $contentType = $response->getHeaderLine('Content-Type');

            // Media types are case-insensitive, so compare in lower case.
            if (!str_starts_with(strtolower($contentType), 'image/')) {
                throw new Exception('Le contenu récupéré n\'est pas une image. Type de contenu : ' . $contentType);
            }

            // file_put_contents() returns false on failure instead of throwing.
            if (file_put_contents($destinationPath, $response->getBody()->getContents()) === false) {
                throw new Exception('Unable to write image to ' . $destinationPath);
            }
        } catch (Exception $e) {
            throw new Exception('Erreur lors de la récupération de l\'image : ' . $e->getMessage(), 0, $e);
        }
    }

    /**
     * Extracts the image URL and the next-page URL from one page of a
     * paginated reader.
     *
     * @return array{image_url: string, next_page_url: ?string}
     *
     * @throws Exception When no image is found or the source base URL cannot be
     *                   used to resolve a relative URL.
     */
    private function extractMangaPageData(string $html, ContentSource $mangaSource): array
    {
        $crawler = new Crawler($html);

        $imageNode = $crawler->filter($mangaSource->getImageSelector());
        if ($imageNode->count() === 0) {
            // attr() on an empty node list would throw an opaque crawler error.
            throw new Exception('No image matched selector "' . $mangaSource->getImageSelector() . '"');
        }

        // Fall back to 'data-src' when 'src' is missing or empty, as the
        // vertical reader does.
        $imgUrl = $imageNode->attr('src') ?: $imageNode->attr('data-src');
        if ($imgUrl === null || $imgUrl === '') {
            throw new Exception('Matched image element has neither "src" nor "data-src"');
        }

        $nextLink = $crawler->filter($mangaSource->getNextPageSelector());
        $nextUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;

        // Only root-relative and protocol-relative links resolve unambiguously
        // against the source host; any other value is passed through untouched.
        if ($nextUrl !== null && str_starts_with($nextUrl, '/')) {
            $nextUrl = $this->resolveAgainstSourceHost($nextUrl, $mangaSource);
        }

        return [
            // Convert relative image URLs to absolute URLs.
            'image_url' => $this->resolveAgainstSourceHost($imgUrl, $mangaSource),
            'next_page_url' => $nextUrl,
        ];
    }

    /**
     * Turns a URL found in the page into an absolute one, using the scheme,
     * host and port of the source base URL. Absolute http(s) URLs are returned
     * unchanged.
     *
     * @throws Exception When the source base URL has no scheme or host.
     */
    private function resolveAgainstSourceHost(string $url, ContentSource $mangaSource): string
    {
        if (preg_match('/^https?:\/\//i', $url) === 1) {
            return $url;
        }

        $urlComponents = parse_url((string) $mangaSource->getBaseUrl());
        $scheme = is_array($urlComponents) ? ($urlComponents['scheme'] ?? null) : null;
        $host = is_array($urlComponents) ? ($urlComponents['host'] ?? null) : null;

        if ($scheme === null || $host === null) {
            throw new Exception('Source base URL has no scheme or host; cannot resolve ' . $url);
        }

        // Protocol-relative URL ("//cdn.example.com/a.jpg"): it already names
        // its own host, so only the scheme has to be added.
        if (str_starts_with($url, '//')) {
            return $scheme . ':' . $url;
        }

        $port = isset($urlComponents['port']) ? ':' . $urlComponents['port'] : '';

        return $scheme . '://' . $host . $port . '/' . ltrim($url, '/');
    }
}