Mangarr/src/Service/Scraper/HtmlScraper.php

<?php

namespace App\Service\Scraper;

use App\Entity\Chapter;
use App\Entity\ContentSource;
use GuzzleHttp\Exception\GuzzleException;
use Symfony\Component\DomCrawler\Crawler;

class HtmlScraper extends AbstractScraper
{
    /**
     * Scrapes all pages of a chapter, downloads every image into a temporary
     * directory, packs them into a CBZ archive and stores the archive path on
     * the chapter.
     *
     * The temporary directory is removed whether the operation succeeds or not.
     *
     * @return array|bool list of pages, each with the keys "image_url",
     *                    "page_number" and "local_image_url"
     *
     * @throws \Exception      when no valid chapter URL is found, the temporary
     *                         directory cannot be created, or a page cannot be fetched
     * @throws GuzzleException
     */
    public function scrapeChapter(Chapter $chapter, ContentSource $contentSource): array|bool
    {
        $manga = $chapter->getManga();
        $chapterUrl = $this->getValidChapterUrl($contentSource, $manga, $chapter->getNumber());

        if (!$chapterUrl) {
            throw new \Exception("Aucune URL valide trouvée pour le chapitre {$chapter->getNumber()} du manga {$manga->getTitle()}");
        }

        // No "next page" selector means every image is on one page (vertical
        // reader); otherwise the reader is paginated (horizontal reader).
        $pageData = null === $contentSource->getNextPageSelector()
            ? $this->scrapeVerticalReader($this->fetchHtml($chapterUrl), $contentSource)
            : $this->scrapeHorizontalReader($chapterUrl, $contentSource);

        $tempDir = sys_get_temp_dir().'/'.uniqid('manga_scraper_');
        if (!mkdir($tempDir) && !is_dir($tempDir)) {
            throw new \RuntimeException(sprintf('Unable to create temporary directory "%s"', $tempDir));
        }

        try {
            $totalPages = count($pageData);

            // Download and save the images. Entries are updated by key instead of
            // by reference so no dangling reference into $pageData survives the loop.
            foreach ($pageData as $index => $page) {
                $pageNumber = $index + 1;
                $urlPath = (string) parse_url($page['image_url'], PHP_URL_PATH);
                $imageName = sprintf('%03d.%s', $pageNumber, pathinfo($urlPath, PATHINFO_EXTENSION));

                $destinationPath = $this->downloadAndSaveImage($page['image_url'], $tempDir.'/'.$imageName);

                $this->dispatchProgressEvent($chapter, $pageNumber, $totalPages);

                $pageData[$index]['local_image_url'] = $destinationPath;
            }

            $cbzFilePath = $this->generateCbzPath($manga, $chapter);
            $this->createCbzFile($pageData, $cbzFilePath);

            $chapter->setCbzPath($cbzFilePath);
            $this->entityManager->persist($chapter);
            $this->entityManager->flush();
        } finally {
            // Always remove the temporary files, including when a download,
            // the archive creation or the flush fails.
            $this->cleanupTempFiles($tempDir);
        }

        return $pageData;
    }

    /**
     * Runs the scraper against a single chapter without downloading anything,
     * so a content source configuration can be checked.
     *
     * @return array list of pages, each with the keys "image_url" and "page_number"
     *
     * @throws \Exception when the chapter URL is invalid or a page cannot be fetched
     */
    public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
    {
        $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber);

        if (!$this->isChapterUrlValid($chapterUrl)) {
            throw new \Exception('Invalid URL, check format and slug');
        }

        if (null !== $contentSource->getNextPageSelector()) {
            // The paginated reader fetches the first page itself, so fetching
            // it here as well would only duplicate the request.
            return $this->scrapeHorizontalReader($chapterUrl, $contentSource);
        }

        return $this->scrapeVerticalReader($this->fetchHtml($chapterUrl), $contentSource);
    }

    /**
     * Tells whether this scraper handles the given scraping type.
     */
    public function supports(string $scrapingType): bool
    {
        return 'html' === $scrapingType;
    }

    /**
     * Collects every image of a single-page (vertical) reader.
     *
     * @return array list of pages, each with the keys "image_url" and "page_number"
     */
    private function scrapeVerticalReader(string $html, ContentSource $contentSource): array
    {
        $images = (new Crawler($html))->filter($contentSource->getImageSelector());

        $pageData = [];
        foreach ($images as $index => $image) {
            // Fall back to "data-src" when "src" is missing or empty.
            $imgUrl = $image->getAttribute('src') ?: $image->getAttribute('data-src');

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($imgUrl),
                'page_number' => $index + 1,
            ];
        }

        return $pageData;
    }

    /**
     * Walks a paginated (horizontal) reader by following the "next page" link
     * from the chapter URL until no further link is found.
     *
     * @return array list of pages, each with the keys "image_url" and "page_number"
     *
     * @throws \Exception when a page cannot be fetched or contains no image
     */
    private function scrapeHorizontalReader(string $chapterUrl, ContentSource $contentSource): array
    {
        $pageData = [];
        $visitedUrls = [];
        $currentPageUrl = $chapterUrl;

        do {
            $visitedUrls[$currentPageUrl] = true;

            $page = $this->extractMangaPageData($this->fetchHtml($currentPageUrl), $contentSource);

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($page['image_url']),
                'page_number' => count($pageData) + 1,
            ];

            $currentPageUrl = $page['next_page_url'];
            // Stop on a missing link, and also when the link leads back to a page
            // already scraped; otherwise a cyclic "next" link would loop forever.
        } while ($currentPageUrl && !isset($visitedUrls[$currentPageUrl]));

        return $pageData;
    }

    /**
     * Fetches a page and returns its body, without following redirects.
     *
     * @throws \Exception on any transport error, a 3xx response or a 404
     *                    response; the underlying exception is kept as "previous"
     */
    private function fetchHtml(string $url): string
    {
        try {
            $response = $this->httpClient->get($url, [
                'http_errors' => true,
                'allow_redirects' => false,
            ]);

            $statusCode = $response->getStatusCode();
            $isRedirect = $statusCode >= 300 && $statusCode < 400;

            // Redirects are not followed, so a 3xx response is treated as a missing
            // chapter. The 404 check is a safety net: with "http_errors" enabled a
            // Guzzle client is expected to throw on 4xx before reaching this point.
            if ($isRedirect || 404 === $statusCode) {
                throw new \Exception('Chapter Not Found at '.$url);
            }

            return (string) $response->getBody();
        } catch (\Exception $e) {
            // Keep the original exception attached so the root cause stays traceable.
            throw new \Exception('Bad Request: '.$e->getMessage(), 0, $e);
        }
    }

    /**
     * Extracts the image URL and the "next page" URL from one page of a
     * paginated reader.
     *
     * @return array{image_url: string, next_page_url: string|null}
     *
     * @throws \RuntimeException when the page has no usable image URL or the
     *                           image URL cannot be made absolute
     */
    private function extractMangaPageData(string $html, ContentSource $mangaSource): array
    {
        $crawler = new Crawler($html);

        $image = $crawler->filter($mangaSource->getImageSelector());
        if (0 === $image->count()) {
            throw new \RuntimeException('No image found with the configured image selector');
        }

        // Fall back to "data-src" when "src" is missing or empty, as the vertical reader does.
        $imgUrl = trim((string) ($image->attr('src') ?: $image->attr('data-src')));
        if ('' === $imgUrl) {
            throw new \RuntimeException('The matched image has neither a "src" nor a "data-src" value');
        }

        $nextLink = $crawler->filter($mangaSource->getNextPageSelector());
        $nextUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;

        return [
            'image_url' => $this->toAbsoluteImageUrl($imgUrl, $mangaSource),
            'next_page_url' => $nextUrl,
        ];
    }

    /**
     * Converts a relative image URL into an absolute one, using the scheme,
     * host and port of the content source base URL.
     *
     * @throws \RuntimeException when the base URL has no host to resolve against
     */
    private function toAbsoluteImageUrl(string $imgUrl, ContentSource $source): string
    {
        if (1 === preg_match('/^https?:\/\//i', $imgUrl)) {
            return $imgUrl;
        }

        $base = parse_url((string) $source->getBaseUrl());
        if (!is_array($base) || !isset($base['host'])) {
            throw new \RuntimeException(sprintf('Cannot resolve relative image URL "%s": the base URL has no host', $imgUrl));
        }

        $scheme = $base['scheme'] ?? 'https';

        // Protocol-relative URL ("//cdn.example.com/a.jpg"): it already names
        // its own host, so only the scheme has to be added.
        if (str_starts_with($imgUrl, '//')) {
            return $scheme.':'.$imgUrl;
        }

        $port = isset($base['port']) ? ':'.$base['port'] : '';

        return $scheme.'://'.$base['host'].$port.'/'.ltrim($imgUrl, '/');
    }
}