Mangarr/src/Service/MangaScraperService.php

<?php

namespace App\Service;

use App\Entity\Chapter;
use App\Entity\Manga;
use App\Entity\ContentSource;
use App\Event\PageScrappingProgressEvent;
use App\Repository\ChapterRepository;
use App\Repository\MangaRepository;
use Doctrine\ORM\EntityManagerInterface;
use Exception;
use Facebook\WebDriver\Remote\RemoteWebElement;
use Facebook\WebDriver\WebDriverExpectedCondition;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use GuzzleHttp\Exception\RequestException;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\Routing\Matcher\UrlMatcher;
use Symfony\Component\Routing\RequestContext;
use Symfony\Component\Routing\Route;
use Symfony\Component\Routing\RouteCollection;
use Symfony\Contracts\EventDispatcher\EventDispatcherInterface;
use Symfony\Component\Panther\Client as PantherClient;

class MangaScraperService
{
    public const string PUBLIC_CBZ = '/public/cbz';

    /**
     * @param string                   $projectDir      Prefix prepended to PUBLIC_CBZ when building CBZ output directories.
     * @param EventDispatcherInterface $eventDispatcher Receives PageScrappingProgressEvent notifications while pages are processed.
     * @param EntityManagerInterface   $entityManager   Persists and flushes a chapter once its CBZ path has been set.
     * @param MangaRepository          $mangaRepository Used to look a manga up by slug in testScrapingJavascript().
     */
    public function __construct(
        private readonly string                   $projectDir,
        private readonly EventDispatcherInterface $eventDispatcher,
        private readonly EntityManagerInterface   $entityManager,
        private readonly MangaRepository          $mangaRepository,
    ) {
    }

    /**
     * Extracts the page image URL and the "next page" link from one reader page.
     *
     * The image URL is read from the first node matching the source's image
     * selector: `src` first, then `data-src` when `src` is missing or empty.
     * A URL that does not start with http(s):// is rebuilt on the scheme and
     * host (and port, when present) of the source's base URL; protocol-relative
     * URLs ("//host/path") only receive the scheme.
     *
     * The next-page link is returned exactly as found in the `href` attribute.
     *
     * @return array{image_url: string, next_page_url: string|null}
     *
     * @throws Exception When no node matches the image selector or the node carries no usable URL.
     */
    private function extractMangaPageData(string $html, ContentSource $mangaSource): array
    {
        $crawler = new Crawler($html);

        $imageNode = $crawler->filter($mangaSource->getImageSelector());
        if ($imageNode->count() === 0) {
            // Calling attr() on an empty node list would fail with an unrelated crawler error.
            throw new Exception('No valid image found on the page.');
        }

        $imgUrl = trim((string) $imageNode->attr('src'));
        if ($imgUrl === '') {
            $imgUrl = trim((string) $imageNode->attr('data-src'));
        }
        if ($imgUrl === '') {
            throw new Exception('No valid image found on the page.');
        }

        $nextLink = $crawler->filter($mangaSource->getNextPageSelector());
        $nextUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;

        // Convert relative URLs to absolute URLs
        if (!preg_match('/^https?:\/\//', $imgUrl)) {
            $urlComponents = parse_url($mangaSource->getBaseUrl());
            $scheme = $urlComponents['scheme'];

            if (str_starts_with($imgUrl, '//')) {
                // Protocol-relative URL: the host is already part of the value.
                $imgUrl = $scheme . ':' . $imgUrl;
            } else {
                $authority = $urlComponents['host']
                    . (isset($urlComponents['port']) ? ':' . $urlComponents['port'] : '');
                $imgUrl = $scheme . '://' . $authority . '/' . ltrim($imgUrl, '/');
            }
        }

        return [
            'image_url' => $imgUrl,
            'next_page_url' => $nextUrl,
        ];
    }

    /**
     * Scrapes every chapter attached to the given manga.
     *
     * @return array Scrape results keyed by chapter number; chapters whose scrape returned false are left out.
     *
     * @throws GuzzleException
     */
    public function scrapeManga(Manga $manga, ContentSource $mangaSource): array
    {
        $resultsByChapter = [];

        foreach ($manga->getChapters() as $currentChapter) {
            $scraped = $this->scrapeChapter($currentChapter, $mangaSource);
            if ($scraped === false) {
                continue;
            }

            $resultsByChapter[$currentChapter->getNumber()] = $scraped;
        }

        return $resultsByChapter;
    }

    /**
     * Dispatches a chapter scrape to the strategy named by the source's scraping type.
     *
     * Recognised types are 'html', 'javascript' and 'mangadex'.
     *
     * @throws GuzzleException
     * @throws Exception When the scraping type is not one of the recognised values.
     */
    public function scrapeChapter(Chapter $chapter, ContentSource $mangaSource): array|bool
    {
        $type = $mangaSource->getScrapingType();

        if ($type === 'html') {
            return $this->scrapeChapterHtml($chapter->getManga(), $chapter, $mangaSource);
        }

        if ($type === 'javascript') {
            return $this->scrapeChapterJavascript($chapter->getManga(), $chapter, $mangaSource);
        }

        if ($type === 'mangadex') {
            return $this->scrapeChapterMangadex($chapter, $mangaSource);
        }

        throw new Exception('Unsupported scraping type: ' . $type);
    }

    /**
     * Downloads a chapter through the Mangadex JSON endpoint and packs it into a CBZ.
     *
     * The endpoint URL is the source's base URL followed by the chapter URL format
     * filled with the chapter's external id. Every entry of `chapter.dataSaver` is
     * downloaded into a temporary directory, a progress event is dispatched per
     * page, and the resulting archive path is stored on the chapter.
     *
     * The temporary directory is removed even when a download or the archive
     * creation fails.
     *
     * @return bool Always true on success; failures are reported through exceptions.
     *
     * @throws GuzzleException
     * @throws Exception When the response is not a usable "ok" payload or the temporary directory cannot be created.
     */
    private function scrapeChapterMangadex(Chapter $chapter, ContentSource $mangaSource): bool
    {
        $client = new Client();
        $chapterUrl = $mangaSource->getBaseUrl() . sprintf($mangaSource->getChapterUrlFormat(), $chapter->getExternalId());
        $manga = $chapter->getManga();
        $pageData = [];

        $response = $client->get($chapterUrl);
        $results = json_decode($response->getBody()->getContents(), true);

        // json_decode() yields null on malformed JSON, so every key is checked before use.
        $pages = is_array($results) ? ($results['chapter']['dataSaver'] ?? null) : null;
        if (
            !is_array($results)
            || ($results['result'] ?? null) !== 'ok'
            || !is_array($pages)
            || $pages === []
            || !isset($results['baseUrl'], $results['chapter']['hash'])
        ) {
            throw new Exception('Error while fetching chapter data from Mangadex ' . $manga->getTitle() . ' ' . $chapter->getNumber());
        }

        $totalPages = count($pages);

        $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
        if (!mkdir($tempDir) && !is_dir($tempDir)) {
            throw new Exception(sprintf('Unable to create temporary directory "%s".', $tempDir));
        }

        try {
            foreach ($pages as $index => $page) {
                $pageUrl = $results['baseUrl'] . '/data-saver/' . $results['chapter']['hash'] . '/' . $page;
                $imagePath = $tempDir . '/' . sprintf('%03d.%s', $index + 1, pathinfo($page, PATHINFO_EXTENSION));

                $this->downloadAndSaveImage($pageUrl, $imagePath);

                $event = new PageScrappingProgressEvent($chapter->getId(), $index + 1, $totalPages);
                $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);

                $pageData[] = [
                    'image_url' => $pageUrl,
                    'local_image_url' => $imagePath,
                    'page_number' => $index + 1,
                ];
            }

            $cbzFilePath = $this->generateCbzPath($manga, $chapter);
            $this->createCbzFile($tempDir, $pageData, $cbzFilePath);

            $chapter->setCbzPath($cbzFilePath);
            $this->entityManager->persist($chapter);
            $this->entityManager->flush();
        } finally {
            // Clean up the temporary directory whether or not the scrape succeeded.
            $this->cleanupTempFiles($tempDir);
        }

        return true;
    }

    /**
     * Scrapes a chapter from a JavaScript-rendered reader using a Chrome Panther client.
     *
     * The chapter page is opened, the chapter is selected in the
     * `#selectChapitres` dropdown when that dropdown exists, and the page images
     * are then collected with the vertical or horizontal strategy depending on
     * whether the source defines a next-page selector.
     *
     * The browser client is closed on every exit path, including failures
     * while opening the page.
     *
     * @return array|bool Page data on success; false when selecting the chapter in the dropdown failed.
     *
     * @throws Exception Propagated from the page-collection step.
     */
    private function scrapeChapterJavascript(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
    {
        $chapterUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapter->getNumber());
        $pantherClient = PantherClient::createChromeClient();

        try {
            $pantherClient->request('GET', $chapterUrl);

            try {
                $this->selectChapterInDropdown($pantherClient, $chapter);
            } catch (\Exception $e) {
                // A failed chapter selection is reported to the caller as "false" rather than as an error.
                return false;
            }

            if ($mangaSource->getNextPageSelector() === null) {
                // Vertical reader: all images are on a single page.
                return $this->scrapeVerticalReaderJavascript($pantherClient, $mangaSource, $chapter);
            }

            // Horizontal reader: one image per page, navigated with a "next" button.
            return $this->scrapeHorizontalReaderJavascript($pantherClient, $mangaSource, $chapter);
        } finally {
            $pantherClient->close();
        }
    }

    /**
     * Selects the chapter in the `#selectChapitres` dropdown and waits for its images to load.
     *
     * Does nothing when the page has no such dropdown.
     *
     * @throws \Exception When no dropdown option mentions the chapter number, or when waiting fails.
     */
    private function selectChapterInDropdown(PantherClient $pantherClient, Chapter $chapter): void
    {
        $crawler = $pantherClient->waitFor('body');
        $select = $crawler->filter('#selectChapitres');

        if ($select->count() === 0) {
            return;
        }

        $chapterNumber = $chapter->getNumber();
        // Quote the number so that characters such as "." are matched literally.
        $pattern = '/\b' . preg_quote((string) $chapterNumber, '/') . '\b/';
        $targetIndex = null;

        /** @var RemoteWebElement $option */
        foreach ($select->filter('option')->getIterator() as $index => $option) {
            if (preg_match($pattern, $option->getText())) {
                $targetIndex = $index;
                break;
            }
        }

        if ($targetIndex === null) {
            throw new \Exception("Chapitre $chapterNumber non trouvé dans le menu déroulant");
        }

        $pantherClient->executeScript("
                    var select = document.querySelector('#selectChapitres');
                    select.selectedIndex = $targetIndex;
                    select.dispatchEvent(new Event('change'));
                ");

        // Wait (up to 60 seconds) for the page to update after the selection.
        $pantherClient->wait(60000)->until(
            function ($driver) {
                return $driver->executeScript("
                                var scansPlacement = document.querySelector('#scansPlacement');
                                if (!scansPlacement) return false;

                                var lazyImages = scansPlacement.querySelectorAll('img.lazy');
                                var loadingGif = scansPlacement.querySelector('img[src*=\"loading_scans.gif\"]');

                                // Vérifier que toutes les images lazy sont chargées et que le GIF de chargement n'est plus présent
                                var allImagesLoaded = Array.from(lazyImages).every(img => img.complete && img.naturalWidth > 0);

                                return lazyImages.length > 0 && allImagesLoaded && !loadingGif;
                            ");
            }
        );
    }

    /**
     * Collects every image of a single-page (vertical) reader from the live browser page.
     *
     * Waits for the source's image selector, then records one entry per matching
     * element (`src`, or `data-src` when `src` is falsy) and dispatches a progress
     * event for each of them.
     *
     * @return array List of ['image_url' => string, 'page_number' => int] entries, numbered from 1.
     */
    private function scrapeVerticalReaderJavascript(PantherClient $pantherClient, ContentSource $mangaSource, Chapter $chapter): array
    {
        $readerCrawler = $pantherClient->waitFor($mangaSource->getImageSelector());
        $imageElements = $readerCrawler->filter($mangaSource->getImageSelector());

        $collected = [];

        foreach ($imageElements->getIterator() as $imageElement) {
            $position = count($collected) + 1;

            $source = $imageElement->getAttribute('src');
            if (!$source) {
                $source = $imageElement->getAttribute('data-src');
            }

            $collected[] = [
                'image_url' => $this->cleanImageUrl($source),
                'page_number' => $position,
            ];

            $this->eventDispatcher->dispatch(
                new PageScrappingProgressEvent($chapter->getId(), $position, $imageElements->count()),
                PageScrappingProgressEvent::NAME
            );
        }

        return $collected;
    }

    /**
     * Collects the images of a paginated (horizontal) reader from the live browser page.
     *
     * For each page: waits for the image selector, records the first matching
     * image (`src`, or `data-src` when `src` is falsy), dispatches a progress event
     * (the total is reported as 0 because it is not known in advance), then clicks
     * the next-page element. The loop stops when no image or no next-page element
     * is found.
     *
     * @return array List of ['image_url' => string, 'page_number' => int] entries, numbered from 1.
     */
    private function scrapeHorizontalReaderJavascript(PantherClient $pantherClient, ContentSource $mangaSource, Chapter $chapter): array
    {
        $pageData = [];
        $pageNumber = 1;

        while (true) {
            $crawler = $pantherClient->waitFor($mangaSource->getImageSelector());

            $imageElement = $crawler->filter($mangaSource->getImageSelector())->first();
            if ($imageElement->count() === 0) {
                break; // End of the chapter
            }

            $imageUrl = $imageElement->attr('src') ?: $imageElement->attr('data-src');

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($imageUrl),
                'page_number' => $pageNumber,
            ];

            $event = new PageScrappingProgressEvent($chapter->getId(), $pageNumber, 0);
            $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);

            // Look the next-page element up in the crawler obtained for the current page.
            $nextButton = $crawler->filter($mangaSource->getNextPageSelector());
            if ($nextButton->count() === 0) {
                break; // No next button: end of the chapter
            }

            $nextButton->click();

            // Wait for the page to change
            $pantherClient->waitFor($mangaSource->getImageSelector(), 10);

            // Refresh the client's crawler with the new page content
            $pantherClient->refreshCrawler();

            $pageNumber++;
        }

        return $pageData;
    }

    /**
     * Builds the Puppeteer command line for a chapter and decodes the script output.
     *
     * NOTE(review): running the command is currently disabled, so the output is
     * always empty and this method returns an empty array.
     *
     * @return array Decoded JSON output of the script, or an empty array when there is no valid JSON array/object.
     */
    private function fetchImagesUsingPuppeteer(string $url, string $imageSelector, string $nextButtonSelector): array
    {
        $output = [];

        // Every argument is shell-escaped so that URLs or selectors containing quotes
        // cannot break out of the command line. stderr is redirected to stdout.
        $command = sprintf(
            'node puppeteer-script.js %s %s %s 2>&1',
            escapeshellarg($url),
            escapeshellarg($imageSelector),
            escapeshellarg($nextButtonSelector)
        );
        // Execution is disabled for now: exec($command, $output, $returnCode);

        // json_decode() returns null for empty or invalid input, which would violate the array return type.
        $decoded = json_decode(implode('', $output), true);

        return is_array($decoded) ? $decoded : [];
    }

    /**
     * Runs a trial scrape of one chapter with the strategy named by the source's scraping type.
     *
     * Only 'html' and 'javascript' are handled here.
     *
     * @throws Exception When the scraping type is neither 'html' nor 'javascript'.
     */
    public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
    {
        $type = $contentSource->getScrapingType();

        if ($type === 'html') {
            return $this->testScrapingHtml($mangaSlug, $chapterNumber, $contentSource);
        }

        if ($type === 'javascript') {
            return $this->testScrapingJavascript($mangaSlug, $chapterNumber, $contentSource);
        }

        throw new Exception('Unsupported scraping type: ' . $type);
    }

    /**
     * Runs a trial JavaScript scrape for the chapter identified by manga slug and chapter number.
     *
     * The manga is looked up by slug in the repository and the chapter is
     * resolved through the manga.
     *
     * @return array Page data returned by the JavaScript scraper.
     *
     * @throws Exception When the manga or chapter cannot be found, or when the scraper reports a failure.
     */
    public function testScrapingJavascript(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
    {
        $manga = $this->mangaRepository->findOneBy(['slug' => $mangaSlug]);
        if (!$manga instanceof Manga) {
            throw new Exception(sprintf('Manga "%s" not found.', $mangaSlug));
        }

        $chapter = $manga->getChapterByNumber($chapterNumber);
        if (!$chapter instanceof Chapter) {
            throw new Exception(sprintf('Chapter %s of manga "%s" not found.', $chapterNumber, $mangaSlug));
        }

        $pageData = $this->scrapeChapterJavascript($manga, $chapter, $contentSource);
        if (!is_array($pageData)) {
            // The scraper signals a failed chapter selection with false, which cannot be returned as an array.
            throw new Exception(sprintf('Scraping chapter %s of manga "%s" failed.', $chapterNumber, $mangaSlug));
        }

        return $pageData;
    }

    /**
     * Runs a trial HTML scrape of one chapter without downloading images or building an archive.
     *
     * A source without a next-page selector is treated as a vertical reader (all
     * images on one page); otherwise the chapter is walked page by page.
     *
     * @return array List of ['image_url' => string, 'page_number' => int] entries.
     *
     * @throws GuzzleException
     */
    public function testScrapingHtml(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
    {
        $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber);

        if ($contentSource->getNextPageSelector() === null) {
            return $this->scrapeVerticalReader($this->fetchHtml($chapterUrl), $contentSource);
        }

        // The paginated reader fetches the first page itself, so it is not fetched up front here.
        return $this->scrapeHorizontalReader($chapterUrl, $contentSource);
    }

    /**
     * Scrapes a chapter from static HTML, downloads its images and packs them into a CBZ.
     *
     * A source without a next-page selector is treated as a vertical reader (all
     * images on one page); otherwise the chapter is walked page by page. Each image
     * is downloaded into a temporary directory, a progress event is dispatched per
     * page, and the archive path is stored on the chapter.
     *
     * When no page is found, nothing is archived or persisted and an empty array
     * is returned. The temporary directory is removed even when a download or the
     * archive creation fails.
     *
     * @return array|bool Page data, each entry extended with 'local_image_url'.
     *
     * @throws GuzzleException
     * @throws \Exception When the temporary directory cannot be created, or propagated from fetching/downloading.
     */
    private function scrapeChapterHtml(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
    {
        $chapterUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapter->getNumber());

        if ($mangaSource->getNextPageSelector() === null) {
            // Vertical reader
            $html = $this->fetchHtml($chapterUrl);
            $pageData = $this->scrapeVerticalReader($html, $mangaSource);
        } else {
            // Horizontal (paginated) reader
            $pageData = $this->scrapeHorizontalReader($chapterUrl, $mangaSource);
        }

        if ($pageData === []) {
            // Without pages there is nothing to archive; do not store a path to a file that was never written.
            return [];
        }

        $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
        if (!mkdir($tempDir) && !is_dir($tempDir)) {
            throw new \Exception(sprintf('Unable to create temporary directory "%s".', $tempDir));
        }

        try {
            $totalPages = count($pageData);

            // Download and save the images. Entries are updated by key rather than by
            // reference so that no dangling reference outlives the loop.
            foreach ($pageData as $index => $page) {
                $imageName = sprintf('%03d.%s', $index + 1, pathinfo(parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION));
                $imagePath = $tempDir . '/' . $imageName;

                $this->downloadAndSaveImage($page['image_url'], $imagePath);

                $event = new PageScrappingProgressEvent($chapter->getId(), $index + 1, $totalPages);
                $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);

                $pageData[$index]['local_image_url'] = $imagePath;
            }

            $cbzFilePath = $this->generateCbzPath($manga, $chapter);
            $this->createCbzFile($tempDir, $pageData, $cbzFilePath);

            $chapter->setCbzPath($cbzFilePath);
            $this->entityManager->persist($chapter);
            $this->entityManager->flush();
        } finally {
            // Clean up the temporary directory whether or not the scrape succeeded.
            $this->cleanupTempFiles($tempDir);
        }

        return $pageData;
    }

    /**
     * Lists every image of a single-page (vertical) reader from static HTML.
     *
     * For each node matching the source's image selector, the `src` attribute is
     * used unless it is an empty string, in which case `data-src` is used.
     *
     * @return array List of ['image_url' => string, 'page_number' => int] entries, numbered from 1.
     */
    private function scrapeVerticalReader(string $html, ContentSource $contentSource): array
    {
        $imageNodes = (new Crawler($html))->filter($contentSource->getImageSelector());

        $pages = [];

        foreach ($imageNodes as $position => $node) {
            $source = $node->getAttribute('src');
            if ($source === '') {
                $source = $node->getAttribute('data-src');
            }

            $pages[] = [
                'image_url' => $this->cleanImageUrl($source),
                'page_number' => $position + 1,
            ];
        }

        return $pages;
    }

    /**
     * Walks a paginated (horizontal) reader from static HTML, one image per page.
     *
     * Starting at the chapter URL, each page is fetched, its image recorded, and
     * its next-page link followed. The walk stops when a page has no next-page
     * link, or when the link points to a page that was already visited, which
     * would otherwise loop forever.
     *
     * @return array List of ['image_url' => string, 'page_number' => int] entries, numbered from 1.
     *
     * @throws GuzzleException
     */
    private function scrapeHorizontalReader(string $chapterUrl, ContentSource $contentSource): array
    {
        $pageData = [];
        $visitedUrls = [];
        $currentPageUrl = $chapterUrl;

        do {
            $visitedUrls[$currentPageUrl] = true;

            $html = $this->fetchHtml($currentPageUrl);
            $page = $this->extractMangaPageData($html, $contentSource);

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($page['image_url']),
                'page_number' => count($pageData) + 1,
            ];

            $currentPageUrl = $page['next_page_url'];
        } while ($currentPageUrl && !isset($visitedUrls[$currentPageUrl]));

        return $pageData;
    }

    /**
     * Downloads one image into the temporary directory and appends its entry to the page list.
     *
     * The local file is named after the 1-based page number (zero-padded to three
     * digits) and the extension found in the URL path.
     *
     * @param array $pageData Page list, extended in place with the new entry.
     *
     * @throws GuzzleException
     */
    private function processImage(string $imgUrl, string $tempDir, array &$pageData, int $index, Chapter $chapter): void
    {
        $sanitizedUrl = $this->cleanImageUrl($imgUrl);
        $pageNumber = $index + 1;

        $extension = pathinfo(parse_url($sanitizedUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
        $localPath = $tempDir . '/' . sprintf('%03d.%s', $pageNumber, $extension);

        $this->downloadAndSaveImage($sanitizedUrl, $localPath);

        $pageData[] = [
            'image_url' => $sanitizedUrl,
            'local_image_url' => $localPath,
            'page_number' => $pageNumber,
        ];
    }

    /**
     * Trims the URL and removes ASCII control characters (0x00-0x1F and 0x7F) from it.
     */
    private function cleanImageUrl(string $url): string
    {
        $trimmed = trim($url);

        return preg_replace('/[\x00-\x1F\x7F]/', '', $trimmed);
    }

    /**
     * Fetches a page and returns its body.
     *
     * Redirects are not followed: a 3xx answer is treated as "chapter not found".
     * Because `http_errors` is enabled, 4xx/5xx answers make the HTTP client
     * throw before the status code can be inspected here.
     *
     * Every failure is rethrown as an Exception whose message starts with
     * "Bad Request: " and which keeps the original exception as its previous one.
     *
     * @throws GuzzleException
     * @throws Exception When the request fails or the server answers with a redirect.
     */
    private function fetchHtml(string $url): string
    {
        $client = new Client();

        try {
            $response = $client->get($url, [
                'http_errors' => true,
                'allow_redirects' => false,
            ]);

            $statusCode = $response->getStatusCode();

            if ($statusCode >= 300 && $statusCode < 400) {
                throw new Exception('Chapter Not Found at ' . $url);
            }

            return (string) $response->getBody();
        } catch (Exception $e) {
            // Keep the original exception in the chain so the root cause stays visible.
            throw new Exception('Bad Request: ' . $e->getMessage(), previous: $e);
        }
    }

    /**
     * Downloads an image and writes it to the destination path.
     *
     * The response is only accepted when its Content-Type header starts with "image/".
     *
     * @throws GuzzleException
     * @throws \Exception When the request fails, the content is not an image, or the file cannot be written.
     */
    private function downloadAndSaveImage(string $imageUrl, string $destinationPath): void
    {
        $client = new Client();

        try {
            $response = $client->get($imageUrl);
        } catch (RequestException $e) {
            throw new \Exception('Erreur lors de la récupération de l\'image : ' . $e->getMessage(), previous: $e);
        }

        $contentType = $response->getHeaderLine('Content-Type');
        if (!str_starts_with($contentType, 'image/')) {
            throw new \Exception('Le contenu récupéré n\'est pas une image. Type de contenu : ' . $contentType);
        }

        // file_put_contents() returns false on failure; do not carry on with a missing page file.
        if (file_put_contents($destinationPath, $response->getBody()->getContents()) === false) {
            throw new \Exception(sprintf('Unable to write image to "%s".', $destinationPath));
        }
    }

    /**
     * Tells whether the next-page link of a chapter page still points into the given chapter.
     *
     * The link's path is matched against the pattern `/scan-{manga}/{chapter}/{page}`
     * and the extracted chapter segment is compared, as a float, with the chapter number.
     *
     * @return bool False when the page has no next-page link or the chapter segment differs.
     *
     * @throws GuzzleException
     */
    private function isChapterAvailable(string $chapterUrl, float $chapterNumber, ContentSource $mangaSource): bool
    {
        $pageCrawler = new Crawler($this->fetchHtml($chapterUrl));
        $nextLink = $pageCrawler->filter($mangaSource->getNextPageSelector());

        if ($nextLink->count() === 0) {
            return false;
        }

        $nextPath = parse_url($nextLink->attr('href'), PHP_URL_PATH);

        $routes = new RouteCollection();
        $routes->add('manga_chapter', new Route('/scan-{manga}/{chapter}/{page}'));
        $urlMatcher = new UrlMatcher($routes, new RequestContext('/'));

        $matched = $urlMatcher->match($nextPath);

        return (float)$matched['chapter'] === $chapterNumber;
    }

    /**
     * Posts a download report (URL, success, cache status, size, duration) to the Mangadex network endpoint.
     *
     * @throws \Exception When the report request fails.
     */
    private function sendReport(string $imageUrl, bool $success, bool $cached, int $bytes, float $duration): void
    {
        $httpClient = new Client();

        $report = [
            'url' => $imageUrl,
            'success' => $success,
            'cached' => $cached,
            'bytes' => $bytes,
            'duration' => $duration,
        ];

        $requestOptions = [
            'headers' => [
                'Content-Type' => 'application/json',
            ],
            'json' => $report,
        ];

        try {
            $httpClient->post('https://api.mangadex.network/report', $requestOptions);
        } catch (RequestException $requestError) {
            // Surface report failures to the caller as a generic exception.
            throw new \Exception('Erreur lors de l\'envoi du rapport : ' . $requestError->getMessage());
        }
    }

    /**
     * Packs the downloaded page images into a CBZ (zip) archive.
     *
     * Each page's 'local_image_url' file is stored under its base name. An archive
     * already present at the target path is replaced rather than appended to, so
     * pages from an earlier scrape cannot linger in the file.
     *
     * @param string $tempDir     Directory holding the page images (not read here; paths come from $pageData).
     * @param array  $pageData    Entries carrying a 'local_image_url' key.
     * @param string $cbzFilePath Destination archive path.
     *
     * @throws \Exception When the archive cannot be opened, a page cannot be added, or the archive cannot be written.
     */
    private function createCbzFile(string $tempDir, array $pageData, string $cbzFilePath): void
    {
        $zip = new \ZipArchive();

        $opened = $zip->open($cbzFilePath, \ZipArchive::CREATE | \ZipArchive::OVERWRITE);
        if ($opened !== true) {
            throw new \Exception(sprintf('Unable to create CBZ archive "%s" (ZipArchive error code %d).', $cbzFilePath, $opened));
        }

        foreach ($pageData as $page) {
            $localPath = $page['local_image_url'];

            if (!$zip->addFile($localPath, basename($localPath))) {
                $zip->close();
                throw new \Exception(sprintf('Unable to add "%s" to CBZ archive "%s".', $localPath, $cbzFilePath));
            }
        }

        if (!$zip->close()) {
            throw new \Exception(sprintf('Unable to write CBZ archive "%s".', $cbzFilePath));
        }
    }

    /**
     * Builds the archive path of a chapter: `<volume directory>/<slug>_vol<volume>_ch<number>.cbz`.
     *
     * The volume directory is created on the way if it does not exist yet.
     */
    private function generateCbzPath(Manga $manga, Chapter $chapter): string
    {
        $targetDirectory = $this->createDirectories($manga, $chapter->getVolume());

        $archiveName = sprintf(
            '%s_vol%d_ch%s.cbz',
            $manga->getSlug(),
            $chapter->getVolume(),
            $chapter->getNumber()
        );

        return "{$targetDirectory}/{$archiveName}";
    }

    /**
     * Returns the directory that holds a manga volume's archives, creating it when missing.
     *
     * Layout: `<projectDir>/public/cbz/<Slug> (<year|unknown>)/volume_<volume>`.
     *
     * NOTE(review): the volume number is written without zero-padding (e.g. "volume_5").
     * A two-digit padding looks intended; changing it would move where new archives
     * are stored, so confirm before altering the format.
     */
    private function createDirectories(Manga $manga, int $volume): string
    {
        $yearLabel = $manga->getPublicationYear() ?? 'unknown';
        $libraryRoot = $this->projectDir . self::PUBLIC_CBZ;

        $seriesDirectory = sprintf('%s/%s (%s)', $libraryRoot, ucfirst($manga->getSlug()), $yearLabel);
        $volumeDirectory = sprintf('%s/volume_%d', $seriesDirectory, $volume);

        if (!is_dir($volumeDirectory)) {
            mkdir($volumeDirectory, 0755, true);
        }

        return $volumeDirectory;
    }

    /**
     * Deletes the regular files directly inside the directory, then removes the directory itself.
     */
    private function cleanupTempFiles(string $directory): void
    {
        foreach (glob($directory . '/*') as $entry) {
            if (!is_file($entry)) {
                continue;
            }

            unlink($entry);
        }

        rmdir($directory);
    }
}