feat: ajout d'une nouvelle infrastructure de scraping avec des scrapers pour HTML, HTML avancé et JavaScript, ainsi qu'une factory pour gérer leur création et leur sélection. Mise à jour des gestionnaires de commandes pour intégrer cette nouvelle architecture et améliorer la gestion des erreurs lors du scraping des chapitres.

2025-07-08 15:30:22 +02:00
parent cbb62989d4
commit b456f9304d
10 changed files with 1244 additions and 36 deletions
--- a/src/Domain/Scraping/Infrastructure/Service/Scraper/AdvancedHtmlScraper.php
+++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/AdvancedHtmlScraper.php
@@ -0,0 +1,252 @@
+<?php
+
+namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
+
+use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
+use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest;
+use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingResult;
+use Symfony\Component\DomCrawler\Crawler;
+use Symfony\Contracts\HttpClient\HttpClientInterface;
+use Symfony\Component\HttpClient\HttpClient;
+
+/**
+ * HTML scraper that fetches chapter pages over HTTP with browser-like headers,
+ * retries on failure, and extracts image URLs with CSS selectors.
+ *
+ * Two reading layouts are handled:
+ *  - vertical: every image of the chapter is on a single page;
+ *  - horizontal: one image per page, followed through a "next page" link.
+ */
+class AdvancedHtmlScraper implements ScraperInterface
+{
+    private const USER_AGENTS = [
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',
+        'Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0'
+    ];
+
+    private const ACCEPT_HEADERS = [
+        'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+    ];
+
+    private const ACCEPT_LANGUAGE_HEADERS = [
+        'en-US,en;q=0.9',
+        'en-US,en;q=0.8',
+        'en-GB,en;q=0.9',
+        'fr-FR,fr;q=0.9,en;q=0.8'
+    ];
+
+    /** Attributes inspected, in order, to find the URL of an image node (covers common lazy-loading conventions). */
+    private const IMAGE_URL_ATTRIBUTES = [
+        'src',
+        'data-src',
+        'data-lazy-src',
+        'data-original',
+        'data-zoom-image',
+        'data-full-src'
+    ];
+
+    private const RETRY_ATTEMPTS = 3;
+    private const RETRY_DELAY = 2; // seconds
+    private const REQUEST_TIMEOUT = 30;
+    private const MAX_PAGES = 200; // safety limit for the horizontal reader
+
+    private readonly HttpClientInterface $httpClient;
+
+    /**
+     * @param HttpClientInterface|null $httpClient Client used for every request. When omitted, a default
+     *                                             client with the scraper's timeout is created. TLS
+     *                                             verification is deliberately left enabled.
+     */
+    public function __construct(?HttpClientInterface $httpClient = null)
+    {
+        // The property is readonly, so it must be assigned exactly once, here.
+        $this->httpClient = $httpClient ?? HttpClient::create([
+            'timeout' => self::REQUEST_TIMEOUT,
+        ]);
+    }
+
+    /**
+     * Scrape the image URLs of the chapter described by the request.
+     *
+     * The horizontal reader is used when a non-empty 'nextPageSelector' scraping
+     * parameter is present; otherwise the vertical reader is used.
+     *
+     * @throws \RuntimeException when anything goes wrong; the original exception is attached as "previous"
+     */
+    public function scrape(ScrapingRequest $request): ScrapingResult
+    {
+        $scrapingParameters = $request->getScrapingParameters();
+
+        try {
+            // empty() also covers a missing key, which previously raised a warning.
+            $pages = empty($scrapingParameters['nextPageSelector'])
+                ? $this->scrapeVerticalReader($request)
+                : $this->scrapeHorizontalReader($request);
+
+            return new ScrapingResult($pages, count($pages));
+        } catch (\Exception $e) {
+            throw new \RuntimeException('Advanced HTML scraping failed: ' . $e->getMessage(), 0, $e);
+        }
+    }
+
+    public function supports(string $sourceType): bool
+    {
+        return 'advanced_html' === $sourceType;
+    }
+
+    /**
+     * Collect every image matching 'imageSelector' on the chapter page.
+     *
+     * @return list<string> absolute, cleaned image URLs in document order
+     */
+    private function scrapeVerticalReader(ScrapingRequest $request): array
+    {
+        $chapterUrl = $request->getChapterUrl();
+        $crawler = new Crawler($this->fetchHtmlWithRetry($chapterUrl));
+        $params = $request->getScrapingParameters();
+
+        $images = $crawler->filter($params['imageSelector'])
+            ->each(function (Crawler $node) use ($chapterUrl): string {
+                $src = $this->extractImageUrl($node);
+
+                // A node without any usable attribute yields '' and is filtered out below.
+                return null === $src
+                    ? ''
+                    : $this->cleanImageUrl($this->resolveRelativeUrl($src, $chapterUrl));
+            });
+
+        // array_values() re-indexes so callers receive a proper list.
+        return array_values(array_filter($images, static fn (string $url): bool => '' !== $url));
+    }
+
+    /**
+     * Walk the chapter page by page, taking the first image matching
+     * 'imageSelector' on each page and following 'nextPageSelector'.
+     *
+     * Stops when there is no usable next link, when a URL is seen twice, or
+     * after MAX_PAGES pages.
+     *
+     * @return list<string> absolute, cleaned image URLs in reading order
+     */
+    private function scrapeHorizontalReader(ScrapingRequest $request): array
+    {
+        $pages = [];
+        $currentUrl = $request->getChapterUrl();
+        $params = $request->getScrapingParameters();
+        // URL => true map. SplObjectStorage only accepts objects as keys, so it cannot hold URL strings.
+        $visitedUrls = [];
+        $pageCount = 0;
+
+        while ($currentUrl && $pageCount < self::MAX_PAGES) {
+            // Guard against navigation loops.
+            if (isset($visitedUrls[$currentUrl])) {
+                break;
+            }
+            $visitedUrls[$currentUrl] = true;
+
+            $crawler = new Crawler($this->fetchHtmlWithRetry($currentUrl));
+
+            // Image of the current page.
+            $imageNode = $crawler->filter($params['imageSelector'])->first();
+            if ($imageNode->count() > 0) {
+                $imageUrl = $this->extractImageUrl($imageNode);
+
+                if (null !== $imageUrl) {
+                    $pages[] = $this->cleanImageUrl($this->resolveRelativeUrl($imageUrl, $currentUrl));
+                }
+            }
+
+            // Link to the next page.
+            $nextLink = $crawler->filter($params['nextPageSelector'])->first();
+            if ($nextLink->count() === 0) {
+                break;
+            }
+
+            $nextUrl = trim((string) $nextLink->attr('href'));
+            // Empty, fragment-only and javascript: links cannot be fetched; treat them as the end of the chapter.
+            if ('' === $nextUrl || '#' === $nextUrl[0] || 0 === stripos($nextUrl, 'javascript:')) {
+                break;
+            }
+
+            $currentUrl = $this->resolveRelativeUrl($nextUrl, $currentUrl);
+            $pageCount++;
+
+            // Pause between requests to reduce the chance of being rate-limited or detected.
+            sleep(1);
+        }
+
+        return array_values(array_filter($pages, static fn (string $url): bool => '' !== $url));
+    }
+
+    /**
+     * Return the first non-empty value among IMAGE_URL_ATTRIBUTES for the given node.
+     *
+     * @param Crawler $node crawler positioned on a single, existing node
+     *
+     * @return string|null trimmed attribute value, or null when none is set
+     */
+    private function extractImageUrl(Crawler $node): ?string
+    {
+        foreach (self::IMAGE_URL_ATTRIBUTES as $attribute) {
+            $value = trim((string) $node->attr($attribute));
+
+            if ('' !== $value) {
+                return $value;
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Fetch a page, retrying up to RETRY_ATTEMPTS times with a linearly growing delay.
+     *
+     * @throws \Exception the exception of the last failed attempt
+     */
+    private function fetchHtmlWithRetry(string $url): string
+    {
+        $lastException = null;
+
+        for ($attempt = 1; $attempt <= self::RETRY_ATTEMPTS; $attempt++) {
+            try {
+                return $this->fetchHtml($url);
+            } catch (\Exception $e) {
+                $lastException = $e;
+
+                if ($attempt < self::RETRY_ATTEMPTS) {
+                    // Wait before retrying: RETRY_DELAY, then 2 * RETRY_DELAY, ...
+                    sleep(self::RETRY_DELAY * $attempt);
+                }
+            }
+        }
+
+        throw $lastException;
+    }
+
+    /**
+     * Perform a single GET request and return the response body.
+     *
+     * @throws \RuntimeException on transport errors, HTTP status >= 400, or a detected Cloudflare challenge page
+     */
+    private function fetchHtml(string $url): string
+    {
+        $headers = $this->generateHeaders();
+
+        try {
+            $response = $this->httpClient->request('GET', $url, [
+                'headers' => $headers,
+                'timeout' => self::REQUEST_TIMEOUT
+            ]);
+
+            $statusCode = $response->getStatusCode();
+
+            if ($statusCode >= 400) {
+                throw new \RuntimeException("HTTP {$statusCode} error for URL: {$url}");
+            }
+
+            $content = $response->getContent();
+
+            // Markers found on Cloudflare's browser-check interstitial page.
+            if (str_contains($content, 'cf-browser-verification')
+                || str_contains($content, 'Checking your browser')) {
+                throw new \RuntimeException('Blocked by Cloudflare protection');
+            }
+
+            return $content;
+        } catch (\Exception $e) {
+            throw new \RuntimeException('Failed to fetch HTML: ' . $e->getMessage(), 0, $e);
+        }
+    }
+
+    /**
+     * Build a randomised set of browser-like request headers.
+     *
+     * @return array<string, string>
+     */
+    private function generateHeaders(): array
+    {
+        // No explicit Accept-Encoding: Symfony HttpClient negotiates compression itself and only
+        // decodes the body transparently when the caller has not set that header. Forcing
+        // "gzip, deflate, br" here could hand still-compressed bytes to the HTML parser.
+        return [
+            'User-Agent' => self::USER_AGENTS[array_rand(self::USER_AGENTS)],
+            'Accept' => self::ACCEPT_HEADERS[array_rand(self::ACCEPT_HEADERS)],
+            'Accept-Language' => self::ACCEPT_LANGUAGE_HEADERS[array_rand(self::ACCEPT_LANGUAGE_HEADERS)],
+            'DNT' => '1',
+            'Connection' => 'keep-alive',
+            'Upgrade-Insecure-Requests' => '1',
+            'Sec-Fetch-Dest' => 'document',
+            'Sec-Fetch-Mode' => 'navigate',
+            'Sec-Fetch-Site' => 'none',
+            'Sec-Fetch-User' => '?1',
+            'Cache-Control' => 'max-age=0'
+        ];
+    }
+
+    /**
+     * Turn a possibly relative URL into an absolute one using the page it was found on.
+     *
+     * Handles URLs that already carry a scheme (returned untouched, including data: URIs),
+     * protocol-relative URLs ("//host/x"), root-relative URLs ("/x"), query-only URLs ("?p=2")
+     * and path-relative URLs ("x.jpg"). Dot segments ("../") are not collapsed.
+     *
+     * @param string $url     URL as found in the document
+     * @param string $baseUrl absolute URL of the document
+     *
+     * @return string the resolved URL, or $url unchanged when $baseUrl has no scheme/host
+     */
+    private function resolveRelativeUrl(string $url, string $baseUrl): string
+    {
+        // Anything starting with "scheme:" is already absolute.
+        if (1 === preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url)) {
+            return $url;
+        }
+
+        $parsedBase = parse_url($baseUrl);
+        if (!is_array($parsedBase) || !isset($parsedBase['scheme'], $parsedBase['host'])) {
+            // Without a usable base there is nothing to resolve against.
+            return $url;
+        }
+
+        $scheme = $parsedBase['scheme'];
+        $port = isset($parsedBase['port']) ? ':' . $parsedBase['port'] : '';
+        $origin = $scheme . '://' . $parsedBase['host'] . $port;
+        $basePath = $parsedBase['path'] ?? '/';
+
+        if (str_starts_with($url, '//')) {
+            // Protocol-relative: reuse the scheme of the base document.
+            return $scheme . ':' . $url;
+        }
+
+        if (str_starts_with($url, '/')) {
+            // Relative to the site root.
+            return $origin . $url;
+        }
+
+        if (str_starts_with($url, '?')) {
+            // Same document, different query string.
+            return $origin . $basePath . $url;
+        }
+
+        // Relative to the directory of the base document: keep everything up to the last "/".
+        // (dirname() is avoided: it drops the last directory of paths ending in "/" and
+        // returns "/" for root-level paths, which produced a double slash.)
+        $lastSlash = strrpos($basePath, '/');
+        $directory = false === $lastSlash ? '/' : substr($basePath, 0, $lastSlash + 1);
+
+        return $origin . $directory . $url;
+    }
+
+    /**
+     * Normalise an image URL: trim it, strip control characters, and drop tracking
+     * query parameters (any parameter whose name starts with "utm_", "ref" or "source",
+     * case-insensitively) while keeping the rest of the query string well-formed.
+     *
+     * @return string the cleaned URL, or '' when the input is empty
+     */
+    private function cleanImageUrl(string $url): string
+    {
+        // Strip control characters.
+        $url = preg_replace('/[\x00-\x1F\x7F]/', '', trim($url)) ?? '';
+
+        if ('' === $url) {
+            return '';
+        }
+
+        // Set the fragment aside so it is not mistaken for part of the query string.
+        $fragment = '';
+        $hashPosition = strpos($url, '#');
+        if (false !== $hashPosition) {
+            $fragment = substr($url, $hashPosition);
+            $url = substr($url, 0, $hashPosition);
+        }
+
+        $queryPosition = strpos($url, '?');
+        if (false === $queryPosition) {
+            return $url . $fragment;
+        }
+
+        $base = substr($url, 0, $queryPosition);
+        $pairs = explode('&', substr($url, $queryPosition + 1));
+
+        // Filter pair by pair so the "?" and "&" separators are rebuilt correctly; removing
+        // "?utm_x=1" from "?utm_x=1&page=2" with a plain regex left "&page=2" with no "?".
+        $kept = array_filter(
+            $pairs,
+            static fn (string $pair): bool => 1 !== preg_match('/^(utm_|ref|source)/i', $pair)
+        );
+
+        $query = [] === $kept ? '' : '?' . implode('&', $kept);
+
+        return $base . $query . $fragment;
+    }
+}
--- a/src/Domain/Scraping/Infrastructure/Service/Scraper/JavaScriptScraper.php
+++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/JavaScriptScraper.php
@@ -0,0 +1,157 @@
+<?php
+
+namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
+
+use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
+use App\Domain\Scraping\Domain\Exception\ChapterNotFoundException;
+use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest;
+use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingResult;
+use Symfony\Component\Process\Process;
+
+/**
+ * Scraper that delegates page rendering to a Node.js Puppeteer script and
+ * parses the image URLs the script prints on its last output line.
+ *
+ * Expected last line of the script's standard output:
+ *  - "RESULT:<json array of URLs>" on success;
+ *  - "CHAPTER_NOT_FOUND:<json object with a 'message' key>" when the chapter is missing.
+ */
+class JavaScriptScraper implements ScraperInterface
+{
+    private const PUPPETEER_TIMEOUT = 60; // seconds
+    private const PUPPETEER_SCRIPT_PATH = '/public/puppeteer-scraper.js';
+    private const NODE_EXECUTABLE = 'node';
+
+    private const RESULT_PREFIX = 'RESULT:';
+    private const CHAPTER_NOT_FOUND_PREFIX = 'CHAPTER_NOT_FOUND:';
+
+    /**
+     * @param string $projectDir directory that PUPPETEER_SCRIPT_PATH is appended to
+     */
+    public function __construct(
+        private readonly string $projectDir
+    ) {}
+
+    /**
+     * Scrape the image URLs of the chapter described by the request.
+     *
+     * The horizontal mode is used when a non-empty 'nextPageSelector' scraping
+     * parameter is present; otherwise the vertical mode is used.
+     *
+     * @throws \RuntimeException when anything goes wrong; the original exception is attached as "previous"
+     */
+    public function scrape(ScrapingRequest $request): ScrapingResult
+    {
+        $scrapingParameters = $request->getScrapingParameters();
+
+        try {
+            $scriptPath = $this->projectDir . self::PUPPETEER_SCRIPT_PATH;
+
+            if (!file_exists($scriptPath)) {
+                throw new \RuntimeException('Puppeteer script not found at: ' . $scriptPath);
+            }
+
+            $imageUrls = !empty($scrapingParameters['nextPageSelector'])
+                ? $this->scrapeHorizontalReader($request, $scriptPath)
+                : $this->scrapeVerticalReader($request, $scriptPath);
+
+            return new ScrapingResult($imageUrls, count($imageUrls));
+        } catch (\Exception $e) {
+            // NOTE(review): this also wraps ChapterNotFoundException, so callers only see it through
+            // getPrevious(). Left unchanged because callers are not visible here; confirm that is intended.
+            throw new \RuntimeException('JavaScript scraping failed: ' . $e->getMessage(), 0, $e);
+        }
+    }
+
+    public function supports(string $sourceType): bool
+    {
+        return 'javascript' === $sourceType;
+    }
+
+    /**
+     * Run the Puppeteer script in vertical mode (single page, with scrolling).
+     *
+     * @return list<string> cleaned, valid image URLs
+     */
+    private function scrapeVerticalReader(ScrapingRequest $request, string $scriptPath): array
+    {
+        $arguments = $this->buildProcessArguments($request, $scriptPath, 'vertical', [
+            '--wait-for-images=true',
+            '--scroll=true',
+        ]);
+
+        return $this->executeProcess(new Process($arguments));
+    }
+
+    /**
+     * Run the Puppeteer script in horizontal mode (one image per page, following the next-page selector).
+     *
+     * @return list<string> cleaned, valid image URLs
+     */
+    private function scrapeHorizontalReader(ScrapingRequest $request, string $scriptPath): array
+    {
+        $params = $request->getScrapingParameters();
+
+        $arguments = $this->buildProcessArguments($request, $scriptPath, 'horizontal', [
+            '--next-selector=' . $params['nextPageSelector'],
+            '--wait-for-images=true',
+        ]);
+
+        return $this->executeProcess(new Process($arguments));
+    }
+
+    /**
+     * Build the argument vector shared by both modes.
+     *
+     * Arguments are passed as an array, so the Process component runs the command
+     * without a shell and selector/URL values need no escaping.
+     *
+     * @param string       $mode          value of the --mode option
+     * @param list<string> $modeArguments mode-specific options, inserted after --image-selector
+     *
+     * @return list<string>
+     */
+    private function buildProcessArguments(
+        ScrapingRequest $request,
+        string $scriptPath,
+        string $mode,
+        array $modeArguments
+    ): array {
+        $params = $request->getScrapingParameters();
+
+        $arguments = [
+            self::NODE_EXECUTABLE,
+            $scriptPath,
+            '--mode=' . $mode,
+            '--url=' . $request->getChapterUrl(),
+            '--image-selector=' . $params['imageSelector'],
+            ...$modeArguments,
+        ];
+
+        // Optional chapter-selection options.
+        if (!empty($params['chapterSelector'])) {
+            $arguments[] = '--chapter-selector=' . $params['chapterSelector'];
+        }
+
+        if (isset($params['chapterNumber'])) {
+            $arguments[] = '--chapter-number=' . $params['chapterNumber'];
+        }
+
+        return $arguments;
+    }
+
+    /**
+     * Run the process and decode the last line of its standard output.
+     *
+     * @return list<string> cleaned, valid image URLs
+     *
+     * @throws ChapterNotFoundException when the script reports CHAPTER_NOT_FOUND
+     * @throws \RuntimeException        when the process fails or its output cannot be interpreted
+     */
+    private function executeProcess(Process $process): array
+    {
+        $process->setTimeout(self::PUPPETEER_TIMEOUT);
+        $process->run();
+
+        if (!$process->isSuccessful()) {
+            $error = $process->getErrorOutput() ?: $process->getOutput();
+            throw new \RuntimeException('Puppeteer process failed: ' . $error);
+        }
+
+        // Only the last line carries the result; earlier lines may be log output.
+        $lines = explode("\n", trim($process->getOutput()));
+        $resultLine = end($lines);
+
+        // Chapter not found.
+        if (str_starts_with($resultLine, self::CHAPTER_NOT_FOUND_PREFIX)) {
+            $errorData = json_decode(substr($resultLine, strlen(self::CHAPTER_NOT_FOUND_PREFIX)), true);
+
+            if (is_array($errorData) && isset($errorData['message'])) {
+                throw new ChapterNotFoundException($errorData['message']);
+            }
+
+            throw new ChapterNotFoundException('Le chapitre demandé n\'est pas disponible.');
+        }
+
+        // Normal case: a JSON array of image URLs.
+        if (str_starts_with($resultLine, self::RESULT_PREFIX)) {
+            $imageUrls = json_decode(substr($resultLine, strlen(self::RESULT_PREFIX)), true);
+
+            if (!is_array($imageUrls)) {
+                throw new \RuntimeException('Failed to parse Puppeteer output');
+            }
+
+            return $this->cleanImageUrls($imageUrls);
+        }
+
+        // Unrecognised output format.
+        throw new \RuntimeException('Invalid Puppeteer output format: ' . $resultLine);
+    }
+
+    /**
+     * Clean every URL and keep only the non-empty, syntactically valid ones.
+     *
+     * Non-string entries (possible in decoded JSON) are skipped rather than
+     * triggering a TypeError, and the result is a re-indexed list.
+     *
+     * @param array<mixed> $urls decoded script output
+     *
+     * @return list<string>
+     */
+    private function cleanImageUrls(array $urls): array
+    {
+        $cleaned = [];
+
+        foreach ($urls as $url) {
+            if (!is_string($url)) {
+                continue;
+            }
+
+            $url = $this->cleanImageUrl($url);
+
+            if ('' !== $url && false !== filter_var($url, FILTER_VALIDATE_URL)) {
+                $cleaned[] = $url;
+            }
+        }
+
+        return $cleaned;
+    }
+
+    /**
+     * Trim the URL and strip ASCII control characters.
+     */
+    private function cleanImageUrl(string $url): string
+    {
+        // preg_replace() returns null on a PCRE error; fall back to '' to honour the return type.
+        return preg_replace('/[\x00-\x1F\x7F]/', '', trim($url)) ?? '';
+    }
+}
--- a/src/Domain/Scraping/Infrastructure/Service/ScraperFactory.php
+++ b/src/Domain/Scraping/Infrastructure/Service/ScraperFactory.php
@@ -0,0 +1,146 @@
+<?php
+
+namespace App\Domain\Scraping\Infrastructure\Service;
+
+use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
+use App\Domain\Scraping\Domain\Contract\Service\ScraperFactoryInterface;
+use App\Domain\Scraping\Infrastructure\Service\Scraper\HtmlScraper;
+use App\Domain\Scraping\Infrastructure\Service\Scraper\AdvancedHtmlScraper;
+use App\Domain\Scraping\Infrastructure\Service\Scraper\JavaScriptScraper;
+use App\Domain\Scraping\Domain\Contract\Service\ImageDownloaderInterface;
+use Symfony\Component\Messenger\MessageBusInterface;
+use Symfony\Contracts\HttpClient\HttpClientInterface;
+
+/**
+ * Builds one instance of every known scraper at construction time and hands
+ * them out by type, by priority, or as a fallback.
+ */
+class ScraperFactory implements ScraperFactoryInterface
+{
+    /** Scraper type identifier => implementing class. */
+    private const SCRAPER_TYPES = [
+        'html' => HtmlScraper::class,
+        'advanced_html' => AdvancedHtmlScraper::class,
+        'javascript' => JavaScriptScraper::class,
+    ];
+
+    /** Scraper type identifier => rank; the lowest rank is considered the best scraper. */
+    private const SCRAPER_PRIORITIES = [
+        'javascript' => 1,      // most capable against site protections
+        'advanced_html' => 2,   // trade-off between speed and effectiveness
+        'html' => 3,           // simplest and fastest
+    ];
+
+    /** @var array<string, ScraperInterface> instances keyed by type identifier */
+    private array $scrapers = [];
+
+    public function __construct(
+        private readonly ImageDownloaderInterface $imageDownloader,
+        private readonly MessageBusInterface $eventBus,
+        private readonly HttpClientInterface $httpClient,
+        private readonly string $projectDir
+    ) {
+        $this->initializeScrapers();
+    }
+
+    /**
+     * Return the scraper registered for the given type.
+     *
+     * @throws \InvalidArgumentException when no scraper is registered for $type
+     */
+    public function createScraper(string $type): ScraperInterface
+    {
+        return $this->scrapers[$type]
+            ?? throw new \InvalidArgumentException("Scraper type '{$type}' is not supported");
+    }
+
+    /**
+     * Return the scraper whose type has the lowest rank in SCRAPER_PRIORITIES.
+     */
+    public function getBestScraper(): ScraperInterface
+    {
+        $ranking = self::SCRAPER_PRIORITIES;
+        asort($ranking);
+
+        return $this->scrapers[array_key_first($ranking)];
+    }
+
+    /**
+     * Return every scraper instance, keyed by type identifier.
+     */
+    public function getAvailableScrapers(): array
+    {
+        return $this->scrapers;
+    }
+
+    /**
+     * Return the list of known type identifiers.
+     */
+    public function getSupportedTypes(): array
+    {
+        return array_keys(self::SCRAPER_TYPES);
+    }
+
+    /**
+     * Tell whether the given type identifier is declared in SCRAPER_TYPES.
+     */
+    public function isSupported(string $type): bool
+    {
+        return isset(self::SCRAPER_TYPES[$type]);
+    }
+
+    /**
+     * Return the fallback scraper, i.e. the one registered under 'html'.
+     */
+    public function getFallbackScraper(): ScraperInterface
+    {
+        return $this->scrapers['html'];
+    }
+
+    /**
+     * Return the scraper for the preferred type when that type is supported,
+     * and the fallback scraper otherwise. No scraping is attempted here.
+     */
+    public function getScraperWithFallback(string $preferredType): ScraperInterface
+    {
+        return $this->isSupported($preferredType)
+            ? $this->scrapers[$preferredType]
+            : $this->getFallbackScraper();
+    }
+
+    /**
+     * Return a summary of the factory's configuration: scraper count, type
+     * identifiers, priority table, and the class names of the best and
+     * fallback scrapers.
+     */
+    public function getScraperStats(): array
+    {
+        $best = $this->getBestScraper();
+        $fallback = $this->getFallbackScraper();
+
+        return [
+            'total_scrapers' => count($this->scrapers),
+            'supported_types' => $this->getSupportedTypes(),
+            'priorities' => self::SCRAPER_PRIORITIES,
+            'best_scraper' => $best::class,
+            'fallback_scraper' => $fallback::class
+        ];
+    }
+
+    /**
+     * Instantiate every class listed in SCRAPER_TYPES, keeping the type identifiers as keys.
+     */
+    private function initializeScrapers(): void
+    {
+        // array_map() over a single array preserves its string keys.
+        $this->scrapers = array_map($this->createScraperInstance(...), self::SCRAPER_TYPES);
+    }
+
+    /**
+     * Instantiate one scraper class with the dependencies it needs.
+     *
+     * @throws \InvalidArgumentException when $class is not one of the known scraper classes
+     */
+    private function createScraperInstance(string $class): ScraperInterface
+    {
+        if (HtmlScraper::class === $class) {
+            return new HtmlScraper($this->imageDownloader, $this->eventBus, $this->httpClient);
+        }
+
+        if (AdvancedHtmlScraper::class === $class) {
+            return new AdvancedHtmlScraper($this->httpClient);
+        }
+
+        if (JavaScriptScraper::class === $class) {
+            return new JavaScriptScraper($this->projectDir);
+        }
+
+        throw new \InvalidArgumentException("Unknown scraper class: {$class}");
+    }
+}