From b456f9304de19f3291740cefbafd2ee77b40d227 Mon Sep 17 00:00:00 2001 From: "ext.jeremy.guillot@maxicoffee.domains" Date: Tue, 8 Jul 2025 15:30:22 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20ajout=20d'une=20nouvelle=20infrastructu?= =?UTF-8?q?re=20de=20scraping=20avec=20des=20scrapers=20pour=20HTML,=20HTM?= =?UTF-8?q?L=20avanc=C3=A9=20et=20JavaScript,=20ainsi=20qu'une=20factory?= =?UTF-8?q?=20pour=20g=C3=A9rer=20leur=20cr=C3=A9ation=20et=20leur=20s?= =?UTF-8?q?=C3=A9lection.=20Mise=20=C3=A0=20jour=20des=20gestionnaires=20d?= =?UTF-8?q?e=20commandes=20pour=20int=C3=A9grer=20cette=20nouvelle=20archi?= =?UTF-8?q?tecture=20et=20am=C3=A9liorer=20la=20gestion=20des=20erreurs=20?= =?UTF-8?q?lors=20du=20scraping=20des=20chapitres.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/services.yaml | 23 + public/puppeteer-scraper.js | 520 ++++++++++++++++++ .../CommandHandler/ScrapeChapterHandler.php | 22 +- .../TestScraperConfigurationHandler.php | 114 +++- .../Service/ScraperFactoryInterface.php | 36 ++ .../Exception/ChapterNotFoundException.php | 6 +- .../Domain/Model/ValueObject/ChapterUrl.php | 4 +- .../Service/Scraper/AdvancedHtmlScraper.php | 252 +++++++++ .../Service/Scraper/JavaScriptScraper.php | 157 ++++++ .../Infrastructure/Service/ScraperFactory.php | 146 +++++ 10 files changed, 1244 insertions(+), 36 deletions(-) create mode 100644 public/puppeteer-scraper.js create mode 100644 src/Domain/Scraping/Domain/Contract/Service/ScraperFactoryInterface.php create mode 100644 src/Domain/Scraping/Infrastructure/Service/Scraper/AdvancedHtmlScraper.php create mode 100644 src/Domain/Scraping/Infrastructure/Service/Scraper/JavaScriptScraper.php create mode 100644 src/Domain/Scraping/Infrastructure/Service/ScraperFactory.php diff --git a/config/services.yaml b/config/services.yaml index 084af5e..a11740d 100644 --- a/config/services.yaml +++ b/config/services.yaml @@ -93,6 +93,29 @@ services: arguments: $scraperFactory: 
'@App\Service\Scraper\ScraperFactory' + # New Scrapers Factory for Domain Layer + App\Domain\Scraping\Infrastructure\Service\ScraperFactory: + arguments: + $projectDir: '%kernel.project_dir%' + + # Scraper Factory Interface alias + App\Domain\Scraping\Domain\Contract\Service\ScraperFactoryInterface: + alias: App\Domain\Scraping\Infrastructure\Service\ScraperFactory + + # Test Scraper Configuration Handler + App\Domain\Scraping\Application\CommandHandler\TestScraperConfigurationHandler: ~ + + # JavaScript Scraper + App\Domain\Scraping\Infrastructure\Service\Scraper\JavaScriptScraper: + arguments: + $projectDir: '%kernel.project_dir%' + + # Advanced HTML Scraper + App\Domain\Scraping\Infrastructure\Service\Scraper\AdvancedHtmlScraper: ~ + + # Scrape Chapter Handler + App\Domain\Scraping\Application\CommandHandler\ScrapeChapterHandler: ~ + App\Domain\Scraping\Infrastructure\CommandHandler\SymfonyScrapeChapterHandler: tags: - { name: messenger.message_handler, bus: command.bus } diff --git a/public/puppeteer-scraper.js b/public/puppeteer-scraper.js new file mode 100644 index 0000000..c681185 --- /dev/null +++ b/public/puppeteer-scraper.js @@ -0,0 +1,520 @@ +const puppeteer = require('puppeteer'); + +// Configuration par défaut +const CONFIG = { + // Timeout en millisecondes + PAGE_TIMEOUT: 30000, + NAVIGATION_TIMEOUT: 10000, + SCROLL_DELAY: 100, + SCROLL_DISTANCE: 100, + // Timeout réduit pour la détection d'erreur + ERROR_DETECTION_TIMEOUT: 5000, + + // User agents pour contourner la détection + USER_AGENTS: [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' + ], + + // Arguments pour contourner la détection + BROWSER_ARGS: [ + '--no-sandbox', + '--disable-setuid-sandbox', + 
'--disable-dev-shm-usage', + '--disable-accelerated-2d-canvas', + '--no-first-run', + '--no-zygote', + '--single-process', + '--disable-gpu', + '--disable-web-security', + '--disable-features=VizDisplayCompositor', + '--disable-blink-features=AutomationControlled' + ] +}; + +class ChapterNotFoundError extends Error { + constructor(chapterNumber) { + super(`Chapter ${chapterNumber} not found`); + this.name = 'ChapterNotFoundError'; + this.chapterNumber = chapterNumber; + } +} + +class PuppeteerScraper { + constructor(options = {}) { + this.options = this.parseArguments(options); + this.browser = null; + this.page = null; + this.imageUrls = new Set(); + this.lastResponseStatus = null; + this.navigationError = null; + } + + parseArguments(options) { + const args = process.argv.slice(2); + const parsed = { ...options }; + + args.forEach(arg => { + if (arg.startsWith('--')) { + const [key, value] = arg.substring(2).split('='); + parsed[key.replace(/-/g, '_')] = value === 'true' ? true : value === 'false' ? 
false : value; + } + }); + + return parsed; + } + + async launch() { + // Essayer de trouver un exécutable Chrome/Chromium disponible + const possiblePaths = [ + process.env.CHROME_BIN, + '/usr/bin/chromium', + '/usr/bin/chromium-browser', + '/usr/bin/google-chrome', + '/usr/bin/google-chrome-stable', + '/snap/bin/chromium' + ].filter(path => path); // Supprimer les valeurs nulles/undefined + + let executablePath = null; + + // Vérifier si on peut utiliser un des chemins + for (const path of possiblePaths) { + try { + const fs = require('fs'); + if (fs.existsSync(path)) { + executablePath = path; + console.log(`Using Chrome at: ${path}`); + break; + } + } catch (e) { + // Continuer avec le chemin suivant + } + } + + // Si aucun exécutable trouvé, laisser Puppeteer utiliser celui installé via npm + this.browser = await puppeteer.launch({ + headless: 'new', + executablePath: executablePath, + args: CONFIG.BROWSER_ARGS + }); + + this.page = await this.browser.newPage(); + + // Configuration anti-détection + await this.setupAntiDetection(); + + console.log('Browser launched and configured'); + } + + async setupAntiDetection() { + // Rotation des User-Agent + const userAgent = CONFIG.USER_AGENTS[Math.floor(Math.random() * CONFIG.USER_AGENTS.length)]; + await this.page.setUserAgent(userAgent); + + // Écouter les réponses pour détecter rapidement les erreurs HTTP + this.page.on('response', (response) => { + // Ne surveiller que les réponses de navigation principales + if (response.request().isNavigationRequest()) { + this.lastResponseStatus = response.status(); + + if (response.status() >= 400) { + this.navigationError = { + status: response.status(), + statusText: response.statusText(), + url: response.url() + }; + console.log(`❌ HTTP Error ${response.status()} detected for: ${response.url()}`); + } + } + }); + + // Désactiver seulement les fonts et certains styles pour optimiser + await this.page.setRequestInterception(true); + this.page.on('request', (request) => { + 
if (['font'].includes(request.resourceType())) { + request.abort(); + } else { + request.continue(); + } + }); + + // Masquer les propriétés de détection de Puppeteer + await this.page.evaluateOnNewDocument(() => { + Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); + Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); + Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); + window.chrome = { runtime: {} }; + }); + + // Viewport aléatoire + await this.page.setViewport({ + width: 1366 + Math.floor(Math.random() * 200), + height: 768 + Math.floor(Math.random() * 200) + }); + } + + async navigateToPage(url, useReducedTimeout = false) { + // Reset des variables de détection d'erreur + this.lastResponseStatus = null; + this.navigationError = null; + + const timeout = useReducedTimeout ? CONFIG.ERROR_DETECTION_TIMEOUT : CONFIG.PAGE_TIMEOUT; + + try { + console.log(`🌐 Navigating to: ${url}`); + + const response = await this.page.goto(url, { + waitUntil: 'domcontentloaded', // Plus rapide que networkidle2 pour la détection d'erreur + timeout: timeout + }); + + // Vérifier immédiatement le code de statut de la réponse + if (response && response.status() >= 400) { + throw new Error(`HTTP ${response.status()}: ${response.statusText()} for URL: ${url}`); + } + + // Si pas d'erreur immédiate, attendre que le contenu se charge complètement + if (!this.navigationError) { + // Attendre un court délai pour permettre aux ressources de se charger + await new Promise(resolve => setTimeout(resolve, 1000)); + } else { + throw new Error(`Navigation error: ${this.navigationError.status} ${this.navigationError.statusText}`); + } + + console.log(`✅ Successfully loaded: ${url}`); + return response; + + } catch (error) { + // Distinguer les erreurs de timeout des erreurs HTTP + if (error.message.includes('HTTP 4') || error.message.includes('HTTP 5')) { + console.log(`🚫 HTTP Error detected quickly: ${error.message}`); + 
throw error; + } else if (error.message.includes('timeout')) { + console.log(`⏱️ Navigation timeout for: ${url}`); + throw new Error(`Navigation timeout after ${timeout}ms for URL: ${url}`); + } else { + console.log(`❌ Navigation error: ${error.message}`); + throw error; + } + } + } + + async navigateToPageWithFallback(url) { + try { + // Première tentative avec timeout réduit pour détection rapide d'erreur + return await this.navigateToPage(url, true); + } catch (error) { + if (error.message.includes('HTTP 4') || error.message.includes('HTTP 5')) { + // Erreur HTTP confirmée, ne pas réessayer + throw error; + } + + // Si c'est un timeout, réessayer avec timeout complet + console.log(`🔄 Quick check failed, retrying with full timeout...`); + return await this.navigateToPage(url, false); + } + } + + async selectChapter(chapterSelector, chapterNumber) { + try { + console.log(`📚 Looking for chapter selector: ${chapterSelector}`); + + // Attendre que le sélecteur soit présent + await this.page.waitForSelector(chapterSelector, { timeout: CONFIG.NAVIGATION_TIMEOUT }); + + // Lister toutes les options disponibles + const options = await this.page.$$eval(chapterSelector + ' option', opts => + opts.map(opt => ({ + value: opt.value, + text: opt.textContent.trim(), + selected: opt.selected + })) + ); + + console.log(`📖 Found ${options.length} chapter options`); + + // Chercher l'option correspondant au chapitre demandé + const targetOption = options.find(opt => { + const text = opt.text.toLowerCase(); + const chapterStr = chapterNumber.toString(); + + return text.includes(chapterStr) || + text.includes(`chapitre ${chapterStr}`) || + text.includes(`chapter ${chapterStr}`) || + opt.value === chapterStr || + text.includes(`${chapterStr}.0`) || + text.includes(`${chapterStr} -`); + }); + + if (targetOption) { + console.log(`🎯 Found target chapter: ${targetOption.text} (value: ${targetOption.value})`); + + // Sélectionner le chapitre + await this.page.select(chapterSelector, 
targetOption.value); + console.log('✅ Chapter selected, waiting for page reload...'); + + // Attendre que la page se recharge après la sélection + try { + await this.page.waitForNavigation({ + waitUntil: 'domcontentloaded', + timeout: CONFIG.ERROR_DETECTION_TIMEOUT + }); + console.log('🔄 Page reloaded after chapter selection'); + } catch (error) { + console.log(`⚠️ Warning during chapter navigation: ${error.message}`); + // Attendre un peu même si la navigation échoue + await new Promise(resolve => setTimeout(resolve, 2000)); + } + + } else { + // Lancer une exception spécifique pour le chapitre non trouvé + throw new ChapterNotFoundError(chapterNumber); + } + + } catch (error) { + if (error instanceof ChapterNotFoundError) { + // Re-lancer l'exception pour qu'elle soit gérée en amont + throw error; + } + console.log(`⚠️ Error selecting chapter: ${error.message}`); + // Continuer même si la sélection échoue pour les autres erreurs + } + } + + async scrapeVertical() { + const url = this.options.url; + const imageSelector = this.options.image_selector; + const waitForImages = this.options.wait_for_images === 'true'; + const shouldScroll = this.options.scroll === 'true'; + const chapterSelector = this.options.chapter_selector; + const chapterNumber = this.options.chapter_number; + + try { + await this.navigateToPageWithFallback(url); + } catch (error) { + if (error.message.includes('HTTP 4') || error.message.includes('HTTP 5')) { + console.log(`🚫 Cannot access page: ${error.message}`); + return []; // Retourner un tableau vide pour les erreurs HTTP + } + throw error; // Re-lancer les autres erreurs + } + + // Gérer la sélection de chapitre si nécessaire + if (chapterSelector && chapterNumber) { + try { + await this.selectChapter(chapterSelector, chapterNumber); + } catch (error) { + if (error instanceof ChapterNotFoundError) { + console.log(`📚 MANGA_EXISTS_BUT_CHAPTER_NOT_FOUND: ${error.message}`); + return { + error: 'CHAPTER_NOT_FOUND', + message: `Le manga existe 
mais le chapitre ${error.chapterNumber} n'est pas disponible.`, + images: [] + }; + } + throw error; // Re-lancer les autres erreurs + } + } + + // Attendre le sélecteur d'image + if (waitForImages) { + await this.page.waitForSelector(imageSelector, { timeout: CONFIG.NAVIGATION_TIMEOUT }); + } + + // Scroll pour charger toutes les images lazy-load + if (shouldScroll) { + await this.autoScroll(); + } + + // Attendre un peu pour que les images se chargent (plus de temps pour lazy loading) + await new Promise(resolve => setTimeout(resolve, 3000)); + + // Collecter les URLs d'images + const imageUrls = await this.page.$$eval(imageSelector, imgs => { + return imgs.map(img => { + // Priorité au src, puis aux attributs data-* + return img.src || + img.getAttribute('src') || + img.getAttribute('data-src') || + img.getAttribute('data-lazy-src') || + img.getAttribute('data-original'); + }).filter(url => url && url !== 'about:blank'); + }); + + console.log(`Found ${imageUrls.length} images`); + return imageUrls; + } + + async scrapeHorizontal() { + const url = this.options.url; + const imageSelector = this.options.image_selector; + const nextSelector = this.options.next_selector; + const waitForImages = this.options.wait_for_images === 'true'; + const chapterSelector = this.options.chapter_selector; + const chapterNumber = this.options.chapter_number; + + let currentUrl = url; + let pageCount = 0; + const maxPages = 200; // Limite de sécurité + + while (currentUrl && pageCount < maxPages) { + console.log(`Scraping page ${pageCount + 1}: ${currentUrl}`); + + try { + await this.navigateToPageWithFallback(currentUrl); + } catch (error) { + if (error.message.includes('HTTP 4') || error.message.includes('HTTP 5')) { + console.log(`🚫 Cannot access page ${pageCount + 1}: ${error.message}`); + break; // Arrêter le scraping si on rencontre une 404 + } + // Pour les autres erreurs, essayer de continuer + console.log(`⚠️ Warning on page ${pageCount + 1}: ${error.message}, 
continuing...`); + } + + // Gérer la sélection de chapitre pour la première page seulement + if (pageCount === 0 && chapterSelector && chapterNumber) { + try { + await this.selectChapter(chapterSelector, chapterNumber); + } catch (error) { + if (error instanceof ChapterNotFoundError) { + console.log(`📚 MANGA_EXISTS_BUT_CHAPTER_NOT_FOUND: ${error.message}`); + return { + error: 'CHAPTER_NOT_FOUND', + message: `Le manga existe mais le chapitre ${error.chapterNumber} n'est pas disponible.`, + images: [] + }; + } + throw error; // Re-lancer les autres erreurs + } + } + + // Attendre le sélecteur d'image + if (waitForImages) { + try { + await this.page.waitForSelector(imageSelector, { timeout: CONFIG.NAVIGATION_TIMEOUT }); + } catch (e) { + console.log(`No image found on page ${pageCount + 1}, skipping`); + break; + } + } + + // Récupérer l'image de la page + const imageUrl = await this.page.$eval(imageSelector, img => { + return img.src || + img.getAttribute('src') || + img.getAttribute('data-src') || + img.getAttribute('data-lazy-src') || + img.getAttribute('data-original'); + }).catch(() => null); + + if (imageUrl) { + this.imageUrls.add(imageUrl); + console.log(`Image found: ${imageUrl}`); + } + + // Chercher le bouton/lien suivant + const nextElement = await this.page.$(nextSelector); + if (!nextElement) { + console.log('No next button found, ending scraping'); + break; + } + + // Récupérer l'URL suivante + currentUrl = await nextElement.evaluate(el => { + return el.href || el.getAttribute('href'); + }); + + if (!currentUrl) { + console.log('No next URL found, ending scraping'); + break; + } + + pageCount++; + await new Promise(resolve => setTimeout(resolve, 1000)); // Pause entre les pages + } + + return Array.from(this.imageUrls); + } + + async autoScroll() { + await this.page.evaluate(async (config) => { + await new Promise((resolve) => { + let totalHeight = 0; + let lastHeight = 0; + + const timer = setInterval(() => { + const scrollHeight = 
document.body.scrollHeight; + + // Si la hauteur a changé, on continue + if (scrollHeight !== lastHeight) { + lastHeight = scrollHeight; + totalHeight = 0; // Reset le counter car plus de contenu apparaît + } + + window.scrollBy(0, config.SCROLL_DISTANCE); + totalHeight += config.SCROLL_DISTANCE; + + // Arrêter si on a atteint le bas ET que rien de nouveau ne charge + if (totalHeight >= scrollHeight) { + clearInterval(timer); + // Scroll final jusqu'à la vraie fin + window.scrollTo(0, document.body.scrollHeight); + resolve(); + } + }, config.SCROLL_DELAY); + }); + }, CONFIG); + } + + async close() { + if (this.browser) { + await this.browser.close(); + } + } +} + +(async () => { + const scraper = new PuppeteerScraper(); + + try { + await scraper.launch(); + + let result = []; + + if (scraper.options.mode === 'vertical') { + result = await scraper.scrapeVertical(); + } else if (scraper.options.mode === 'horizontal') { + result = await scraper.scrapeHorizontal(); + } else { + throw new Error('Invalid mode. Use --mode=vertical or --mode=horizontal'); + } + + // Vérifier si le résultat est un objet d'erreur ou un tableau d'URLs + if (result && typeof result === 'object' && result.error === 'CHAPTER_NOT_FOUND') { + // Cas où le chapitre n'est pas trouvé + console.log(`CHAPTER_NOT_FOUND:${JSON.stringify(result)}`); + } else { + // Cas normal - nettoyer les URLs + const imageUrls = Array.isArray(result) ? 
result : []; + const cleanUrls = imageUrls.filter(url => url && typeof url === 'string'); + console.log(`RESULT:${JSON.stringify(cleanUrls)}`); + } + + } catch (error) { + if (error instanceof ChapterNotFoundError) { + // Cette erreur est déjà gérée dans les fonctions de scraping + // Mais au cas où elle remonterait ici + console.log(`CHAPTER_NOT_FOUND:${JSON.stringify({ + error: 'CHAPTER_NOT_FOUND', + message: `Le manga existe mais le chapitre ${error.chapterNumber} n'est pas disponible.`, + images: [] + })}`); + } else { + console.error('Error:', error.message); + process.exit(1); + } + } finally { + await scraper.close(); + } +})(); diff --git a/src/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandler.php b/src/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandler.php index 1e8bf10..ecbcb14 100644 --- a/src/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandler.php +++ b/src/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandler.php @@ -9,6 +9,7 @@ use App\Domain\Scraping\Domain\Contract\Repository\SourceRepositoryInterface; use App\Domain\Scraping\Domain\Contract\Service\CbzGeneratorInterface; use App\Domain\Scraping\Domain\Contract\Service\ImageDownloaderInterface; use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface; +use App\Domain\Scraping\Domain\Contract\Service\ScraperFactoryInterface; use App\Domain\Scraping\Domain\Event\ChapterScraped; use App\Domain\Scraping\Domain\Event\ChapterScrapingFailed; use App\Domain\Scraping\Domain\Model\Chapter; @@ -25,7 +26,7 @@ use Doctrine\ORM\EntityManagerInterface; readonly class ScrapeChapterHandler { public function __construct( - private ScraperInterface $scraper, + private ScraperFactoryInterface $scraperFactory, private ImageDownloaderInterface $imageDownloader, private CbzGeneratorInterface $cbzGenerator, private JobRepositoryInterface $jobRepository, @@ -87,13 +88,19 @@ readonly class ScrapeChapterHandler $this->entityManager->beginTransaction(); // 5. 
Scraping des URLs avec le slug courant + $scrapingParameters = $source->getScrappingParameters(); + $scrapingParameters['chapterNumber'] = $chapter->chapterNumber; + $scrapingType = $scrapingParameters['scrapingType'] ?? 'html'; + $scrapingRequest = new ScrapingRequest( - 'html', + $scrapingType, $source->buildChapterUrl($slug, $chapter->chapterNumber), - $source->getScrappingParameters() + $scrapingParameters ); - $scrapingResult = $this->scraper->scrape($scrapingRequest); + // Sélection du scraper approprié selon le type + $scraper = $this->scraperFactory->getScraperWithFallback($scrapingType); + $scrapingResult = $scraper->scrape($scrapingRequest); // 6. Téléchargement des images $tempDir = new TempDirectory(); @@ -134,6 +141,8 @@ readonly class ScrapeChapterHandler break; } catch (\Exception $e) { + dump('EXCEPTION for source ' . $source->getName() . ' with slug ' . $slug . ': ' . $e->getMessage()); + $this->entityManager->rollback(); if (isset($job)) { @@ -184,6 +193,11 @@ readonly class ScrapeChapterHandler if ($source) { $preferredSources[] = $source; } + + // Limiter à 3 sources préférées maximum + if (count($preferredSources) >= 3) { + break; + } } if (!empty($preferredSources)) { diff --git a/src/Domain/Scraping/Application/CommandHandler/TestScraperConfigurationHandler.php b/src/Domain/Scraping/Application/CommandHandler/TestScraperConfigurationHandler.php index 29fb155..5ebbeb4 100644 --- a/src/Domain/Scraping/Application/CommandHandler/TestScraperConfigurationHandler.php +++ b/src/Domain/Scraping/Application/CommandHandler/TestScraperConfigurationHandler.php @@ -5,46 +5,43 @@ namespace App\Domain\Scraping\Application\CommandHandler; use App\Domain\Scraping\Application\Command\TestScraperConfiguration; use App\Domain\Scraping\Application\Response\TestScraperConfigurationResponse; use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface; +use App\Domain\Scraping\Domain\Contract\Service\ScraperFactoryInterface; use 
App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest; readonly class TestScraperConfigurationHandler { public function __construct( - private ScraperInterface $scraper + private ScraperFactoryInterface $scraperFactory ) {} public function handle(TestScraperConfiguration $command): TestScraperConfigurationResponse { + // Construction des paramètres de scraping depuis les données de la commande + $scrapingParameters = [ + 'imageSelector' => $command->imageSelector, + 'nextPageSelector' => $command->nextPageSelector, + 'chapterUrlFormat' => $command->chapterUrlFormat, + 'scrapingType' => $command->scrapingType, + 'chapterSelector' => $command->chapterSelector, + 'chapterNumber' => $command->chapterNumber + ]; + // Vérification que le scraper supporte le type de scraping + if (!$this->scraperFactory->isSupported($command->scrapingType)) { + return $this->tryWithFallbackScrapers($command, $scrapingParameters); + } + + // Essayer avec le scraper demandé try { - // Construction des paramètres de scraping depuis les données de la commande - $scrapingParameters = [ - 'imageSelector' => $command->imageSelector, - 'nextPageSelector' => $command->nextPageSelector, - 'chapterUrlFormat' => $command->chapterUrlFormat, - 'scrapingType' => $command->scrapingType, - 'chapterSelector' => $command->chapterSelector - ]; - - // Vérification que le scraper supporte le type de scraping - if (!$this->scraper->supports($command->scrapingType)) { - return TestScraperConfigurationResponse::failure( - $command->testUrl, - $command->scrapingType, - ["Type de scraping '{$command->scrapingType}' non supporté"] - ); - } - - // Création de la requête de scraping avec l'URL de test fournie directement + $scraper = $this->scraperFactory->createScraper($command->scrapingType); $scrapingRequest = new ScrapingRequest( $command->scrapingType, $command->testUrl, $scrapingParameters ); - // Tentative de scraping - $scrapingResult = $this->scraper->scrape($scrapingRequest); + $scrapingResult = 
$scraper->scrape($scrapingRequest); + - // Retour du succès avec les URLs trouvées return TestScraperConfigurationResponse::success( $scrapingResult->getImageUrls(), $command->testUrl, @@ -52,17 +49,80 @@ readonly class TestScraperConfigurationHandler ); } catch (\Exception $e) { - // Analyse de l'erreur pour fournir un message plus détaillé - $errors = $this->analyzeError($e, $command); - return TestScraperConfigurationResponse::failure( $command->testUrl, $command->scrapingType, - $errors + $this->analyzeError($e, $command) ); } } + private function tryWithFallbackScrapers( + TestScraperConfiguration $command, + array $scrapingParameters, + ?\Exception $originalException = null + ): TestScraperConfigurationResponse { + $errors = []; + + if ($originalException) { + $errors[] = [ + 'type' => 'primary_scraper_failed', + 'scraper' => $command->scrapingType, + 'message' => $originalException->getMessage() + ]; + } + + // Essayer avec tous les scrapers disponibles + $availableScrapers = $this->scraperFactory->getSupportedTypes(); + $triedScrapers = []; + + foreach ($availableScrapers as $scraperType) { + if ($scraperType === $command->scrapingType) { + continue; // Déjà essayé + } + + try { + $scraper = $this->scraperFactory->createScraper($scraperType); + $scrapingRequest = new ScrapingRequest( + $scraperType, + $command->testUrl, + $scrapingParameters + ); + + $scrapingResult = $scraper->scrape($scrapingRequest); + + // Succès avec un scraper alternatif + return TestScraperConfigurationResponse::success( + $scrapingResult->getImageUrls(), + $command->testUrl, + $scraperType, // Retourner le type de scraper qui a fonctionné + "Scraper alternatif utilisé: {$scraperType} (au lieu de {$command->scrapingType})" + ); + + } catch (\Exception $e) { + $triedScrapers[] = $scraperType; + $errors[] = [ + 'type' => 'fallback_scraper_failed', + 'scraper' => $scraperType, + 'message' => $e->getMessage() + ]; + } + } + + // Tous les scrapers ont échoué + $errors[] = [ + 'type' => 
'all_scrapers_failed', + 'message' => 'Aucun scraper disponible n\'a réussi à traiter cette URL', + 'tried_scrapers' => array_merge([$command->scrapingType], $triedScrapers) + ]; + + return TestScraperConfigurationResponse::failure( + $command->testUrl, + $command->scrapingType, + $errors + ); + } + private function analyzeError(\Exception $e, TestScraperConfiguration $command): array { $errors = []; diff --git a/src/Domain/Scraping/Domain/Contract/Service/ScraperFactoryInterface.php b/src/Domain/Scraping/Domain/Contract/Service/ScraperFactoryInterface.php new file mode 100644 index 0000000..69ff6b2 --- /dev/null +++ b/src/Domain/Scraping/Domain/Contract/Service/ScraperFactoryInterface.php @@ -0,0 +1,36 @@ +httpClient) { + $this->httpClient = HttpClient::create([ + 'timeout' => self::REQUEST_TIMEOUT, + 'verify_peer' => false, + 'verify_host' => false + ]); + } + } + + public function scrape(ScrapingRequest $request): ScrapingResult + { + $scrapingParameters = $request->getScrapingParameters(); + + try { + $pages = !$scrapingParameters['nextPageSelector'] + ? $this->scrapeVerticalReader($request) + : $this->scrapeHorizontalReader($request); + + return new ScrapingResult($pages, count($pages)); + } catch (\Exception $e) { + throw new \RuntimeException('Advanced HTML scraping failed: ' . 
$e->getMessage(), 0, $e); + } + } + + public function supports(string $sourceType): bool + { + return 'advanced_html' === $sourceType; + } + + private function scrapeVerticalReader(ScrapingRequest $request): array + { + $html = $this->fetchHtmlWithRetry($request->getChapterUrl()); + $crawler = new Crawler($html); + $params = $request->getScrapingParameters(); + + $images = $crawler->filter($params['imageSelector']) + ->each(function ($node) { + // Essayer plusieurs attributs pour trouver l'URL de l'image + $src = $node->attr('src') ?: + $node->attr('data-src') ?: + $node->attr('data-lazy-src') ?: + $node->attr('data-original') ?: + $node->attr('data-zoom-image') ?: + $node->attr('data-full-src'); + + return $this->cleanImageUrl($src); + }); + + return array_filter($images, fn($url) => !empty($url)); + } + + private function scrapeHorizontalReader(ScrapingRequest $request): array + { + $pages = []; + $currentUrl = $request->getChapterUrl(); + $params = $request->getScrapingParameters(); + $visitedUrls = new \SplObjectStorage(); + $maxPages = 200; // Limite de sécurité + $pageCount = 0; + + while ($currentUrl && $pageCount < $maxPages) { + // Éviter les boucles infinies + if (isset($visitedUrls[$currentUrl])) { + break; + } + $visitedUrls[$currentUrl] = true; + + $html = $this->fetchHtmlWithRetry($currentUrl); + $crawler = new Crawler($html); + + // Récupérer l'image de la page + $imageNode = $crawler->filter($params['imageSelector'])->first(); + if ($imageNode->count() > 0) { + $imageUrl = $imageNode->attr('src') ?: + $imageNode->attr('data-src') ?: + $imageNode->attr('data-lazy-src') ?: + $imageNode->attr('data-original'); + + if ($imageUrl) { + $imageUrl = $this->resolveRelativeUrl($imageUrl, $currentUrl); + $pages[] = $this->cleanImageUrl($imageUrl); + } + } + + // Chercher le lien suivant + $nextLink = $crawler->filter($params['nextPageSelector'])->first(); + if ($nextLink->count() === 0) { + break; + } + + $nextUrl = $nextLink->attr('href'); + if (!$nextUrl) { 
+                break;
+            }
+
+            $currentUrl = $this->resolveRelativeUrl($nextUrl, $currentUrl);
+            $pageCount++;
+
+            // Pause between requests to avoid detection
+            sleep(1);
+        }
+
+        return array_filter($pages, fn($url) => !empty($url));
+    }
+
+    /**
+     * Fetch a page, retrying up to self::RETRY_ATTEMPTS times.
+     *
+     * The wait between attempts grows linearly (RETRY_DELAY * attempt number,
+     * in seconds, as passed to sleep()).
+     *
+     * @throws \Exception the exception raised by the last failed attempt
+     */
+    private function fetchHtmlWithRetry(string $url): string
+    {
+        $lastException = null;
+
+        for ($attempt = 1; $attempt <= self::RETRY_ATTEMPTS; $attempt++) {
+            try {
+                return $this->fetchHtml($url);
+            } catch (\Exception $e) {
+                $lastException = $e;
+
+                if ($attempt < self::RETRY_ATTEMPTS) {
+                    // Wait before retrying; the delay grows with each attempt
+                    sleep(self::RETRY_DELAY * $attempt);
+                }
+            }
+        }
+
+        // $lastException is still null when RETRY_ATTEMPTS is below 1 (the loop
+        // never ran); throwing null would be a TypeError, so fail explicitly.
+        throw $lastException ?? new \RuntimeException("No fetch attempt was made for URL: {$url}");
+    }
+
+    /**
+     * Perform a single GET request and return the response body.
+     *
+     * @throws \RuntimeException on HTTP status >= 400, when a Cloudflare
+     *                           challenge marker is found in the body, or when
+     *                           the HTTP client itself fails (original error
+     *                           is attached as the previous exception)
+     */
+    private function fetchHtml(string $url): string
+    {
+        $headers = $this->generateHeaders();
+
+        try {
+            $response = $this->httpClient->request('GET', $url, [
+                'headers' => $headers,
+                'timeout' => self::REQUEST_TIMEOUT
+            ]);
+
+            $statusCode = $response->getStatusCode();
+
+            if ($statusCode >= 400) {
+                throw new \RuntimeException("HTTP {$statusCode} error for URL: {$url}");
+            }
+
+            $content = $response->getContent();
+
+            // Check whether Cloudflare served its challenge page instead of the content
+            if (strpos($content, 'cf-browser-verification') !== false ||
+                strpos($content, 'Checking your browser') !== false) {
+                throw new \RuntimeException('Blocked by Cloudflare protection');
+            }
+
+            return $content;
+        } catch (\Exception $e) {
+            // Note: this also re-wraps the RuntimeExceptions thrown just above
+            throw new \RuntimeException('Failed to fetch HTML: ' . $e->getMessage(), 0, $e);
+        }
+    }
+
+    /**
+     * Build browser-like request headers, picking a random User-Agent,
+     * Accept and Accept-Language value on every call.
+     */
+    private function generateHeaders(): array
+    {
+        // Accept-Encoding is intentionally NOT set here.
+        // NOTE(review): assumes $this->httpClient is a Symfony HttpClient. That
+        // client negotiates compression itself and, per its documentation, no
+        // longer decodes the body transparently once Accept-Encoding is
+        // supplied by the caller; the Cloudflare check and the HTML parsing
+        // would then receive compressed bytes.
+        return [
+            'User-Agent' => self::USER_AGENTS[array_rand(self::USER_AGENTS)],
+            'Accept' => self::ACCEPT_HEADERS[array_rand(self::ACCEPT_HEADERS)],
+            'Accept-Language' => self::ACCEPT_LANGUAGE_HEADERS[array_rand(self::ACCEPT_LANGUAGE_HEADERS)],
+            'DNT' => '1',
+            'Connection' => 'keep-alive',
+            'Upgrade-Insecure-Requests' => '1',
+            'Sec-Fetch-Dest' => 'document',
+            'Sec-Fetch-Mode' => 'navigate',
+            'Sec-Fetch-Site' => 'none',
+            'Sec-Fetch-User' => '?1',
+            'Cache-Control' => 'max-age=0'
+        ];
+    }
+
+    /**
+     * Turn a link found on a page into an absolute URL.
+     *
+     * Handles absolute (http/https), protocol-relative (//host/path),
+     * root-relative (/path), query-only (?page=2) and path-relative links.
+     * Dot segments ("../") are not normalised.
+     *
+     * @param string $url     the link as found in the page
+     * @param string $baseUrl the absolute URL of the page containing the link
+     *
+     * @throws \RuntimeException when $baseUrl has no scheme or host
+     */
+    private function resolveRelativeUrl(string $url, string $baseUrl): string
+    {
+        if (preg_match('/^https?:\/\//', $url)) {
+            return $url;
+        }
+
+        $parsedBase = parse_url($baseUrl);
+
+        if (!is_array($parsedBase) || !isset($parsedBase['scheme'], $parsedBase['host'])) {
+            throw new \RuntimeException("Cannot resolve '{$url}' against invalid base URL: {$baseUrl}");
+        }
+
+        $scheme = $parsedBase['scheme'];
+        $port = isset($parsedBase['port']) ? ':' . $parsedBase['port'] : '';
+        $origin = $scheme . '://' . $parsedBase['host'] . $port;
+        $basePath = $parsedBase['path'] ?? '/';
+
+        if (strpos($url, '//') === 0) {
+            // Protocol-relative URL: only the scheme is inherited from the base
+            return $scheme . ':' . $url;
+        }
+
+        if (strpos($url, '/') === 0) {
+            // Path relative to the site root
+            return $origin . $url;
+        }
+
+        if (strpos($url, '?') === 0) {
+            // Query-only reference: keep the base path, replace the query
+            return $origin . $basePath . $url;
+        }
+
+        // Path relative to the base document's directory, i.e. everything up
+        // to and including the last "/". dirname() is not used because it
+        // drops the last directory of a path ending in "/" and returns "/"
+        // for single-segment paths, which produced "//" in the result.
+        $lastSlash = strrpos($basePath, '/');
+        $directory = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);
+
+        return $origin . $directory . $url;
+    }
+
+    /**
+     * Normalise an image URL: trim it, strip control characters and drop
+     * tracking query parameters (names starting with utm_, ref or source).
+     *
+     * @return string the cleaned URL, or '' for an empty input
+     */
+    private function cleanImageUrl(string $url): string
+    {
+        if (empty($url)) {
+            return '';
+        }
+
+        // Remove control characters
+        $url = preg_replace('/[\x00-\x1F\x7F]/', '', trim($url));
+
+        $queryStart = strpos($url, '?');
+        if ($queryStart === false) {
+            return $url;
+        }
+
+        $base = substr($url, 0, $queryStart);
+        $query = substr($url, $queryStart + 1);
+
+        // Keep a trailing fragment out of the parameter filtering
+        $fragment = '';
+        $fragmentStart = strpos($query, '#');
+        if ($fragmentStart !== false) {
+            $fragment = substr($query, $fragmentStart);
+            $query = substr($query, 0, $fragmentStart);
+        }
+
+        // Remove the tracking parameters and rebuild the query, so that the
+        // "?" separator survives when the first parameter is the one removed.
+        $kept = array_filter(
+            explode('&', $query),
+            static fn(string $pair): bool => !preg_match('/^(utm_|ref|source)/i', $pair)
+        );
+
+        return $base . ($kept ? '?' . implode('&', $kept) : '') . $fragment;
+    }
+}
diff --git a/src/Domain/Scraping/Infrastructure/Service/Scraper/JavaScriptScraper.php b/src/Domain/Scraping/Infrastructure/Service/Scraper/JavaScriptScraper.php
new file mode 100644
index 0000000..e455815
--- /dev/null
+++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/JavaScriptScraper.php
@@ -0,0 +1,157 @@
+getScrapingParameters();
+
+        try {
+            $scriptPath = $this->projectDir . self::PUPPETEER_SCRIPT_PATH;
+
+            if (!file_exists($scriptPath)) {
+                throw new \RuntimeException('Puppeteer script not found at: ' . $scriptPath);
+            }
+
+            $imageUrls = !empty($scrappingParameters['nextPageSelector'])
+                ? $this->scrapeHorizontalReader($request, $scriptPath)
+                : $this->scrapeVerticalReader($request, $scriptPath);
+
+            return new ScrapingResult($imageUrls, count($imageUrls));
+        } catch (\Exception $e) {
+            throw new \RuntimeException('JavaScript scraping failed: ' . $e->getMessage(), 0, $e);
+        }
+    }
+
+    /**
+     * Tell whether this scraper handles the given source type.
+     *
+     * @return bool true only for the 'javascript' source type
+     */
+    public function supports(string $sourceType): bool
+    {
+        return 'javascript' === $sourceType;
+    }
+
+    /**
+     * Run the Puppeteer script in "vertical" mode (scrolling enabled).
+     *
+     * @return array the cleaned image URLs reported by the script
+     */
+    private function scrapeVerticalReader(ScrapingRequest $request, string $scriptPath): array
+    {
+        $params = $request->getScrapingParameters();
+
+        $processArgs = [
+            self::NODE_EXECUTABLE,
+            $scriptPath,
+            '--mode=vertical',
+            '--url=' . $request->getChapterUrl(),
+            '--image-selector=' . $params['imageSelector'],
+            '--wait-for-images=true',
+            '--scroll=true',
+            ...$this->buildChapterArguments($params),
+        ];
+
+        return $this->executeProcess(new Process($processArgs));
+    }
+
+    /**
+     * Run the Puppeteer script in "horizontal" mode, passing it the
+     * next-page selector.
+     *
+     * @return array the cleaned image URLs reported by the script
+     */
+    private function scrapeHorizontalReader(ScrapingRequest $request, string $scriptPath): array
+    {
+        $params = $request->getScrapingParameters();
+
+        $processArgs = [
+            self::NODE_EXECUTABLE,
+            $scriptPath,
+            '--mode=horizontal',
+            '--url=' . $request->getChapterUrl(),
+            '--image-selector=' . $params['imageSelector'],
+            '--next-selector=' . $params['nextPageSelector'],
+            '--wait-for-images=true',
+            ...$this->buildChapterArguments($params),
+        ];
+
+        return $this->executeProcess(new Process($processArgs));
+    }
+
+    /**
+     * Build the optional chapter-related command-line arguments shared by
+     * both reader modes.
+     *
+     * @param array $params the scraping parameters of the request
+     *
+     * @return string[] zero, one or two "--chapter-*" arguments
+     */
+    private function buildChapterArguments(array $params): array
+    {
+        $arguments = [];
+
+        if (!empty($params['chapterSelector'])) {
+            $arguments[] = '--chapter-selector=' . $params['chapterSelector'];
+        }
+
+        // isset() rather than !empty(): a chapter number of 0 must still be passed
+        if (isset($params['chapterNumber'])) {
+            $arguments[] = '--chapter-number=' . $params['chapterNumber'];
+        }
+
+        return $arguments;
+    }
+
+    /**
+     * Run the Puppeteer process and decode the last line of its output.
+     *
+     * The last output line must be either "RESULT:<json array>" or
+     * "CHAPTER_NOT_FOUND:<json object>".
+     *
+     * @return array the cleaned image URLs
+     *
+     * @throws ChapterNotFoundException when the script reports a missing chapter
+     * @throws \RuntimeException        when the process fails or its output
+     *                                  cannot be interpreted
+     */
+    private function executeProcess(Process $process): array
+    {
+        $process->setTimeout(self::PUPPETEER_TIMEOUT);
+        $process->run();
+
+        if (!$process->isSuccessful()) {
+            $error = $process->getErrorOutput() ?: $process->getOutput();
+            throw new \RuntimeException('Puppeteer process failed: ' . $error);
+        }
+
+        $output = $process->getOutput();
+        $lines = explode("\n", trim($output));
+        $resultLine = end($lines);
+
+        // Prefix lengths are derived from the markers rather than hard-coded
+        $notFoundPrefix = 'CHAPTER_NOT_FOUND:';
+        $resultPrefix = 'RESULT:';
+
+        // Handle the case where the chapter was not found
+        if (strpos($resultLine, $notFoundPrefix) === 0) {
+            $errorData = json_decode(substr($resultLine, strlen($notFoundPrefix)), true);
+
+            if (is_array($errorData) && isset($errorData['message']) && is_string($errorData['message'])) {
+                throw new ChapterNotFoundException($errorData['message']);
+            }
+
+            throw new ChapterNotFoundException('Le chapitre demandé n\'est pas disponible.');
+        }
+
+        // Handle the normal case: a JSON array of image URLs
+        if (strpos($resultLine, $resultPrefix) === 0) {
+            $imageUrls = json_decode(substr($resultLine, strlen($resultPrefix)), true);
+
+            if (!is_array($imageUrls)) {
+                throw new \RuntimeException('Failed to parse Puppeteer output');
+            }
+
+            return $this->cleanImageUrls($imageUrls);
+        }
+
+        // Unrecognised output format
+        throw new \RuntimeException('Invalid Puppeteer output format: ' . $resultLine);
+    }
+
+    /**
+     * Clean every URL and keep only the non-empty, syntactically valid ones.
+     *
+     * Non-string entries of the decoded JSON are discarded up front (they
+     * would otherwise hit the string type of cleanImageUrl()), and the result
+     * is reindexed so that it is a gap-free list.
+     *
+     * @param array $urls raw values decoded from the script output
+     *
+     * @return string[] list of valid image URLs, keys 0..n-1
+     */
+    private function cleanImageUrls(array $urls): array
+    {
+        $cleaned = array_map(
+            fn(string $url): string => $this->cleanImageUrl($url),
+            array_filter($urls, 'is_string')
+        );
+
+        return array_values(array_filter(
+            $cleaned,
+            fn($url) => !empty($url) && filter_var($url, FILTER_VALIDATE_URL)
+        ));
+    }
+
+    /**
+     * Trim the URL and strip ASCII control characters from it.
+     */
+    private function cleanImageUrl(string $url): string
+    {
+        return preg_replace('/[\x00-\x1F\x7F]/', '', trim($url));
+    }
+}
diff --git a/src/Domain/Scraping/Infrastructure/Service/ScraperFactory.php b/src/Domain/Scraping/Infrastructure/Service/ScraperFactory.php
new file mode 100644
index 0000000..8bcf116
--- /dev/null
+++ b/src/Domain/Scraping/Infrastructure/Service/ScraperFactory.php
@@ -0,0 +1,146 @@
+ HtmlScraper::class,
+        'advanced_html' => AdvancedHtmlScraper::class,
+        'javascript' => JavaScriptScraper::class,
+    ];
+
+    // Lower number = higher priority (see getBestScraper())
+    private const SCRAPER_PRIORITIES = [
+        'javascript' => 1,      // Most powerful at getting around protections
+        'advanced_html' => 2,   // Good trade-off between performance and effectiveness
+        'html' => 3,            // Simplest and fastest
+    ];
+
+    // Scraper instances keyed by type, filled by initializeScrapers()
+    private array $scrapers = [];
+
+    public function __construct(
+        private readonly ImageDownloaderInterface $imageDownloader,
+        private readonly MessageBusInterface $eventBus,
+        private readonly HttpClientInterface $httpClient,
+        private readonly string $projectDir
+    ) {
+        $this->initializeScrapers();
+    }
+
+    /**
+     * Return the scraper registered for a specific type.
+     *
+     * @throws \InvalidArgumentException when no scraper exists for $type
+     */
+    public function createScraper(string $type): ScraperInterface
+    {
+        if (!isset($this->scrapers[$type])) {
+            throw new \InvalidArgumentException("Scraper type '{$type}' is not supported");
+        }
+
+        return $this->scrapers[$type];
+    }
+
+    /**
+     * Return the scraper whose type has the lowest value in SCRAPER_PRIORITIES.
+     */
+    public function getBestScraper(): ScraperInterface
+    {
+        $sortedTypes = array_keys(self::SCRAPER_PRIORITIES);
+        usort($sortedTypes, fn($a, $b) => self::SCRAPER_PRIORITIES[$a] <=> self::SCRAPER_PRIORITIES[$b]);
+
+        return $this->scrapers[$sortedTypes[0]];
+    }
+
+    /**
+     * Get all available scrapers
+     */
+    public function getAvailableScrapers(): array
+    {
+        // The type => instance map populated by initializeScrapers()
+        return $this->scrapers;
+    }
+
+    /**
+     * List the type identifiers declared in SCRAPER_TYPES.
+     */
+    public function getSupportedTypes(): array
+    {
+        return array_keys(self::SCRAPER_TYPES);
+    }
+
+    /**
+     * Tell whether a scraper type is declared in SCRAPER_TYPES.
+     */
+    public function isSupported(string $type): bool
+    {
+        return isset(self::SCRAPER_TYPES[$type]);
+    }
+
+    /**
+     * Return the fallback scraper, i.e. the one registered as 'html'.
+     */
+    public function getFallbackScraper(): ScraperInterface
+    {
+        return $this->scrapers['html'];
+    }
+
+    /**
+     * Return the scraper of the preferred type when that type is supported,
+     * otherwise the fallback scraper. No scraping is attempted here; the
+     * choice depends on the type name only.
+     */
+    public function getScraperWithFallback(string $preferredType): ScraperInterface
+    {
+        return $this->isSupported($preferredType)
+            ? $this->scrapers[$preferredType]
+            : $this->getFallbackScraper();
+    }
+
+    /**
+     * Summarise the factory state: number of scrapers, supported types,
+     * priority table and the class names of the best and fallback scrapers.
+     */
+    public function getScraperStats(): array
+    {
+        $bestScraper = $this->getBestScraper();
+        $fallbackScraper = $this->getFallbackScraper();
+
+        return [
+            'total_scrapers' => count($this->scrapers),
+            'supported_types' => $this->getSupportedTypes(),
+            'priorities' => self::SCRAPER_PRIORITIES,
+            'best_scraper' => $bestScraper::class,
+            'fallback_scraper' => $fallbackScraper::class
+        ];
+    }
+
+    /**
+     * Instantiate one scraper per entry of SCRAPER_TYPES and store it under
+     * its type key.
+     */
+    private function initializeScrapers(): void
+    {
+        foreach (array_keys(self::SCRAPER_TYPES) as $scraperType) {
+            $scraperClass = self::SCRAPER_TYPES[$scraperType];
+            $this->scrapers[$scraperType] = $this->createScraperInstance($scraperClass);
+        }
+    }
+
+    /**
+     * Build a scraper of the given class with the dependencies it needs.
+     *
+     * @throws \InvalidArgumentException for a class this factory cannot build
+     */
+    private function createScraperInstance(string $class): ScraperInterface
+    {
+        if ($class === HtmlScraper::class) {
+            return new HtmlScraper($this->imageDownloader, $this->eventBus, $this->httpClient);
+        }
+
+        if ($class === AdvancedHtmlScraper::class) {
+            return new AdvancedHtmlScraper($this->httpClient);
+        }
+
+        if ($class === JavaScriptScraper::class) {
+            return new JavaScriptScraper($this->projectDir);
+        }
+
+        throw new \InvalidArgumentException("Unknown scraper class: {$class}");
+    }
+}