feat: ajout d'une nouvelle infrastructure de scraping avec des scrapers pour HTML, HTML avancé et JavaScript, ainsi qu'une factory pour gérer leur création et leur sélection. Mise à jour des gestionnaires de commandes pour intégrer cette nouvelle architecture et améliorer la gestion des erreurs lors du scraping des chapitres.
This commit is contained in:
parent
cbb62989d4
commit
b456f9304d
@@ -93,6 +93,29 @@ services:
|
||||
arguments:
|
||||
$scraperFactory: '@App\Service\Scraper\ScraperFactory'
|
||||
|
||||
# New Scrapers Factory for Domain Layer
|
||||
App\Domain\Scraping\Infrastructure\Service\ScraperFactory:
|
||||
arguments:
|
||||
$projectDir: '%kernel.project_dir%'
|
||||
|
||||
# Scraper Factory Interface alias
|
||||
App\Domain\Scraping\Domain\Contract\Service\ScraperFactoryInterface:
|
||||
alias: App\Domain\Scraping\Infrastructure\Service\ScraperFactory
|
||||
|
||||
# Test Scraper Configuration Handler
|
||||
App\Domain\Scraping\Application\CommandHandler\TestScraperConfigurationHandler: ~
|
||||
|
||||
# JavaScript Scraper
|
||||
App\Domain\Scraping\Infrastructure\Service\Scraper\JavaScriptScraper:
|
||||
arguments:
|
||||
$projectDir: '%kernel.project_dir%'
|
||||
|
||||
# Advanced HTML Scraper
|
||||
App\Domain\Scraping\Infrastructure\Service\Scraper\AdvancedHtmlScraper: ~
|
||||
|
||||
# Scrape Chapter Handler
|
||||
App\Domain\Scraping\Application\CommandHandler\ScrapeChapterHandler: ~
|
||||
|
||||
App\Domain\Scraping\Infrastructure\CommandHandler\SymfonyScrapeChapterHandler:
|
||||
tags:
|
||||
- { name: messenger.message_handler, bus: command.bus }
|
||||
|
||||
520
public/puppeteer-scraper.js
Normal file
520
public/puppeteer-scraper.js
Normal file
@@ -0,0 +1,520 @@
|
||||
const puppeteer = require('puppeteer');
|
||||
|
||||
// Default configuration shared by every scraping run.
const CONFIG = {
  // Timeouts, in milliseconds.
  PAGE_TIMEOUT: 30000,
  NAVIGATION_TIMEOUT: 10000,
  // Pause between two scroll steps (ms) and amount passed to window.scrollBy per step.
  SCROLL_DELAY: 100,
  SCROLL_DISTANCE: 100,
  // Shorter timeout used for the quick error-detection attempt.
  ERROR_DETECTION_TIMEOUT: 5000,

  // User agents rotated to work around bot detection. Only the platform
  // token differs between entries; the engine/browser suffix is shared.
  USER_AGENTS: [
    'Windows NT 10.0; Win64; x64',
    'Macintosh; Intel Mac OS X 10_15_7',
    'X11; Linux x86_64',
  ].map(
    (platform) =>
      `Mozilla/5.0 (${platform}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36`,
  ),

  // Browser launch arguments intended to work around bot detection.
  BROWSER_ARGS: [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-accelerated-2d-canvas',
    '--no-first-run',
    '--no-zygote',
    '--single-process',
    '--disable-gpu',
    '--disable-web-security',
    '--disable-features=VizDisplayCompositor',
    '--disable-blink-features=AutomationControlled',
  ],
};
|
||||
|
||||
/**
 * Raised when the chapter selector is present on the page but none of its
 * options corresponds to the requested chapter.
 */
class ChapterNotFoundError extends Error {
  // Overrides the inherited Error name so the error type is visible in logs.
  name = 'ChapterNotFoundError';

  /**
   * @param {string|number} chapterNumber - The chapter that could not be found.
   */
  constructor(chapterNumber) {
    super(`Chapter ${chapterNumber} not found`);
    this.chapterNumber = chapterNumber;
  }
}
|
||||
|
||||
/**
 * Drives a headless Chrome/Chromium instance through Puppeteer and collects
 * the image URLs of a chapter page.
 *
 * Two reading layouts are supported:
 *  - vertical: every image is on one page (`scrapeVertical`);
 *  - horizontal: one image per page, followed through a "next" link
 *    (`scrapeHorizontal`).
 *
 * Options come from the constructor argument and are overridden by
 * `--key=value` command-line arguments (see `parseArguments`).
 */
class PuppeteerScraper {
  /**
   * @param {Object} [options] - Default option values; command-line arguments win.
   */
  constructor(options = {}) {
    this.options = this.parseArguments(options);
    this.browser = null;
    this.page = null;
    this.imageUrls = new Set();
    this.lastResponseStatus = null;
    this.navigationError = null;
  }

  /**
   * Merge `--key=value` command-line arguments into a copy of `options`.
   *
   * Dashes in keys become underscores (`--image-selector` -> `image_selector`).
   * The values `true` / `false` are converted to booleans. Only the FIRST `=`
   * separates key from value, so values that themselves contain `=` (typically
   * URLs with a query string) are preserved in full. A flag without `=` is
   * stored with the value `undefined`.
   *
   * @param {Object} options - Default option values (not mutated).
   * @returns {Object} The merged options.
   */
  parseArguments(options) {
    const parsed = { ...options };

    for (const arg of process.argv.slice(2)) {
      if (!arg.startsWith('--')) {
        continue;
      }

      const body = arg.substring(2);
      const separatorIndex = body.indexOf('=');
      const key = separatorIndex === -1 ? body : body.slice(0, separatorIndex);
      const rawValue = separatorIndex === -1 ? undefined : body.slice(separatorIndex + 1);

      let value = rawValue;
      if (rawValue === 'true') {
        value = true;
      } else if (rawValue === 'false') {
        value = false;
      }

      parsed[key.replace(/-/g, '_')] = value;
    }

    return parsed;
  }

  /**
   * Tell whether a boolean-like option is switched on.
   *
   * `parseArguments` turns the text `true` into the boolean `true`, while
   * options handed to the constructor may still carry the string, so both
   * representations are accepted.
   *
   * @param {*} value - Raw option value.
   * @returns {boolean}
   */
  isEnabled(value) {
    return value === true || value === 'true';
  }

  /**
   * Tell whether an error was produced by an HTTP 4xx/5xx response, based on
   * the `HTTP <status>` prefix written by `navigateToPage`.
   *
   * @param {Error} error
   * @returns {boolean}
   */
  isHttpError(error) {
    const message = error && error.message ? String(error.message) : '';
    return message.includes('HTTP 4') || message.includes('HTTP 5');
  }

  /**
   * Build the result object returned when the chapter does not exist.
   *
   * @param {ChapterNotFoundError} error
   * @returns {{error: string, message: string, images: Array}}
   */
  buildChapterNotFoundResult(error) {
    console.log(`📚 MANGA_EXISTS_BUT_CHAPTER_NOT_FOUND: ${error.message}`);
    return {
      error: 'CHAPTER_NOT_FOUND',
      message: `Le manga existe mais le chapitre ${error.chapterNumber} n'est pas disponible.`,
      images: []
    };
  }

  /**
   * Start the browser, open a page and apply the anti-detection setup.
   *
   * The first existing path among `CHROME_BIN` and a list of common install
   * locations is used as the executable; when none exists, Puppeteer is left
   * to pick its own browser.
   */
  async launch() {
    const fs = require('fs');

    const candidatePaths = [
      process.env.CHROME_BIN,
      '/usr/bin/chromium',
      '/usr/bin/chromium-browser',
      '/usr/bin/google-chrome',
      '/usr/bin/google-chrome-stable',
      '/snap/bin/chromium'
    ].filter(Boolean); // drop unset entries

    const executablePath = candidatePaths.find((candidate) => fs.existsSync(candidate));
    if (executablePath) {
      console.log(`Using Chrome at: ${executablePath}`);
    }

    const launchOptions = {
      headless: 'new',
      args: CONFIG.BROWSER_ARGS
    };
    if (executablePath) {
      // Only set the key when a binary was found so Puppeteer's default applies otherwise.
      launchOptions.executablePath = executablePath;
    }

    this.browser = await puppeteer.launch(launchOptions);
    this.page = await this.browser.newPage();

    await this.setupAntiDetection();

    console.log('Browser launched and configured');
  }

  /**
   * Configure the page: random user agent, HTTP error listener, font
   * blocking, masking of automation-related navigator properties and a
   * randomized viewport.
   */
  async setupAntiDetection() {
    // User-agent rotation.
    const userAgent = CONFIG.USER_AGENTS[Math.floor(Math.random() * CONFIG.USER_AGENTS.length)];
    await this.page.setUserAgent(userAgent);

    // Record the status of navigation responses so HTTP errors are noticed quickly.
    this.page.on('response', (response) => {
      const request = response.request();

      // Only the main frame counts: a failing iframe (ads, widgets) must not
      // mark the chapter page itself as broken.
      if (!request.isNavigationRequest() || request.frame() !== this.page.mainFrame()) {
        return;
      }

      this.lastResponseStatus = response.status();

      if (response.status() >= 400) {
        this.navigationError = {
          status: response.status(),
          statusText: response.statusText(),
          url: response.url()
        };
        console.log(`❌ HTTP Error ${response.status()} detected for: ${response.url()}`);
      }
    });

    // Block font downloads; let every other request through.
    await this.page.setRequestInterception(true);
    this.page.on('request', (request) => {
      const outcome = request.resourceType() === 'font' ? request.abort() : request.continue();
      // abort()/continue() return promises; never leave a rejection unhandled.
      outcome.catch((error) => {
        console.log(`⚠️ Request interception failed: ${error.message}`);
      });
    });

    // Mask properties commonly used to detect automation.
    await this.page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
      Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
      Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
      window.chrome = { runtime: {} };
    });

    // Randomized viewport.
    await this.page.setViewport({
      width: 1366 + Math.floor(Math.random() * 200),
      height: 768 + Math.floor(Math.random() * 200)
    });
  }

  /**
   * Navigate to `url` and fail fast on HTTP errors.
   *
   * @param {string} url - Page to open.
   * @param {boolean} [useReducedTimeout=false] - Use the short error-detection
   *   timeout instead of the full page timeout.
   * @returns {Promise<Object|null>} The Puppeteer response returned by `goto`.
   * @throws {Error} With an `HTTP <status>` prefix for 4xx/5xx responses, a
   *   `Navigation timeout` message on timeout, or the original error otherwise.
   */
  async navigateToPage(url, useReducedTimeout = false) {
    // Reset the error-detection state left by a previous navigation.
    this.lastResponseStatus = null;
    this.navigationError = null;

    const timeout = useReducedTimeout ? CONFIG.ERROR_DETECTION_TIMEOUT : CONFIG.PAGE_TIMEOUT;

    try {
      console.log(`🌐 Navigating to: ${url}`);

      const response = await this.page.goto(url, {
        waitUntil: 'domcontentloaded', // resolves earlier than networkidle2, which suits error detection
        timeout
      });

      if (response && response.status() >= 400) {
        throw new Error(`HTTP ${response.status()}: ${response.statusText()} for URL: ${url}`);
      }

      if (this.navigationError) {
        throw new Error(`Navigation error: ${this.navigationError.status} ${this.navigationError.statusText}`);
      }

      // Short grace period so resources can start loading.
      await new Promise((resolve) => setTimeout(resolve, 1000));

      console.log(`✅ Successfully loaded: ${url}`);
      return response;
    } catch (error) {
      if (this.isHttpError(error)) {
        console.log(`🚫 HTTP Error detected quickly: ${error.message}`);
        throw error;
      }

      if (error.name === 'TimeoutError' || String(error.message).includes('timeout')) {
        console.log(`⏱️ Navigation timeout for: ${url}`);
        throw new Error(`Navigation timeout after ${timeout}ms for URL: ${url}`, { cause: error });
      }

      console.log(`❌ Navigation error: ${error.message}`);
      throw error;
    }
  }

  /**
   * Navigate with the reduced timeout first and, unless an HTTP error was
   * confirmed, retry once with the full timeout.
   *
   * @param {string} url
   * @returns {Promise<Object|null>} The Puppeteer response.
   */
  async navigateToPageWithFallback(url) {
    try {
      return await this.navigateToPage(url, true);
    } catch (error) {
      if (this.isHttpError(error)) {
        // Confirmed HTTP error: retrying would not help.
        throw error;
      }

      console.log(`🔄 Quick check failed, retrying with full timeout...`);
      return await this.navigateToPage(url, false);
    }
  }

  /**
   * Pick the `<option>` that designates the requested chapter.
   *
   * The chapter number is matched as a whole number so that chapter `1` does
   * not match `Chapter 10`, `Chapter 21` or `Chapter 1.5`; leading zeros and a
   * trailing `.0` are tolerated. Candidates are looked up in this order:
   *  1. text where the number follows `chapitre` / `chapter` / `ch.`;
   *  2. text containing the number as a standalone token;
   *  3. an option whose value is exactly the chapter number.
   *
   * @param {Array<{value: string, text: string}>} options - Options read from the select element.
   * @param {string|number} chapterNumber - Requested chapter.
   * @returns {Object|undefined} The matching option, or `undefined`.
   */
  findChapterOption(options, chapterNumber) {
    const chapterStr = String(chapterNumber).trim();
    if (chapterStr === '') {
      return undefined;
    }

    const escaped = chapterStr.toLowerCase().replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // The number, optional leading zeros / ".0" suffix, not followed by more digits or a decimal part.
    const numberCore = `0*${escaped}(?:\\.0+)?(?!\\d|[.,]\\d)`;
    const keywordPattern = new RegExp(`\\b(?:chapitre|chapter|ch\\.?)\\s*#?\\s*${numberCore}`);
    const tokenPattern = new RegExp(`(?<![\\d.,])${numberCore}`);

    const textOf = (option) => String(option.text || '').toLowerCase();

    return (
      options.find((option) => keywordPattern.test(textOf(option))) ||
      options.find((option) => tokenPattern.test(textOf(option))) ||
      options.find((option) => option.value === chapterStr)
    );
  }

  /**
   * Select the requested chapter in a `<select>` element and wait for the
   * navigation it triggers.
   *
   * Failures other than a missing chapter are logged and ignored so scraping
   * can continue on the current page.
   *
   * @param {string} chapterSelector - CSS selector of the `<select>` element.
   * @param {string|number} chapterNumber - Requested chapter.
   * @throws {ChapterNotFoundError} When no option matches the chapter.
   */
  async selectChapter(chapterSelector, chapterNumber) {
    try {
      console.log(`📚 Looking for chapter selector: ${chapterSelector}`);

      await this.page.waitForSelector(chapterSelector, { timeout: CONFIG.NAVIGATION_TIMEOUT });

      const options = await this.page.$$eval(`${chapterSelector} option`, (opts) =>
        opts.map((opt) => ({
          value: opt.value,
          text: opt.textContent.trim(),
          selected: opt.selected
        }))
      );

      console.log(`📖 Found ${options.length} chapter options`);

      const targetOption = this.findChapterOption(options, chapterNumber);
      if (!targetOption) {
        throw new ChapterNotFoundError(chapterNumber);
      }

      console.log(`🎯 Found target chapter: ${targetOption.text} (value: ${targetOption.value})`);

      // Start listening BEFORE triggering the change so a fast navigation is
      // not missed. The rejection is converted into a value to keep the
      // promise from ever being an unhandled rejection.
      const navigationOutcome = this.page
        .waitForNavigation({
          waitUntil: 'domcontentloaded',
          timeout: CONFIG.ERROR_DETECTION_TIMEOUT
        })
        .then(() => null, (navigationError) => navigationError);

      await this.page.select(chapterSelector, targetOption.value);
      console.log('✅ Chapter selected, waiting for page reload...');

      const navigationFailure = await navigationOutcome;
      if (navigationFailure) {
        console.log(`⚠️ Warning during chapter navigation: ${navigationFailure.message}`);
        // Give the page some time even though no navigation was observed.
        await new Promise((resolve) => setTimeout(resolve, 2000));
      } else {
        console.log('🔄 Page reloaded after chapter selection');
      }
    } catch (error) {
      if (error instanceof ChapterNotFoundError) {
        // Let the caller report the missing chapter.
        throw error;
      }
      // Any other failure is non-fatal: keep going with the current page.
      console.log(`⚠️ Error selecting chapter: ${error.message}`);
    }
  }

  /**
   * Scrape a vertical reader (all images on a single page).
   *
   * @returns {Promise<string[]|Object>} The image URLs; an empty array when the
   *   page answers with an HTTP error; or a `CHAPTER_NOT_FOUND` result object.
   */
  async scrapeVertical() {
    const url = this.options.url;
    const imageSelector = this.options.image_selector;
    const waitForImages = this.isEnabled(this.options.wait_for_images);
    const shouldScroll = this.isEnabled(this.options.scroll);
    const chapterSelector = this.options.chapter_selector;
    const chapterNumber = this.options.chapter_number;

    try {
      await this.navigateToPageWithFallback(url);
    } catch (error) {
      if (this.isHttpError(error)) {
        console.log(`🚫 Cannot access page: ${error.message}`);
        return []; // HTTP errors yield an empty result instead of a failure
      }
      throw error;
    }

    if (chapterSelector && chapterNumber) {
      try {
        await this.selectChapter(chapterSelector, chapterNumber);
      } catch (error) {
        if (error instanceof ChapterNotFoundError) {
          return this.buildChapterNotFoundResult(error);
        }
        throw error;
      }
    }

    if (waitForImages) {
      await this.page.waitForSelector(imageSelector, { timeout: CONFIG.NAVIGATION_TIMEOUT });
    }

    // Scroll through the page so lazy-loaded images get their URLs.
    if (shouldScroll) {
      await this.autoScroll();
    }

    // Extra delay for lazy-loaded images.
    await new Promise((resolve) => setTimeout(resolve, 3000));

    const imageUrls = await this.page.$$eval(imageSelector, (imgs) => {
      return imgs.map((img) => {
        // `src` first, then the usual lazy-loading data attributes.
        return img.src ||
          img.getAttribute('src') ||
          img.getAttribute('data-src') ||
          img.getAttribute('data-lazy-src') ||
          img.getAttribute('data-original');
      }).filter((candidate) => candidate && candidate !== 'about:blank');
    });

    console.log(`Found ${imageUrls.length} images`);
    return imageUrls;
  }

  /**
   * Scrape a horizontal reader (one image per page) by following the "next"
   * link until it disappears, points to an already visited URL, or the page
   * limit is reached.
   *
   * @returns {Promise<string[]|Object>} The unique image URLs in discovery
   *   order, or a `CHAPTER_NOT_FOUND` result object.
   */
  async scrapeHorizontal() {
    const url = this.options.url;
    const imageSelector = this.options.image_selector;
    const nextSelector = this.options.next_selector;
    const waitForImages = this.isEnabled(this.options.wait_for_images);
    const chapterSelector = this.options.chapter_selector;
    const chapterNumber = this.options.chapter_number;

    const maxPages = 200; // safety limit
    const visitedUrls = new Set();
    let currentUrl = url;
    let pageCount = 0;

    while (currentUrl && pageCount < maxPages) {
      visitedUrls.add(currentUrl);
      console.log(`Scraping page ${pageCount + 1}: ${currentUrl}`);

      try {
        await this.navigateToPageWithFallback(currentUrl);
      } catch (error) {
        if (this.isHttpError(error)) {
          console.log(`🚫 Cannot access page ${pageCount + 1}: ${error.message}`);
          break; // stop at the first HTTP error
        }
        // Other errors: try to carry on with whatever is loaded.
        console.log(`⚠️ Warning on page ${pageCount + 1}: ${error.message}, continuing...`);
      }

      // The chapter is only selected on the first page.
      if (pageCount === 0 && chapterSelector && chapterNumber) {
        try {
          await this.selectChapter(chapterSelector, chapterNumber);
        } catch (error) {
          if (error instanceof ChapterNotFoundError) {
            return this.buildChapterNotFoundResult(error);
          }
          throw error;
        }
      }

      if (waitForImages) {
        try {
          await this.page.waitForSelector(imageSelector, { timeout: CONFIG.NAVIGATION_TIMEOUT });
        } catch (e) {
          console.log(`No image found on page ${pageCount + 1}, skipping`);
          break;
        }
      }

      const imageUrl = await this.page.$eval(imageSelector, (img) => {
        return img.src ||
          img.getAttribute('src') ||
          img.getAttribute('data-src') ||
          img.getAttribute('data-lazy-src') ||
          img.getAttribute('data-original');
      }).catch(() => null); // no matching element: treat as "no image on this page"

      if (imageUrl) {
        this.imageUrls.add(imageUrl);
        console.log(`Image found: ${imageUrl}`);
      }

      // Without a next selector there is nothing to follow.
      const nextElement = nextSelector ? await this.page.$(nextSelector) : null;
      if (!nextElement) {
        console.log('No next button found, ending scraping');
        break;
      }

      const nextUrl = await nextElement.evaluate((el) => {
        return el.href || el.getAttribute('href');
      });

      if (!nextUrl) {
        console.log('No next URL found, ending scraping');
        break;
      }

      // A "next" link that leads back to a visited page would loop until maxPages.
      if (visitedUrls.has(nextUrl)) {
        console.log('Next URL already visited, ending scraping');
        break;
      }

      currentUrl = nextUrl;
      pageCount++;
      await new Promise((resolve) => setTimeout(resolve, 1000)); // pause between pages
    }

    return Array.from(this.imageUrls);
  }

  /**
   * Scroll the page step by step until the bottom is reached and the document
   * height has stopped changing, then jump to the very end.
   */
  async autoScroll() {
    await this.page.evaluate(async (config) => {
      await new Promise((resolve) => {
        let scrolledSinceGrowth = 0;
        let knownHeight = 0;

        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;

          // The document grew: more content appeared, so start counting again.
          if (scrollHeight !== knownHeight) {
            knownHeight = scrollHeight;
            scrolledSinceGrowth = 0;
          }

          window.scrollBy(0, config.SCROLL_DISTANCE);
          scrolledSinceGrowth += config.SCROLL_DISTANCE;

          // Stop once a full document height was scrolled without new content.
          if (scrolledSinceGrowth >= scrollHeight) {
            clearInterval(timer);
            window.scrollTo(0, document.body.scrollHeight);
            resolve();
          }
        }, config.SCROLL_DELAY);
      });
    }, { SCROLL_DISTANCE: CONFIG.SCROLL_DISTANCE, SCROLL_DELAY: CONFIG.SCROLL_DELAY });
  }

  /**
   * Close the browser if it was launched.
   */
  async close() {
    if (this.browser) {
      await this.browser.close();
    }
  }
}
|
||||
|
||||
// Command-line entry point.
//
// Runs the scraper in the mode given by --mode and prints exactly one
// machine-readable line on stdout:
//   RESULT:<json array of image URLs>
//   CHAPTER_NOT_FOUND:<json object>
// Any other failure is reported on stderr and the process exits with code 1.
(async () => {
  const scraper = new PuppeteerScraper();
  let exitCode = 0;

  try {
    await scraper.launch();

    const { mode } = scraper.options;
    let result = [];

    if (mode === 'vertical') {
      result = await scraper.scrapeVertical();
    } else if (mode === 'horizontal') {
      result = await scraper.scrapeHorizontal();
    } else {
      throw new Error('Invalid mode. Use --mode=vertical or --mode=horizontal');
    }

    // The scrape methods return either an array of URLs or an error object.
    if (result && typeof result === 'object' && result.error === 'CHAPTER_NOT_FOUND') {
      console.log(`CHAPTER_NOT_FOUND:${JSON.stringify(result)}`);
    } else {
      // Normal case: keep only non-empty string URLs.
      const imageUrls = Array.isArray(result) ? result : [];
      const cleanUrls = imageUrls.filter((url) => url && typeof url === 'string');
      console.log(`RESULT:${JSON.stringify(cleanUrls)}`);
    }
  } catch (error) {
    if (error instanceof ChapterNotFoundError) {
      // Normally handled inside the scrape methods; kept as a safety net.
      console.log(`CHAPTER_NOT_FOUND:${JSON.stringify({
        error: 'CHAPTER_NOT_FOUND',
        message: `Le manga existe mais le chapitre ${error.chapterNumber} n'est pas disponible.`,
        images: []
      })}`);
    } else {
      console.error('Error:', error.message);
      // Do NOT call process.exit() here: it would terminate the process
      // before the finally block runs and leave the browser unclosed.
      exitCode = 1;
    }
  } finally {
    try {
      await scraper.close();
    } catch (closeError) {
      // The result line has already been written; a failed shutdown must not hide it.
      console.error('Error closing browser:', closeError.message);
    }
  }

  if (exitCode !== 0) {
    process.exit(exitCode);
  }
})();
|
||||
@@ -9,6 +9,7 @@ use App\Domain\Scraping\Domain\Contract\Repository\SourceRepositoryInterface;
|
||||
use App\Domain\Scraping\Domain\Contract\Service\CbzGeneratorInterface;
|
||||
use App\Domain\Scraping\Domain\Contract\Service\ImageDownloaderInterface;
|
||||
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
|
||||
use App\Domain\Scraping\Domain\Contract\Service\ScraperFactoryInterface;
|
||||
use App\Domain\Scraping\Domain\Event\ChapterScraped;
|
||||
use App\Domain\Scraping\Domain\Event\ChapterScrapingFailed;
|
||||
use App\Domain\Scraping\Domain\Model\Chapter;
|
||||
@@ -25,7 +26,7 @@ use Doctrine\ORM\EntityManagerInterface;
|
||||
readonly class ScrapeChapterHandler
|
||||
{
|
||||
public function __construct(
|
||||
private ScraperInterface $scraper,
|
||||
private ScraperFactoryInterface $scraperFactory,
|
||||
private ImageDownloaderInterface $imageDownloader,
|
||||
private CbzGeneratorInterface $cbzGenerator,
|
||||
private JobRepositoryInterface $jobRepository,
|
||||
@@ -87,13 +88,19 @@ readonly class ScrapeChapterHandler
|
||||
$this->entityManager->beginTransaction();
|
||||
|
||||
// 5. Scraping des URLs avec le slug courant
|
||||
$scrapingParameters = $source->getScrappingParameters();
|
||||
$scrapingParameters['chapterNumber'] = $chapter->chapterNumber;
|
||||
$scrapingType = $scrapingParameters['scrapingType'] ?? 'html';
|
||||
|
||||
$scrapingRequest = new ScrapingRequest(
|
||||
'html',
|
||||
$scrapingType,
|
||||
$source->buildChapterUrl($slug, $chapter->chapterNumber),
|
||||
$source->getScrappingParameters()
|
||||
$scrapingParameters
|
||||
);
|
||||
|
||||
$scrapingResult = $this->scraper->scrape($scrapingRequest);
|
||||
// Sélection du scraper approprié selon le type
|
||||
$scraper = $this->scraperFactory->getScraperWithFallback($scrapingType);
|
||||
$scrapingResult = $scraper->scrape($scrapingRequest);
|
||||
|
||||
// 6. Téléchargement des images
|
||||
$tempDir = new TempDirectory();
|
||||
@@ -134,6 +141,8 @@ readonly class ScrapeChapterHandler
|
||||
break;
|
||||
|
||||
} catch (\Exception $e) {
|
||||
dump('EXCEPTION for source ' . $source->getName() . ' with slug ' . $slug . ': ' . $e->getMessage());
|
||||
|
||||
$this->entityManager->rollback();
|
||||
|
||||
if (isset($job)) {
|
||||
@@ -184,6 +193,11 @@ readonly class ScrapeChapterHandler
|
||||
if ($source) {
|
||||
$preferredSources[] = $source;
|
||||
}
|
||||
|
||||
// Limiter à 3 sources préférées maximum
|
||||
if (count($preferredSources) >= 3) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!empty($preferredSources)) {
|
||||
|
||||
@@ -5,46 +5,43 @@ namespace App\Domain\Scraping\Application\CommandHandler;
|
||||
use App\Domain\Scraping\Application\Command\TestScraperConfiguration;
|
||||
use App\Domain\Scraping\Application\Response\TestScraperConfigurationResponse;
|
||||
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
|
||||
use App\Domain\Scraping\Domain\Contract\Service\ScraperFactoryInterface;
|
||||
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest;
|
||||
|
||||
readonly class TestScraperConfigurationHandler
|
||||
{
|
||||
public function __construct(
|
||||
private ScraperInterface $scraper
|
||||
private ScraperFactoryInterface $scraperFactory
|
||||
) {}
|
||||
|
||||
public function handle(TestScraperConfiguration $command): TestScraperConfigurationResponse
|
||||
{
|
||||
// Construction des paramètres de scraping depuis les données de la commande
|
||||
$scrapingParameters = [
|
||||
'imageSelector' => $command->imageSelector,
|
||||
'nextPageSelector' => $command->nextPageSelector,
|
||||
'chapterUrlFormat' => $command->chapterUrlFormat,
|
||||
'scrapingType' => $command->scrapingType,
|
||||
'chapterSelector' => $command->chapterSelector,
|
||||
'chapterNumber' => $command->chapterNumber
|
||||
];
|
||||
// Vérification que le scraper supporte le type de scraping
|
||||
if (!$this->scraperFactory->isSupported($command->scrapingType)) {
|
||||
return $this->tryWithFallbackScrapers($command, $scrapingParameters);
|
||||
}
|
||||
|
||||
// Essayer avec le scraper demandé
|
||||
try {
|
||||
// Construction des paramètres de scraping depuis les données de la commande
|
||||
$scrapingParameters = [
|
||||
'imageSelector' => $command->imageSelector,
|
||||
'nextPageSelector' => $command->nextPageSelector,
|
||||
'chapterUrlFormat' => $command->chapterUrlFormat,
|
||||
'scrapingType' => $command->scrapingType,
|
||||
'chapterSelector' => $command->chapterSelector
|
||||
];
|
||||
|
||||
// Vérification que le scraper supporte le type de scraping
|
||||
if (!$this->scraper->supports($command->scrapingType)) {
|
||||
return TestScraperConfigurationResponse::failure(
|
||||
$command->testUrl,
|
||||
$command->scrapingType,
|
||||
["Type de scraping '{$command->scrapingType}' non supporté"]
|
||||
);
|
||||
}
|
||||
|
||||
// Création de la requête de scraping avec l'URL de test fournie directement
|
||||
$scraper = $this->scraperFactory->createScraper($command->scrapingType);
|
||||
$scrapingRequest = new ScrapingRequest(
|
||||
$command->scrapingType,
|
||||
$command->testUrl,
|
||||
$scrapingParameters
|
||||
);
|
||||
|
||||
// Tentative de scraping
|
||||
$scrapingResult = $this->scraper->scrape($scrapingRequest);
|
||||
$scrapingResult = $scraper->scrape($scrapingRequest);
|
||||
|
||||
|
||||
// Retour du succès avec les URLs trouvées
|
||||
return TestScraperConfigurationResponse::success(
|
||||
$scrapingResult->getImageUrls(),
|
||||
$command->testUrl,
|
||||
@@ -52,17 +49,80 @@ readonly class TestScraperConfigurationHandler
|
||||
);
|
||||
|
||||
} catch (\Exception $e) {
|
||||
// Analyse de l'erreur pour fournir un message plus détaillé
|
||||
$errors = $this->analyzeError($e, $command);
|
||||
|
||||
return TestScraperConfigurationResponse::failure(
|
||||
$command->testUrl,
|
||||
$command->scrapingType,
|
||||
$errors
|
||||
$this->analyzeError($e, $command)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Try every scraper type known to the factory, except the requested one,
 * until one of them scrapes the test URL successfully.
 *
 * @param TestScraperConfiguration $command            The test command (URL and requested scraper type).
 * @param array                    $scrapingParameters Parameters forwarded to each ScrapingRequest.
 * @param \Exception|null          $originalException  Failure of the requested scraper, if it was attempted.
 *
 * @return TestScraperConfigurationResponse Success from the first working scraper,
 *                                          otherwise a failure listing every collected error.
 */
private function tryWithFallbackScrapers(
    TestScraperConfiguration $command,
    array $scrapingParameters,
    ?\Exception $originalException = null
): TestScraperConfigurationResponse {
    $requestedType = $command->scrapingType;

    // Start the error report with the primary failure when there is one.
    $errors = null !== $originalException
        ? [[
            'type' => 'primary_scraper_failed',
            'scraper' => $requestedType,
            'message' => $originalException->getMessage()
        ]]
        : [];

    // Every supported type except the requested one is a fallback candidate.
    $fallbackTypes = array_filter(
        $this->scraperFactory->getSupportedTypes(),
        static fn ($candidateType) => $candidateType !== $requestedType
    );

    // The requested type always heads the list of tried scrapers.
    $attemptedTypes = [$requestedType];

    foreach ($fallbackTypes as $fallbackType) {
        try {
            $fallbackScraper = $this->scraperFactory->createScraper($fallbackType);
            $fallbackResult = $fallbackScraper->scrape(
                new ScrapingRequest($fallbackType, $command->testUrl, $scrapingParameters)
            );

            // An alternative scraper worked: report which one.
            return TestScraperConfigurationResponse::success(
                $fallbackResult->getImageUrls(),
                $command->testUrl,
                $fallbackType,
                "Scraper alternatif utilisé: {$fallbackType} (au lieu de {$requestedType})"
            );
        } catch (\Exception $fallbackFailure) {
            $attemptedTypes[] = $fallbackType;
            $errors[] = [
                'type' => 'fallback_scraper_failed',
                'scraper' => $fallbackType,
                'message' => $fallbackFailure->getMessage()
            ];
        }
    }

    // No scraper succeeded.
    $errors[] = [
        'type' => 'all_scrapers_failed',
        'message' => 'Aucun scraper disponible n\'a réussi à traiter cette URL',
        'tried_scrapers' => $attemptedTypes
    ];

    return TestScraperConfigurationResponse::failure(
        $command->testUrl,
        $requestedType,
        $errors
    );
}
|
||||
|
||||
private function analyzeError(\Exception $e, TestScraperConfiguration $command): array
|
||||
{
|
||||
$errors = [];
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Domain\Contract\Service;
|
||||
|
||||
/**
 * Contract for the factory that creates and selects scrapers.
 */
interface ScraperFactoryInterface
{
    /**
     * Create a scraper for a specific type.
     *
     * @param string $type Scraper type identifier.
     */
    public function createScraper(string $type): ScraperInterface;

    /**
     * Get the most appropriate scraper according to priority.
     */
    public function getBestScraper(): ScraperInterface;

    /**
     * Get the fallback scraper (the simplest one).
     */
    public function getFallbackScraper(): ScraperInterface;

    /**
     * Try several scrapers in cascade until one works.
     *
     * @param string $preferredType Scraper type to try first.
     */
    public function getScraperWithFallback(string $preferredType): ScraperInterface;

    /**
     * Get the supported scraper types.
     *
     * @return array List of supported type identifiers.
     */
    public function getSupportedTypes(): array;

    /**
     * Check whether a scraper type is supported.
     *
     * @param string $type Scraper type identifier.
     */
    public function isSupported(string $type): bool;
}
|
||||
@@ -4,8 +4,8 @@ namespace App\Domain\Scraping\Domain\Exception;
|
||||
|
||||
class ChapterNotFoundException extends \Exception
|
||||
{
|
||||
public function __construct()
|
||||
public function __construct(string $message = 'Chapter not found')
|
||||
{
|
||||
parent::__construct('Chapter not found');
|
||||
parent::__construct($message);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,8 +32,8 @@ class ChapterUrl
|
||||
|
||||
private function validateUrlFormat(string $format): string
|
||||
{
|
||||
if (!str_contains($format, '{slug}') || !str_contains($format, '{chapterNumber}')) {
|
||||
throw new InvalidArgumentException("The URL format must contain both {slug} and {chapterNumber} placeholders.");
|
||||
if (!str_contains($format, '{slug}')) {
|
||||
throw new InvalidArgumentException("The URL format must contain {slug} placeholder.");
|
||||
}
|
||||
|
||||
return $format;
|
||||
|
||||
@@ -0,0 +1,252 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
|
||||
|
||||
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
|
||||
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest;
|
||||
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingResult;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
use Symfony\Contracts\HttpClient\HttpClientInterface;
|
||||
use Symfony\Component\HttpClient\HttpClient;
|
||||
|
||||
/**
 * HTML scraper that fetches chapter pages over HTTP with randomized,
 * browser-like request headers and retries, then extracts image URLs
 * using the CSS selectors carried by the scraping request.
 *
 * Two reader layouts are handled, selected from the request's scraping parameters:
 *  - vertical: 'nextPageSelector' is empty or absent; every image matching
 *    'imageSelector' on the chapter URL is collected;
 *  - horizontal: 'nextPageSelector' is set; one image is taken per page and the
 *    "next" link is followed until it disappears, loops, or MAX_PAGES is reached.
 */
class AdvancedHtmlScraper implements ScraperInterface
{
    private const USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0',
    ];

    private const ACCEPT_HEADERS = [
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    ];

    private const ACCEPT_LANGUAGE_HEADERS = [
        'en-US,en;q=0.9',
        'en-US,en;q=0.8',
        'en-GB,en;q=0.9',
        'fr-FR,fr;q=0.9,en;q=0.8',
    ];

    /** Attributes probed, in this order, to find an image URL on a matched node. */
    private const IMAGE_URL_ATTRIBUTES = [
        'src',
        'data-src',
        'data-lazy-src',
        'data-original',
        'data-zoom-image',
        'data-full-src',
    ];

    private const RETRY_ATTEMPTS = 3;
    private const RETRY_DELAY = 2; // seconds, multiplied by the attempt number
    private const REQUEST_TIMEOUT = 30;
    private const MAX_PAGES = 200; // safety limit for the horizontal reader
    private const PAGE_DELAY = 1; // seconds to pause between two page requests

    private readonly HttpClientInterface $httpClient;

    /**
     * @param HttpClientInterface|null $httpClient Client used for every request; when null a
     *                                             default client is created.
     */
    public function __construct(?HttpClientInterface $httpClient = null)
    {
        // The property is assigned exactly once here (it is readonly). The fallback client
        // keeps TLS peer/host verification enabled on purpose.
        $this->httpClient = $httpClient ?? HttpClient::create([
            'timeout' => self::REQUEST_TIMEOUT,
        ]);
    }

    /**
     * Scrape the image URLs of the requested chapter.
     *
     * @throws \RuntimeException when fetching or parsing fails (the original exception is chained)
     */
    public function scrape(ScrapingRequest $request): ScrapingResult
    {
        $scrapingParameters = $request->getScrapingParameters();

        try {
            // empty() also covers a missing key without raising a warning.
            $pages = empty($scrapingParameters['nextPageSelector'])
                ? $this->scrapeVerticalReader($request)
                : $this->scrapeHorizontalReader($request);

            return new ScrapingResult($pages, count($pages));
        } catch (\Exception $e) {
            throw new \RuntimeException('Advanced HTML scraping failed: ' . $e->getMessage(), 0, $e);
        }
    }

    /**
     * Tell whether this scraper handles the given source type.
     */
    public function supports(string $sourceType): bool
    {
        return 'advanced_html' === $sourceType;
    }

    /**
     * Collect every image matching 'imageSelector' on the chapter page.
     *
     * @return string[] sequentially indexed list of non-empty image URLs
     */
    private function scrapeVerticalReader(ScrapingRequest $request): array
    {
        $chapterUrl = $request->getChapterUrl();
        $html = $this->fetchHtmlWithRetry($chapterUrl);
        $crawler = new Crawler($html);
        $params = $request->getScrapingParameters();

        $images = $crawler->filter($params['imageSelector'])
            ->each(function (Crawler $node) use ($chapterUrl): string {
                $src = $this->extractImageUrl($node);

                // Nodes without any usable attribute yield '' and are filtered out below.
                return $src === null
                    ? ''
                    : $this->cleanImageUrl($this->resolveRelativeUrl($src, $chapterUrl));
            });

        // Re-index so the result is a plain list without gaps.
        return array_values(array_filter($images, static fn ($url) => !empty($url)));
    }

    /**
     * Walk a one-image-per-page reader by following the 'nextPageSelector' link.
     *
     * @return string[] sequentially indexed list of non-empty image URLs
     */
    private function scrapeHorizontalReader(ScrapingRequest $request): array
    {
        $pages = [];
        $currentUrl = $request->getChapterUrl();
        $params = $request->getScrapingParameters();
        $visitedUrls = []; // URL (without fragment) => true
        $pageCount = 0;

        while ($currentUrl && $pageCount < self::MAX_PAGES) {
            // Guard against navigation loops; the fragment does not change the fetched document.
            $visitKey = explode('#', $currentUrl, 2)[0];
            if (isset($visitedUrls[$visitKey])) {
                break;
            }
            $visitedUrls[$visitKey] = true;

            $html = $this->fetchHtmlWithRetry($currentUrl);
            $crawler = new Crawler($html);

            // Grab the image of the current page.
            $imageNode = $crawler->filter($params['imageSelector'])->first();
            if ($imageNode->count() > 0) {
                $imageUrl = $this->extractImageUrl($imageNode);

                if ($imageUrl !== null) {
                    $pages[] = $this->cleanImageUrl($this->resolveRelativeUrl($imageUrl, $currentUrl));
                }
            }

            // Look for the link to the next page.
            $nextLink = $crawler->filter($params['nextPageSelector'])->first();
            if ($nextLink->count() === 0) {
                break;
            }

            $nextUrl = $nextLink->attr('href');
            if (!$nextUrl) {
                break;
            }

            $currentUrl = $this->resolveRelativeUrl($nextUrl, $currentUrl);

            // Links such as "javascript:void(0)" cannot be fetched; treat them as the end.
            if (!preg_match('/^https?:\/\//i', $currentUrl)) {
                break;
            }

            $pageCount++;

            // Pause between requests to reduce the chance of being detected.
            sleep(self::PAGE_DELAY);
        }

        return array_values(array_filter($pages, static fn ($url) => !empty($url)));
    }

    /**
     * Return the first non-empty image URL attribute of a node, or null when none is set.
     */
    private function extractImageUrl(Crawler $node): ?string
    {
        foreach (self::IMAGE_URL_ATTRIBUTES as $attribute) {
            $value = $node->attr($attribute);

            if (is_string($value) && trim($value) !== '') {
                return $value;
            }
        }

        return null;
    }

    /**
     * Fetch a page, retrying up to RETRY_ATTEMPTS times with an increasing delay.
     *
     * @throws \Exception the last failure once every attempt has failed
     */
    private function fetchHtmlWithRetry(string $url): string
    {
        $lastException = null;

        for ($attempt = 1; $attempt <= self::RETRY_ATTEMPTS; $attempt++) {
            try {
                return $this->fetchHtml($url);
            } catch (\Exception $e) {
                $lastException = $e;

                if ($attempt < self::RETRY_ATTEMPTS) {
                    // Wait longer after each failed attempt before retrying.
                    sleep(self::RETRY_DELAY * $attempt);
                }
            }
        }

        throw $lastException ?? new \RuntimeException('Failed to fetch HTML: no attempt was made');
    }

    /**
     * Perform a single GET request and return the response body.
     *
     * @throws \RuntimeException on HTTP status >= 400, on a detected Cloudflare challenge page,
     *                           or on any client failure (chained as previous exception)
     */
    private function fetchHtml(string $url): string
    {
        try {
            $response = $this->httpClient->request('GET', $url, [
                'headers' => $this->generateHeaders(),
                'timeout' => self::REQUEST_TIMEOUT,
            ]);

            $statusCode = $response->getStatusCode();
            if ($statusCode >= 400) {
                throw new \RuntimeException("HTTP {$statusCode} error for URL: {$url}");
            }

            $content = $response->getContent();

            // Detect a Cloudflare browser-check interstitial served instead of the page.
            if (str_contains($content, 'cf-browser-verification')
                || str_contains($content, 'Checking your browser')) {
                throw new \RuntimeException('Blocked by Cloudflare protection');
            }

            return $content;
        } catch (\Exception $e) {
            throw new \RuntimeException('Failed to fetch HTML: ' . $e->getMessage(), 0, $e);
        }
    }

    /**
     * Build a browser-like header set with a randomly picked User-Agent, Accept and
     * Accept-Language value.
     *
     * @return array<string, string>
     */
    private function generateHeaders(): array
    {
        // 'Accept-Encoding' is intentionally not set: the HTTP client negotiates compression
        // itself and only decodes the body transparently when it owns that header. Forcing
        // "gzip, deflate, br" could hand a still-compressed body to the DOM crawler.
        return [
            'User-Agent' => self::USER_AGENTS[array_rand(self::USER_AGENTS)],
            'Accept' => self::ACCEPT_HEADERS[array_rand(self::ACCEPT_HEADERS)],
            'Accept-Language' => self::ACCEPT_LANGUAGE_HEADERS[array_rand(self::ACCEPT_LANGUAGE_HEADERS)],
            'DNT' => '1',
            'Connection' => 'keep-alive',
            'Upgrade-Insecure-Requests' => '1',
            'Sec-Fetch-Dest' => 'document',
            'Sec-Fetch-Mode' => 'navigate',
            'Sec-Fetch-Site' => 'none',
            'Sec-Fetch-User' => '?1',
            'Cache-Control' => 'max-age=0',
        ];
    }

    /**
     * Resolve a possibly relative reference against a base URL.
     *
     * Handles absolute URLs (any scheme), protocol-relative ("//host/x"), root-relative
     * ("/x"), query-only ("?x"), fragment-only ("#x") and path-relative references.
     * Dot segments ("./", "../") are not normalized.
     *
     * @return string the resolved URL, or the input unchanged when the base cannot be parsed
     */
    private function resolveRelativeUrl(string $url, string $baseUrl): string
    {
        $url = trim($url);

        // Already carries its own scheme (http:, https:, data:, javascript:, ...).
        if ($url === '' || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url)) {
            return $url;
        }

        $parsedBase = parse_url($baseUrl);
        if (!is_array($parsedBase) || !isset($parsedBase['scheme'], $parsedBase['host'])) {
            return $url;
        }

        $scheme = $parsedBase['scheme'];

        // Protocol-relative reference: only the scheme is inherited.
        if (str_starts_with($url, '//')) {
            return $scheme . ':' . $url;
        }

        $port = isset($parsedBase['port']) ? ':' . $parsedBase['port'] : '';
        $origin = $scheme . '://' . $parsedBase['host'] . $port;
        $basePath = $parsedBase['path'] ?? '/';

        // Reference relative to the site root.
        if (str_starts_with($url, '/')) {
            return $origin . $url;
        }

        // Query-only reference keeps the base path.
        if (str_starts_with($url, '?')) {
            return $origin . $basePath . $url;
        }

        // Fragment-only reference keeps the base path and query.
        if (str_starts_with($url, '#')) {
            $query = isset($parsedBase['query']) ? '?' . $parsedBase['query'] : '';

            return $origin . $basePath . $query . $url;
        }

        // Reference relative to the directory of the base path (everything up to the last '/').
        $lastSlash = strrpos($basePath, '/');
        $directory = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);

        return $origin . $directory . $url;
    }

    /**
     * Strip control characters and tracking query parameters from an image URL.
     *
     * Parameters whose name starts with "utm_", "ref" or "source" (case-insensitive) are
     * removed; the remaining query string and any fragment are kept intact.
     */
    private function cleanImageUrl(string $url): string
    {
        // Remove control characters.
        $url = preg_replace('/[\x00-\x1F\x7F]/', '', trim($url)) ?? '';
        if (empty($url)) {
            return '';
        }

        // Set the fragment aside so a '?' inside it is not mistaken for the query.
        $fragment = '';
        $hashPos = strpos($url, '#');
        if ($hashPos !== false) {
            $fragment = substr($url, $hashPos);
            $url = substr($url, 0, $hashPos);
        }

        $queryPos = strpos($url, '?');
        if ($queryPos === false) {
            return $url . $fragment;
        }

        $base = substr($url, 0, $queryPos);
        $pairs = explode('&', substr($url, $queryPos + 1));

        // Drop tracking parameters pair by pair so the remaining query stays well-formed.
        $kept = array_filter($pairs, static function (string $pair): bool {
            if ($pair === '') {
                return false;
            }

            $name = explode('=', $pair, 2)[0];

            return preg_match('/^(utm_|ref|source)/i', $name) !== 1;
        });

        $query = $kept === [] ? '' : '?' . implode('&', $kept);

        return $base . $query . $fragment;
    }
}
|
||||
@@ -0,0 +1,157 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
|
||||
|
||||
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
|
||||
use App\Domain\Scraping\Domain\Exception\ChapterNotFoundException;
|
||||
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest;
|
||||
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingResult;
|
||||
use Symfony\Component\Process\Process;
|
||||
|
||||
/**
 * Scraper that delegates page rendering to a Node.js Puppeteer script and parses the
 * script's standard output.
 *
 * The last line of the script output is expected to be either
 * "RESULT:<json array of image URLs>" or "CHAPTER_NOT_FOUND:<json object>".
 */
class JavaScriptScraper implements ScraperInterface
{
    private const PUPPETEER_TIMEOUT = 60; // seconds
    private const PUPPETEER_SCRIPT_PATH = '/public/puppeteer-scraper.js';
    private const NODE_EXECUTABLE = 'node';
    private const RESULT_PREFIX = 'RESULT:';
    private const CHAPTER_NOT_FOUND_PREFIX = 'CHAPTER_NOT_FOUND:';

    /**
     * @param string $projectDir Directory the Puppeteer script path is appended to
     */
    public function __construct(
        private readonly string $projectDir
    ) {}

    /**
     * Scrape the image URLs of the requested chapter through the Puppeteer script.
     *
     * The horizontal mode is used when 'nextPageSelector' is non-empty in the scraping
     * parameters, the vertical mode otherwise.
     *
     * @throws \RuntimeException when the script is missing, the process fails, or its output
     *                           cannot be interpreted (the original exception is chained)
     */
    public function scrape(ScrapingRequest $request): ScrapingResult
    {
        $scrapingParameters = $request->getScrapingParameters();

        try {
            $scriptPath = $this->projectDir . self::PUPPETEER_SCRIPT_PATH;

            if (!file_exists($scriptPath)) {
                throw new \RuntimeException('Puppeteer script not found at: ' . $scriptPath);
            }

            $imageUrls = !empty($scrapingParameters['nextPageSelector'])
                ? $this->scrapeHorizontalReader($request, $scriptPath)
                : $this->scrapeVerticalReader($request, $scriptPath);

            return new ScrapingResult($imageUrls, count($imageUrls));
        } catch (\Exception $e) {
            // NOTE(review): a ChapterNotFoundException raised while parsing the output is also
            // wrapped here (assuming it extends \Exception — TODO confirm); callers have to
            // inspect getPrevious() to detect it. Confirm whether it should propagate as-is.
            throw new \RuntimeException('JavaScript scraping failed: ' . $e->getMessage(), 0, $e);
        }
    }

    /**
     * Tell whether this scraper handles the given source type.
     */
    public function supports(string $sourceType): bool
    {
        return 'javascript' === $sourceType;
    }

    /**
     * Run the script in vertical mode (scrolling enabled).
     *
     * @return string[] cleaned, valid image URLs
     */
    private function scrapeVerticalReader(ScrapingRequest $request, string $scriptPath): array
    {
        $processArgs = $this->buildProcessArguments($request, $scriptPath, 'vertical', [
            '--wait-for-images=true',
            '--scroll=true',
        ]);

        return $this->executeProcess(new Process($processArgs));
    }

    /**
     * Run the script in horizontal mode, passing the "next page" selector.
     *
     * @return string[] cleaned, valid image URLs
     */
    private function scrapeHorizontalReader(ScrapingRequest $request, string $scriptPath): array
    {
        $params = $request->getScrapingParameters();

        $processArgs = $this->buildProcessArguments($request, $scriptPath, 'horizontal', [
            '--next-selector=' . $params['nextPageSelector'],
            '--wait-for-images=true',
        ]);

        return $this->executeProcess(new Process($processArgs));
    }

    /**
     * Assemble the command line shared by both reader modes.
     *
     * Arguments are passed to Process as an array, so they are never interpreted by a shell.
     *
     * @param string   $mode          value of the --mode option
     * @param string[] $modeArguments mode-specific options inserted after the image selector
     *
     * @return string[]
     */
    private function buildProcessArguments(
        ScrapingRequest $request,
        string $scriptPath,
        string $mode,
        array $modeArguments
    ): array {
        $params = $request->getScrapingParameters();

        $processArgs = [
            self::NODE_EXECUTABLE,
            $scriptPath,
            '--mode=' . $mode,
            '--url=' . $request->getChapterUrl(),
            '--image-selector=' . $params['imageSelector'],
            ...$modeArguments,
        ];

        // Add the chapter parameters when they are available.
        if (!empty($params['chapterSelector'])) {
            $processArgs[] = '--chapter-selector=' . $params['chapterSelector'];
        }

        if (isset($params['chapterNumber'])) {
            $processArgs[] = '--chapter-number=' . $params['chapterNumber'];
        }

        return $processArgs;
    }

    /**
     * Run the process and interpret the last line of its output.
     *
     * @return string[] cleaned, valid image URLs
     *
     * @throws ChapterNotFoundException when the script reports CHAPTER_NOT_FOUND
     * @throws \RuntimeException        when the process fails or its output is not recognized
     */
    private function executeProcess(Process $process): array
    {
        $process->setTimeout(self::PUPPETEER_TIMEOUT);
        $process->run();

        if (!$process->isSuccessful()) {
            $error = $process->getErrorOutput() ?: $process->getOutput();
            throw new \RuntimeException('Puppeteer process failed: ' . $error);
        }

        // Only the last output line carries the result; earlier lines are ignored.
        $lines = explode("\n", trim($process->getOutput()));
        $resultLine = end($lines);

        // The script reported that the chapter does not exist.
        if (str_starts_with($resultLine, self::CHAPTER_NOT_FOUND_PREFIX)) {
            $errorData = json_decode(substr($resultLine, strlen(self::CHAPTER_NOT_FOUND_PREFIX)), true);

            if (is_array($errorData) && isset($errorData['message'])) {
                throw new ChapterNotFoundException($errorData['message']);
            }

            throw new ChapterNotFoundException('Le chapitre demandé n\'est pas disponible.');
        }

        // Normal case: a JSON array of image URLs.
        if (str_starts_with($resultLine, self::RESULT_PREFIX)) {
            $imageUrls = json_decode(substr($resultLine, strlen(self::RESULT_PREFIX)), true);

            if (!is_array($imageUrls)) {
                throw new \RuntimeException('Failed to parse Puppeteer output');
            }

            return $this->cleanImageUrls($imageUrls);
        }

        // Unrecognized output format.
        throw new \RuntimeException('Invalid Puppeteer output format: ' . $resultLine);
    }

    /**
     * Clean every URL and keep only the non-empty ones that pass FILTER_VALIDATE_URL.
     *
     * Non-string entries (e.g. null for an image without source) are skipped instead of
     * triggering a TypeError, and the result is re-indexed as a plain list.
     *
     * @param array $urls decoded JSON values
     *
     * @return string[]
     */
    private function cleanImageUrls(array $urls): array
    {
        $cleaned = [];

        foreach ($urls as $url) {
            if (!is_string($url)) {
                continue;
            }

            $url = $this->cleanImageUrl($url);

            if (!empty($url) && filter_var($url, FILTER_VALIDATE_URL)) {
                $cleaned[] = $url;
            }
        }

        return $cleaned;
    }

    /**
     * Trim the URL and strip ASCII control characters from it.
     */
    private function cleanImageUrl(string $url): string
    {
        // preg_replace() returns null on failure; fall back to an empty string.
        return preg_replace('/[\x00-\x1F\x7F]/', '', trim($url)) ?? '';
    }
}
|
||||
146
src/Domain/Scraping/Infrastructure/Service/ScraperFactory.php
Normal file
146
src/Domain/Scraping/Infrastructure/Service/ScraperFactory.php
Normal file
@@ -0,0 +1,146 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Service;
|
||||
|
||||
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
|
||||
use App\Domain\Scraping\Domain\Contract\Service\ScraperFactoryInterface;
|
||||
use App\Domain\Scraping\Infrastructure\Service\Scraper\HtmlScraper;
|
||||
use App\Domain\Scraping\Infrastructure\Service\Scraper\AdvancedHtmlScraper;
|
||||
use App\Domain\Scraping\Infrastructure\Service\Scraper\JavaScriptScraper;
|
||||
use App\Domain\Scraping\Domain\Contract\Service\ImageDownloaderInterface;
|
||||
use Symfony\Component\Messenger\MessageBusInterface;
|
||||
use Symfony\Contracts\HttpClient\HttpClientInterface;
|
||||
|
||||
/**
 * Builds one instance of every known scraper at construction time and hands
 * them out by type, by priority, or as a fallback.
 */
class ScraperFactory implements ScraperFactoryInterface
{
    /** Scraper type => implementing class. */
    private const SCRAPER_TYPES = [
        'html' => HtmlScraper::class,
        'advanced_html' => AdvancedHtmlScraper::class,
        'javascript' => JavaScriptScraper::class,
    ];

    /** Scraper type => rank; the lowest rank is considered the best scraper. */
    private const SCRAPER_PRIORITIES = [
        'javascript' => 1, // most capable at getting around protections
        'advanced_html' => 2, // good trade-off between performance and effectiveness
        'html' => 3, // simplest and fastest
    ];

    /** @var array<string, ScraperInterface> scraper instances keyed by type */
    private array $scrapers = [];

    public function __construct(
        private readonly ImageDownloaderInterface $imageDownloader,
        private readonly MessageBusInterface $eventBus,
        private readonly HttpClientInterface $httpClient,
        private readonly string $projectDir
    ) {
        $this->initializeScrapers();
    }

    /**
     * Return the scraper registered for a specific type.
     *
     * @throws \InvalidArgumentException when no scraper is registered for the type
     */
    public function createScraper(string $type): ScraperInterface
    {
        return $this->scrapers[$type]
            ?? throw new \InvalidArgumentException("Scraper type '{$type}' is not supported");
    }

    /**
     * Return the scraper whose type has the lowest priority rank.
     */
    public function getBestScraper(): ScraperInterface
    {
        $ranking = self::SCRAPER_PRIORITIES;
        asort($ranking);

        return $this->scrapers[array_key_first($ranking)];
    }

    /**
     * Return every scraper instance, keyed by type.
     */
    public function getAvailableScrapers(): array
    {
        return $this->scrapers;
    }

    /**
     * Return the list of supported scraper type names.
     */
    public function getSupportedTypes(): array
    {
        return array_keys(self::SCRAPER_TYPES);
    }

    /**
     * Tell whether a scraper type is supported.
     */
    public function isSupported(string $type): bool
    {
        return array_key_exists($type, self::SCRAPER_TYPES);
    }

    /**
     * Return the fallback scraper (the plain 'html' one).
     */
    public function getFallbackScraper(): ScraperInterface
    {
        return $this->scrapers['html'];
    }

    /**
     * Return the scraper of the preferred type when that type is supported,
     * otherwise the fallback scraper. No scraping is attempted here.
     */
    public function getScraperWithFallback(string $preferredType): ScraperInterface
    {
        if (!$this->isSupported($preferredType)) {
            // Unknown type: use the default scraper.
            return $this->getFallbackScraper();
        }

        return $this->scrapers[$preferredType];
    }

    /**
     * Return summary information about the registered scrapers.
     */
    public function getScraperStats(): array
    {
        $best = $this->getBestScraper();
        $fallback = $this->getFallbackScraper();

        return [
            'total_scrapers' => count($this->scrapers),
            'supported_types' => $this->getSupportedTypes(),
            'priorities' => self::SCRAPER_PRIORITIES,
            'best_scraper' => $best::class,
            'fallback_scraper' => $fallback::class,
        ];
    }

    /**
     * Instantiate one scraper per entry of SCRAPER_TYPES, preserving the type keys.
     */
    private function initializeScrapers(): void
    {
        $this->scrapers = array_map(
            fn (string $class): ScraperInterface => $this->createScraperInstance($class),
            self::SCRAPER_TYPES
        );
    }

    /**
     * Construct the scraper of the given class with the dependencies it needs.
     *
     * @throws \InvalidArgumentException when the class is not a known scraper
     */
    private function createScraperInstance(string $class): ScraperInterface
    {
        switch ($class) {
            case HtmlScraper::class:
                return new HtmlScraper($this->imageDownloader, $this->eventBus, $this->httpClient);

            case AdvancedHtmlScraper::class:
                return new AdvancedHtmlScraper($this->httpClient);

            case JavaScriptScraper::class:
                return new JavaScriptScraper($this->projectDir);

            default:
                throw new \InvalidArgumentException("Unknown scraper class: {$class}");
        }
    }
}
|
||||
Reference in New Issue
Block a user