feat: ajout d'une nouvelle infrastructure de scraping avec des scrapers pour HTML, HTML avancé et JavaScript, ainsi qu'une factory pour gérer leur création et leur sélection. Mise à jour des gestionnaires de commandes pour intégrer cette nouvelle architecture et améliorer la gestion des erreurs lors du scraping des chapitres.

This commit is contained in:
ext.jeremy.guillot@maxicoffee.domains
2025-07-08 15:30:22 +02:00
parent cbb62989d4
commit b456f9304d
10 changed files with 1244 additions and 36 deletions

View File

@@ -9,6 +9,7 @@ use App\Domain\Scraping\Domain\Contract\Repository\SourceRepositoryInterface;
use App\Domain\Scraping\Domain\Contract\Service\CbzGeneratorInterface;
use App\Domain\Scraping\Domain\Contract\Service\ImageDownloaderInterface;
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
use App\Domain\Scraping\Domain\Contract\Service\ScraperFactoryInterface;
use App\Domain\Scraping\Domain\Event\ChapterScraped;
use App\Domain\Scraping\Domain\Event\ChapterScrapingFailed;
use App\Domain\Scraping\Domain\Model\Chapter;
@@ -25,7 +26,7 @@ use Doctrine\ORM\EntityManagerInterface;
readonly class ScrapeChapterHandler
{
public function __construct(
private ScraperInterface $scraper,
private ScraperFactoryInterface $scraperFactory,
private ImageDownloaderInterface $imageDownloader,
private CbzGeneratorInterface $cbzGenerator,
private JobRepositoryInterface $jobRepository,
@@ -87,13 +88,19 @@ readonly class ScrapeChapterHandler
$this->entityManager->beginTransaction();
// 5. Scraping des URLs avec le slug courant
$scrapingParameters = $source->getScrappingParameters();
$scrapingParameters['chapterNumber'] = $chapter->chapterNumber;
$scrapingType = $scrapingParameters['scrapingType'] ?? 'html';
$scrapingRequest = new ScrapingRequest(
'html',
$scrapingType,
$source->buildChapterUrl($slug, $chapter->chapterNumber),
$source->getScrappingParameters()
$scrapingParameters
);
$scrapingResult = $this->scraper->scrape($scrapingRequest);
// Sélection du scraper approprié selon le type
$scraper = $this->scraperFactory->getScraperWithFallback($scrapingType);
$scrapingResult = $scraper->scrape($scrapingRequest);
// 6. Téléchargement des images
$tempDir = new TempDirectory();
@@ -134,6 +141,8 @@ readonly class ScrapeChapterHandler
break;
} catch (\Exception $e) {
dump('EXCEPTION for source ' . $source->getName() . ' with slug ' . $slug . ': ' . $e->getMessage());
$this->entityManager->rollback();
if (isset($job)) {
@@ -184,6 +193,11 @@ readonly class ScrapeChapterHandler
if ($source) {
$preferredSources[] = $source;
}
// Limiter à 3 sources préférées maximum
if (count($preferredSources) >= 3) {
break;
}
}
if (!empty($preferredSources)) {

View File

@@ -5,46 +5,43 @@ namespace App\Domain\Scraping\Application\CommandHandler;
use App\Domain\Scraping\Application\Command\TestScraperConfiguration;
use App\Domain\Scraping\Application\Response\TestScraperConfigurationResponse;
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
use App\Domain\Scraping\Domain\Contract\Service\ScraperFactoryInterface;
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest;
readonly class TestScraperConfigurationHandler
{
public function __construct(
private ScraperInterface $scraper
private ScraperFactoryInterface $scraperFactory
) {}
public function handle(TestScraperConfiguration $command): TestScraperConfigurationResponse
{
// Construction des paramètres de scraping depuis les données de la commande
$scrapingParameters = [
'imageSelector' => $command->imageSelector,
'nextPageSelector' => $command->nextPageSelector,
'chapterUrlFormat' => $command->chapterUrlFormat,
'scrapingType' => $command->scrapingType,
'chapterSelector' => $command->chapterSelector,
'chapterNumber' => $command->chapterNumber
];
// Vérification que le scraper supporte le type de scraping
if (!$this->scraperFactory->isSupported($command->scrapingType)) {
return $this->tryWithFallbackScrapers($command, $scrapingParameters);
}
// Essayer avec le scraper demandé
try {
// Construction des paramètres de scraping depuis les données de la commande
$scrapingParameters = [
'imageSelector' => $command->imageSelector,
'nextPageSelector' => $command->nextPageSelector,
'chapterUrlFormat' => $command->chapterUrlFormat,
'scrapingType' => $command->scrapingType,
'chapterSelector' => $command->chapterSelector
];
// Vérification que le scraper supporte le type de scraping
if (!$this->scraper->supports($command->scrapingType)) {
return TestScraperConfigurationResponse::failure(
$command->testUrl,
$command->scrapingType,
["Type de scraping '{$command->scrapingType}' non supporté"]
);
}
// Création de la requête de scraping avec l'URL de test fournie directement
$scraper = $this->scraperFactory->createScraper($command->scrapingType);
$scrapingRequest = new ScrapingRequest(
$command->scrapingType,
$command->testUrl,
$scrapingParameters
);
// Tentative de scraping
$scrapingResult = $this->scraper->scrape($scrapingRequest);
$scrapingResult = $scraper->scrape($scrapingRequest);
// Retour du succès avec les URLs trouvées
return TestScraperConfigurationResponse::success(
$scrapingResult->getImageUrls(),
$command->testUrl,
@@ -52,17 +49,80 @@ readonly class TestScraperConfigurationHandler
);
} catch (\Exception $e) {
// Analyse de l'erreur pour fournir un message plus détaillé
$errors = $this->analyzeError($e, $command);
return TestScraperConfigurationResponse::failure(
$command->testUrl,
$command->scrapingType,
$errors
$this->analyzeError($e, $command)
);
}
}
/**
 * Run the test URL through every scraper type the factory supports, except the
 * requested one, and return the first successful result.
 *
 * The returned response is a success as soon as one alternative scraper does not
 * throw; otherwise it is a failure carrying one structured error entry per
 * failed attempt plus a final "all_scrapers_failed" summary.
 *
 * @param TestScraperConfiguration $command            Command holding the test URL and the requested scraping type.
 * @param array                    $scrapingParameters Parameters forwarded unchanged to every ScrapingRequest.
 * @param \Exception|null          $originalException  Failure of the requested scraper, when it was actually run.
 */
private function tryWithFallbackScrapers(
    TestScraperConfiguration $command,
    array $scrapingParameters,
    ?\Exception $originalException = null
): TestScraperConfigurationResponse {
    $errors = [];
    $triedScrapers = [];

    // The requested scraper only counts as "tried" when the caller reports
    // that it ran and failed. When this method is reached because the type is
    // unsupported, it was never executed and must not appear in the report.
    if ($originalException !== null) {
        $errors[] = [
            'type' => 'primary_scraper_failed',
            'scraper' => $command->scrapingType,
            'message' => $originalException->getMessage()
        ];
        $triedScrapers[] = $command->scrapingType;
    }

    foreach ($this->scraperFactory->getSupportedTypes() as $scraperType) {
        if ($scraperType === $command->scrapingType) {
            continue; // Already handled above (or unsupported).
        }

        try {
            $scraper = $this->scraperFactory->createScraper($scraperType);
            $scrapingRequest = new ScrapingRequest(
                $scraperType,
                $command->testUrl,
                $scrapingParameters
            );
            $scrapingResult = $scraper->scrape($scrapingRequest);

            // First alternative that does not throw wins.
            return TestScraperConfigurationResponse::success(
                $scrapingResult->getImageUrls(),
                $command->testUrl,
                $scraperType, // Report the scraper type that actually worked.
                "Scraper alternatif utilisé: {$scraperType} (au lieu de {$command->scrapingType})"
            );
        } catch (\Exception $e) {
            $triedScrapers[] = $scraperType;
            $errors[] = [
                'type' => 'fallback_scraper_failed',
                'scraper' => $scraperType,
                'message' => $e->getMessage()
            ];
        }
    }

    // Every candidate failed: append a summary entry after the per-scraper errors.
    $errors[] = [
        'type' => 'all_scrapers_failed',
        'message' => 'Aucun scraper disponible n\'a réussi à traiter cette URL',
        'tried_scrapers' => $triedScrapers
    ];

    return TestScraperConfigurationResponse::failure(
        $command->testUrl,
        $command->scrapingType,
        $errors
    );
}
private function analyzeError(\Exception $e, TestScraperConfiguration $command): array
{
$errors = [];

View File

@@ -0,0 +1,36 @@
<?php
namespace App\Domain\Scraping\Domain\Contract\Service;
/**
 * Contract for a factory that hands out ScraperInterface implementations,
 * addressed by a scraping-type string.
 */
interface ScraperFactoryInterface
{
/**
 * Return the scraper registered for the given type.
 *
 * NOTE(review): the behavior for an unknown type is not part of this
 * signature; the ScraperFactory implementation throws
 * \InvalidArgumentException in that case.
 */
public function createScraper(string $type): ScraperInterface;
/**
 * Return the scraper the implementation ranks highest by priority.
 */
public function getBestScraper(): ScraperInterface;
/**
 * Return the fallback scraper (the simplest one).
 */
public function getFallbackScraper(): ScraperInterface;
/**
 * Return the scraper for $preferredType, or a fallback scraper when that
 * type is not available.
 *
 * This only selects a scraper: the method returns a single instance and
 * cannot itself retry with other scrapers if that instance later fails.
 */
public function getScraperWithFallback(string $preferredType): ScraperInterface;
/**
 * Return the scraping-type identifiers this factory supports.
 *
 * NOTE(review): presumably a list of strings usable with createScraper() — confirm.
 */
public function getSupportedTypes(): array;
/**
 * Tell whether the given scraping type is supported.
 */
public function isSupported(string $type): bool;
}

View File

@@ -4,8 +4,8 @@ namespace App\Domain\Scraping\Domain\Exception;
class ChapterNotFoundException extends \Exception
{
public function __construct()
public function __construct(string $message = 'Chapter not found')
{
parent::__construct('Chapter not found');
parent::__construct($message);
}
}
}

View File

@@ -32,8 +32,8 @@ class ChapterUrl
private function validateUrlFormat(string $format): string
{
if (!str_contains($format, '{slug}') || !str_contains($format, '{chapterNumber}')) {
throw new InvalidArgumentException("The URL format must contain both {slug} and {chapterNumber} placeholders.");
if (!str_contains($format, '{slug}')) {
throw new InvalidArgumentException("The URL format must contain {slug} placeholder.");
}
return $format;

View File

@@ -0,0 +1,252 @@
<?php
namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest;
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingResult;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Contracts\HttpClient\HttpClientInterface;
use Symfony\Component\HttpClient\HttpClient;
/**
 * HTML scraper that fetches chapter pages with browser-like requests.
 *
 * Each request carries a randomly chosen User-Agent / Accept / Accept-Language
 * combination and is retried on failure. Two reader layouts are handled:
 *  - "vertical": every image of the chapter is on a single page;
 *  - "horizontal": one image per page, followed through a "next page" link.
 * The layout is chosen from the "nextPageSelector" scraping parameter.
 */
class AdvancedHtmlScraper implements ScraperInterface
{
    private const USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0'
    ];

    private const ACCEPT_HEADERS = [
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    ];

    private const ACCEPT_LANGUAGE_HEADERS = [
        'en-US,en;q=0.9',
        'en-US,en;q=0.8',
        'en-GB,en;q=0.9',
        'fr-FR,fr;q=0.9,en;q=0.8'
    ];

    /** Attributes probed, in order, to find an image URL (covers common lazy-loading markup). */
    private const IMAGE_ATTRIBUTES = [
        'src',
        'data-src',
        'data-lazy-src',
        'data-original',
        'data-zoom-image',
        'data-full-src'
    ];

    private const RETRY_ATTEMPTS = 3;
    private const RETRY_DELAY = 2; // seconds, multiplied by the attempt number
    private const REQUEST_TIMEOUT = 30;
    private const MAX_PAGES = 200; // safety cap for horizontal readers

    private readonly HttpClientInterface $httpClient;

    /**
     * @param HttpClientInterface|null $httpClient Client used for every request. When omitted, a
     *                                             default client with the scraper's timeout is created.
     */
    public function __construct(?HttpClientInterface $httpClient = null)
    {
        // The previous fallback branch could never run (the parameter was
        // non-nullable) and would have re-assigned a readonly property. TLS
        // verification is deliberately left enabled on the default client.
        $this->httpClient = $httpClient ?? HttpClient::create([
            'timeout' => self::REQUEST_TIMEOUT
        ]);
    }

    /**
     * Scrape the image URLs of the chapter described by the request.
     *
     * @throws \RuntimeException Wrapping any \Exception raised while scraping
     *                           (the original is available as the previous exception).
     */
    public function scrape(ScrapingRequest $request): ScrapingResult
    {
        $scrapingParameters = $request->getScrapingParameters();

        try {
            // empty() also covers a missing key, which previously raised a warning.
            $pages = empty($scrapingParameters['nextPageSelector'])
                ? $this->scrapeVerticalReader($request)
                : $this->scrapeHorizontalReader($request);

            return new ScrapingResult($pages, count($pages));
        } catch (\Exception $e) {
            throw new \RuntimeException('Advanced HTML scraping failed: ' . $e->getMessage(), 0, $e);
        }
    }

    /**
     * This scraper handles the "advanced_html" source type only.
     */
    public function supports(string $sourceType): bool
    {
        return 'advanced_html' === $sourceType;
    }

    /**
     * Collect every image matched by "imageSelector" on the chapter page.
     *
     * @return string[] Non-empty image URLs, in document order, indexed from 0.
     */
    private function scrapeVerticalReader(ScrapingRequest $request): array
    {
        $imageSelector = $this->requireImageSelector($request->getScrapingParameters());
        $pageUrl = (string) $request->getChapterUrl();

        $crawler = new Crawler($this->fetchHtmlWithRetry($pageUrl));
        $images = $crawler->filter($imageSelector)->each(
            fn (Crawler $node): string => $this->extractImageUrl($node, $pageUrl)
        );

        // array_values() so the result is a gap-free list after filtering.
        return array_values(array_filter($images, static fn (string $url): bool => $url !== ''));
    }

    /**
     * Follow the "next page" link from page to page, collecting one image per page.
     *
     * Stops on: a page already visited, the MAX_PAGES cap, a missing/empty
     * next link, or a next link that is not an http(s) URL.
     *
     * @return string[] Non-empty image URLs, in visiting order, indexed from 0.
     */
    private function scrapeHorizontalReader(ScrapingRequest $request): array
    {
        $params = $request->getScrapingParameters();
        $imageSelector = $this->requireImageSelector($params);
        $nextPageSelector = $params['nextPageSelector'];

        $pages = [];
        // Plain array keyed by URL: SplObjectStorage only accepts objects as
        // keys and raised a TypeError when indexed with a string.
        $visitedUrls = [];
        $currentUrl = (string) $request->getChapterUrl();

        for ($pageCount = 0; $currentUrl !== '' && $pageCount < self::MAX_PAGES; $pageCount++) {
            if (isset($visitedUrls[$currentUrl])) {
                break; // Loop in the pagination.
            }
            $visitedUrls[$currentUrl] = true;

            if ($pageCount > 0) {
                sleep(1); // Pause between consecutive page requests.
            }

            $crawler = new Crawler($this->fetchHtmlWithRetry($currentUrl));

            $imageNode = $crawler->filter($imageSelector)->first();
            if ($imageNode->count() > 0) {
                $imageUrl = $this->extractImageUrl($imageNode, $currentUrl);
                if ($imageUrl !== '') {
                    $pages[] = $imageUrl;
                }
            }

            $nextLink = $crawler->filter($nextPageSelector)->first();
            if ($nextLink->count() === 0) {
                break;
            }

            $nextHref = trim((string) $nextLink->attr('href'));
            if ($nextHref === '') {
                break;
            }

            $nextUrl = $this->resolveRelativeUrl($nextHref, $currentUrl);
            if (preg_match('#^https?://#i', $nextUrl) !== 1) {
                break; // e.g. "javascript:void(0)" on the last page: nothing to fetch.
            }

            $currentUrl = $nextUrl;
        }

        return $pages;
    }

    /**
     * Return the mandatory "imageSelector" parameter.
     *
     * @throws \InvalidArgumentException When the parameter is missing or empty.
     */
    private function requireImageSelector(array $params): string
    {
        if (empty($params['imageSelector'])) {
            throw new \InvalidArgumentException('Missing "imageSelector" scraping parameter');
        }

        return (string) $params['imageSelector'];
    }

    /**
     * Read the image URL of a node from the first non-empty attribute in
     * IMAGE_ATTRIBUTES, resolved against the page URL and cleaned.
     *
     * @return string The cleaned absolute URL, or '' when the node carries none.
     */
    private function extractImageUrl(Crawler $node, string $pageUrl): string
    {
        foreach (self::IMAGE_ATTRIBUTES as $attribute) {
            // attr() returns null for an absent attribute; cast before use.
            $candidate = trim((string) $node->attr($attribute));
            if ($candidate !== '') {
                return $this->cleanImageUrl($this->resolveRelativeUrl($candidate, $pageUrl));
            }
        }

        return '';
    }

    /**
     * Fetch a page, retrying up to RETRY_ATTEMPTS times with a growing delay.
     *
     * @throws \Exception The exception of the last failed attempt.
     */
    private function fetchHtmlWithRetry(string $url): string
    {
        $lastException = null;

        for ($attempt = 1; $attempt <= self::RETRY_ATTEMPTS; $attempt++) {
            try {
                return $this->fetchHtml($url);
            } catch (\Exception $e) {
                $lastException = $e;
                if ($attempt < self::RETRY_ATTEMPTS) {
                    sleep(self::RETRY_DELAY * $attempt);
                }
            }
        }

        throw $lastException;
    }

    /**
     * Perform a single GET request and return the response body.
     *
     * @throws \RuntimeException On transport errors, HTTP status >= 400, or when the
     *                           body contains a Cloudflare browser-check marker.
     */
    private function fetchHtml(string $url): string
    {
        try {
            $response = $this->httpClient->request('GET', $url, [
                'headers' => $this->generateHeaders(),
                'timeout' => self::REQUEST_TIMEOUT
            ]);

            $statusCode = $response->getStatusCode();
            if ($statusCode >= 400) {
                throw new \RuntimeException("HTTP {$statusCode} error for URL: {$url}");
            }

            $content = $response->getContent();

            if (str_contains($content, 'cf-browser-verification')
                || str_contains($content, 'Checking your browser')) {
                throw new \RuntimeException('Blocked by Cloudflare protection');
            }

            return $content;
        } catch (\Exception $e) {
            throw new \RuntimeException('Failed to fetch HTML: ' . $e->getMessage(), 0, $e);
        }
    }

    /**
     * Build a browser-like header set with randomly picked User-Agent, Accept
     * and Accept-Language values.
     *
     * @return array<string, string>
     */
    private function generateHeaders(): array
    {
        // No explicit Accept-Encoding: Symfony HttpClient negotiates compression
        // and decodes the body itself only when the caller does not set this
        // header. Forcing "gzip, deflate, br" would hand compressed bytes to the
        // Crawler. NOTE(review): confirm against the installed HttpClient version.
        return [
            'User-Agent' => self::USER_AGENTS[array_rand(self::USER_AGENTS)],
            'Accept' => self::ACCEPT_HEADERS[array_rand(self::ACCEPT_HEADERS)],
            'Accept-Language' => self::ACCEPT_LANGUAGE_HEADERS[array_rand(self::ACCEPT_LANGUAGE_HEADERS)],
            'DNT' => '1',
            'Connection' => 'keep-alive',
            'Upgrade-Insecure-Requests' => '1',
            'Sec-Fetch-Dest' => 'document',
            'Sec-Fetch-Mode' => 'navigate',
            'Sec-Fetch-Site' => 'none',
            'Sec-Fetch-User' => '?1',
            'Cache-Control' => 'max-age=0'
        ];
    }

    /**
     * Resolve a possibly relative reference against the URL of the page it was found on.
     *
     * Handles absolute URLs of any scheme (returned untouched), protocol-relative
     * ("//host/x"), root-relative ("/x"), query/fragment-only ("?p=2", "#a") and
     * path-relative ("x.jpg") references. Dot segments ("../") are not collapsed.
     */
    private function resolveRelativeUrl(string $url, string $baseUrl): string
    {
        $url = trim($url);

        // Anything that already carries a scheme (http:, https:, data:, ...) is absolute.
        if ($url === '' || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url) === 1) {
            return $url;
        }

        $parsedBase = parse_url($baseUrl);
        if (!is_array($parsedBase) || !isset($parsedBase['scheme'], $parsedBase['host'])) {
            return $url; // Base is not an absolute URL: nothing to resolve against.
        }

        $scheme = $parsedBase['scheme'];
        if (str_starts_with($url, '//')) {
            return $scheme . ':' . $url; // Protocol-relative reference.
        }

        $origin = $scheme . '://' . $parsedBase['host']
            . (isset($parsedBase['port']) ? ':' . $parsedBase['port'] : '');
        $path = $parsedBase['path'] ?? '/';

        if (str_starts_with($url, '/')) {
            return $origin . $url;
        }

        if (str_starts_with($url, '?')) {
            return $origin . $path . $url; // Same document, different query.
        }

        if (str_starts_with($url, '#')) {
            $query = isset($parsedBase['query']) ? '?' . $parsedBase['query'] : '';

            return $origin . $path . $query . $url;
        }

        // Path-relative: replace everything after the last "/" of the base path.
        // (dirname() dropped the last directory of bases ending in "/".)
        $lastSlash = strrpos($path, '/');
        $directory = $lastSlash === false ? '/' : substr($path, 0, $lastSlash + 1);

        return $origin . $directory . $url;
    }

    /**
     * Strip control characters and tracking-style query parameters from an image URL.
     *
     * A query parameter is dropped when its name starts with "utm_", "ref" or
     * "source" (case-insensitive), the same name prefixes the original pattern targeted.
     * The remaining query string is rebuilt so it always starts with "?".
     */
    private function cleanImageUrl(string $url): string
    {
        $url = preg_replace('/[\x00-\x1F\x7F]/', '', trim($url)) ?? '';
        if ($url === '') {
            return '';
        }

        [$withoutFragment, $fragment] = array_pad(explode('#', $url, 2), 2, null);
        if (!str_contains($withoutFragment, '?')) {
            return $url;
        }

        [$base, $query] = explode('?', $withoutFragment, 2);
        $keptPairs = array_filter(
            explode('&', $query),
            static fn (string $pair): bool => $pair !== '' && preg_match('/^(utm_|ref|source)/i', $pair) !== 1
        );

        $cleaned = $base;
        if ($keptPairs !== []) {
            $cleaned .= '?' . implode('&', $keptPairs);
        }
        if ($fragment !== null) {
            $cleaned .= '#' . $fragment;
        }

        return $cleaned;
    }
}

View File

@@ -0,0 +1,157 @@
<?php
namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
use App\Domain\Scraping\Domain\Exception\ChapterNotFoundException;
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest;
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingResult;
use Symfony\Component\Process\Process;
/**
 * Scraper that delegates page rendering to a Node.js Puppeteer script.
 *
 * The script is started as a child process; its last output line is expected
 * to be either "RESULT:<json array of image URLs>" or
 * "CHAPTER_NOT_FOUND:<json object>".
 */
class JavaScriptScraper implements ScraperInterface
{
    private const PUPPETEER_TIMEOUT = 60; // seconds
    private const PUPPETEER_SCRIPT_PATH = '/public/puppeteer-scraper.js';
    private const NODE_EXECUTABLE = 'node';
    private const CHAPTER_NOT_FOUND_PREFIX = 'CHAPTER_NOT_FOUND:';
    private const RESULT_PREFIX = 'RESULT:';

    /**
     * @param string $projectDir Directory the Puppeteer script path is appended to.
     */
    public function __construct(
        private readonly string $projectDir
    ) {}

    /**
     * Scrape the image URLs of the chapter described by the request.
     *
     * @throws \RuntimeException Wrapping any \Exception raised while scraping, including
     *                           ChapterNotFoundException (available as the previous exception).
     */
    public function scrape(ScrapingRequest $request): ScrapingResult
    {
        $scrappingParameters = $request->getScrapingParameters();

        try {
            $scriptPath = $this->projectDir . self::PUPPETEER_SCRIPT_PATH;
            if (!file_exists($scriptPath)) {
                throw new \RuntimeException('Puppeteer script not found at: ' . $scriptPath);
            }

            $imageUrls = !empty($scrappingParameters['nextPageSelector'])
                ? $this->scrapeHorizontalReader($request, $scriptPath)
                : $this->scrapeVerticalReader($request, $scriptPath);

            return new ScrapingResult($imageUrls, count($imageUrls));
        } catch (\Exception $e) {
            throw new \RuntimeException('JavaScript scraping failed: ' . $e->getMessage(), 0, $e);
        }
    }

    /**
     * This scraper handles the "javascript" source type only.
     */
    public function supports(string $sourceType): bool
    {
        return 'javascript' === $sourceType;
    }

    /**
     * Run the script in "vertical" mode (all images on one page, with scrolling).
     *
     * @return string[]
     */
    private function scrapeVerticalReader(ScrapingRequest $request, string $scriptPath): array
    {
        $arguments = $this->buildProcessArguments($request, $scriptPath, 'vertical', [
            '--wait-for-images=true',
            '--scroll=true'
        ]);

        return $this->executeProcess(new Process($arguments));
    }

    /**
     * Run the script in "horizontal" mode (one image per page, following the next link).
     *
     * @return string[]
     */
    private function scrapeHorizontalReader(ScrapingRequest $request, string $scriptPath): array
    {
        $params = $request->getScrapingParameters();
        $arguments = $this->buildProcessArguments($request, $scriptPath, 'horizontal', [
            '--next-selector=' . $params['nextPageSelector'],
            '--wait-for-images=true'
        ]);

        return $this->executeProcess(new Process($arguments));
    }

    /**
     * Assemble the command line shared by both reader modes.
     *
     * Order: executable, script, mode, URL, image selector, the mode-specific
     * arguments, then the optional chapter selector / chapter number.
     *
     * @param string[] $modeArguments Arguments specific to the reader mode.
     *
     * @return string[]
     *
     * @throws \InvalidArgumentException When "imageSelector" is missing or empty.
     */
    private function buildProcessArguments(
        ScrapingRequest $request,
        string $scriptPath,
        string $mode,
        array $modeArguments
    ): array {
        $params = $request->getScrapingParameters();
        if (empty($params['imageSelector'])) {
            throw new \InvalidArgumentException('Missing "imageSelector" scraping parameter');
        }

        $arguments = [
            self::NODE_EXECUTABLE,
            $scriptPath,
            '--mode=' . $mode,
            '--url=' . $request->getChapterUrl(),
            '--image-selector=' . $params['imageSelector'],
            ...$modeArguments
        ];

        if (!empty($params['chapterSelector'])) {
            $arguments[] = '--chapter-selector=' . $params['chapterSelector'];
        }

        if (isset($params['chapterNumber'])) {
            $arguments[] = '--chapter-number=' . $params['chapterNumber'];
        }

        return $arguments;
    }

    /**
     * Run the process and decode the last line of its standard output.
     *
     * @return string[] Cleaned, valid image URLs indexed from 0.
     *
     * @throws ChapterNotFoundException When the script reports CHAPTER_NOT_FOUND.
     * @throws \RuntimeException        When the process fails or its output is not understood.
     */
    private function executeProcess(Process $process): array
    {
        $process->setTimeout(self::PUPPETEER_TIMEOUT);
        $process->run();

        if (!$process->isSuccessful()) {
            $error = $process->getErrorOutput() ?: $process->getOutput();
            throw new \RuntimeException('Puppeteer process failed: ' . $error);
        }

        // Only the last line carries the result; earlier lines are ignored.
        $lines = explode("\n", trim($process->getOutput()));
        $resultLine = end($lines);

        if (str_starts_with($resultLine, self::CHAPTER_NOT_FOUND_PREFIX)) {
            $errorData = json_decode(substr($resultLine, strlen(self::CHAPTER_NOT_FOUND_PREFIX)), true);
            // Only a string message is forwarded; anything else falls back to the default text.
            if (is_array($errorData) && isset($errorData['message']) && is_string($errorData['message'])) {
                throw new ChapterNotFoundException($errorData['message']);
            }
            throw new ChapterNotFoundException('Le chapitre demandé n\'est pas disponible.');
        }

        if (str_starts_with($resultLine, self::RESULT_PREFIX)) {
            $imageUrls = json_decode(substr($resultLine, strlen(self::RESULT_PREFIX)), true);
            if (!is_array($imageUrls)) {
                throw new \RuntimeException('Failed to parse Puppeteer output');
            }

            return $this->cleanImageUrls($imageUrls);
        }

        throw new \RuntimeException('Invalid Puppeteer output format: ' . $resultLine);
    }

    /**
     * Keep only string entries that are valid URLs once cleaned.
     *
     * Non-string entries are dropped up front: passing them to cleanImageUrl()
     * would raise a TypeError, which is not an \Exception and would bypass the
     * error wrapping done in scrape().
     *
     * @return string[] Indexed from 0 without gaps.
     */
    private function cleanImageUrls(array $urls): array
    {
        $cleaned = [];
        foreach ($urls as $url) {
            if (!is_string($url)) {
                continue;
            }

            $candidate = $this->cleanImageUrl($url);
            if (!empty($candidate) && filter_var($candidate, FILTER_VALIDATE_URL)) {
                $cleaned[] = $candidate;
            }
        }

        return $cleaned;
    }

    /**
     * Trim the URL and remove ASCII control characters from it.
     */
    private function cleanImageUrl(string $url): string
    {
        // preg_replace() returns null on a PCRE error; treat that as "no usable URL".
        return preg_replace('/[\x00-\x1F\x7F]/', '', trim($url)) ?? '';
    }
}

View File

@@ -0,0 +1,146 @@
<?php
namespace App\Domain\Scraping\Infrastructure\Service;
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
use App\Domain\Scraping\Domain\Contract\Service\ScraperFactoryInterface;
use App\Domain\Scraping\Infrastructure\Service\Scraper\HtmlScraper;
use App\Domain\Scraping\Infrastructure\Service\Scraper\AdvancedHtmlScraper;
use App\Domain\Scraping\Infrastructure\Service\Scraper\JavaScriptScraper;
use App\Domain\Scraping\Domain\Contract\Service\ImageDownloaderInterface;
use Symfony\Component\Messenger\MessageBusInterface;
use Symfony\Contracts\HttpClient\HttpClientInterface;
/**
 * Factory that builds one instance of every known scraper when it is
 * constructed and then serves those shared instances by type name.
 */
class ScraperFactory implements ScraperFactoryInterface
{
    /** Scraping-type identifier => scraper class. */
    private const SCRAPER_TYPES = [
        'html' => HtmlScraper::class,
        'advanced_html' => AdvancedHtmlScraper::class,
        'javascript' => JavaScriptScraper::class,
    ];

    /** Scraping-type identifier => rank (lower rank is preferred by getBestScraper()). */
    private const SCRAPER_PRIORITIES = [
        'javascript' => 1, // Most capable at getting past protections
        'advanced_html' => 2, // Trade-off between performance and effectiveness
        'html' => 3, // Simplest and fastest
    ];

    /** Scraper instances keyed by scraping-type identifier. */
    private array $scrapers = [];

    public function __construct(
        private readonly ImageDownloaderInterface $imageDownloader,
        private readonly MessageBusInterface $eventBus,
        private readonly HttpClientInterface $httpClient,
        private readonly string $projectDir
    ) {
        $this->initializeScrapers();
    }

    /**
     * Return the scraper registered for the given type.
     *
     * The instance built at construction time is returned; no new object is created.
     *
     * @throws \InvalidArgumentException When no scraper is registered for $type.
     */
    public function createScraper(string $type): ScraperInterface
    {
        return $this->scrapers[$type]
            ?? throw new \InvalidArgumentException("Scraper type '{$type}' is not supported");
    }

    /**
     * Return the scraper whose type has the lowest rank in SCRAPER_PRIORITIES.
     */
    public function getBestScraper(): ScraperInterface
    {
        $ranking = self::SCRAPER_PRIORITIES;
        asort($ranking);

        return $this->scrapers[array_key_first($ranking)];
    }

    /**
     * Return every scraper instance, keyed by scraping-type identifier.
     */
    public function getAvailableScrapers(): array
    {
        return $this->scrapers;
    }

    /**
     * Return the scraping-type identifiers declared in SCRAPER_TYPES.
     */
    public function getSupportedTypes(): array
    {
        return array_keys(self::SCRAPER_TYPES);
    }

    /**
     * Tell whether the given type is declared in SCRAPER_TYPES.
     */
    public function isSupported(string $type): bool
    {
        return isset(self::SCRAPER_TYPES[$type]);
    }

    /**
     * Return the fallback scraper, i.e. the one registered as "html".
     */
    public function getFallbackScraper(): ScraperInterface
    {
        return $this->scrapers['html'];
    }

    /**
     * Return the scraper for the preferred type when it is supported,
     * otherwise the fallback scraper. Nothing is executed here.
     */
    public function getScraperWithFallback(string $preferredType): ScraperInterface
    {
        return $this->isSupported($preferredType)
            ? $this->scrapers[$preferredType]
            : $this->getFallbackScraper();
    }

    /**
     * Return a summary of the registered scrapers: how many there are, the
     * supported types, the priority table and the classes of the best and
     * fallback scrapers.
     */
    public function getScraperStats(): array
    {
        $bestScraperClass = get_class($this->getBestScraper());
        $fallbackScraperClass = get_class($this->getFallbackScraper());

        return [
            'total_scrapers' => count($this->scrapers),
            'supported_types' => $this->getSupportedTypes(),
            'priorities' => self::SCRAPER_PRIORITIES,
            'best_scraper' => $bestScraperClass,
            'fallback_scraper' => $fallbackScraperClass
        ];
    }

    /**
     * Instantiate one scraper per entry of SCRAPER_TYPES, keeping the type keys.
     */
    private function initializeScrapers(): void
    {
        $this->scrapers = array_map(
            fn (string $scraperClass): ScraperInterface => $this->createScraperInstance($scraperClass),
            self::SCRAPER_TYPES
        );
    }

    /**
     * Build a scraper of the given class with the dependencies it needs.
     *
     * @throws \InvalidArgumentException When the class is not one of the known scrapers.
     */
    private function createScraperInstance(string $class): ScraperInterface
    {
        if ($class === HtmlScraper::class) {
            return new HtmlScraper(
                $this->imageDownloader,
                $this->eventBus,
                $this->httpClient
            );
        }

        if ($class === AdvancedHtmlScraper::class) {
            return new AdvancedHtmlScraper($this->httpClient);
        }

        if ($class === JavaScriptScraper::class) {
            return new JavaScriptScraper($this->projectDir);
        }

        throw new \InvalidArgumentException("Unknown scraper class: {$class}");
    }
}