diff --git a/src/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandler.php b/src/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandler.php index 5cc569e..c2dcf8d 100644 --- a/src/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandler.php +++ b/src/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandler.php @@ -4,13 +4,19 @@ namespace App\Domain\Scraping\Application\CommandHandler; use App\Domain\Scraping\Application\Command\ScrapeChapter; use App\Domain\Scraping\Domain\Contract\Repository\ChapterRepositoryInterface; +use App\Domain\Scraping\Domain\Contract\Repository\MangaRepositoryInterface; use App\Domain\Scraping\Domain\Contract\Repository\ScrapingJobRepositoryInterface; +use App\Domain\Scraping\Domain\Contract\Repository\SourceRepositoryInterface; +use App\Domain\Scraping\Domain\Contract\Service\CbzGeneratorInterface; +use App\Domain\Scraping\Domain\Contract\Service\ImageDownloaderInterface; use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface; use App\Domain\Scraping\Domain\Event\ChapterScraped; use App\Domain\Scraping\Domain\Event\ChapterScrapingFailed; use App\Domain\Scraping\Domain\Event\ChapterScrapingStarted; use App\Domain\Scraping\Domain\Model\ScrapingJob; -use App\Domain\Scraping\Domain\Model\ScrapingStatus; +use App\Domain\Scraping\Domain\Model\ValueObject\CbzGenerationRequest; +use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest; +use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory; use Ramsey\Uuid\Uuid; use Symfony\Component\Messenger\MessageBusInterface; @@ -18,8 +24,12 @@ readonly class ScrapeChapterHandler { public function __construct( private ScraperInterface $scraper, + private ImageDownloaderInterface $imageDownloader, + private CbzGeneratorInterface $cbzGenerator, private ScrapingJobRepositoryInterface $scrapingJobRepository, private ChapterRepositoryInterface $chapterRepository, + private MangaRepositoryInterface $mangaRepository, + private SourceRepositoryInterface $sourceRepository, private MessageBusInterface $eventBus ) { } @@ -27,30 +37,71 @@ readonly class ScrapeChapterHandler public function handle(ScrapeChapter $command): void { try { + // 1. Création du job $job = new ScrapingJob( - Uuid::uuid4(), + Uuid::uuid4()->toString(), $command->mangaId, $command->chapterNumber, $command->sourceId ); - $this->scrapingJobRepository->save($job); + // 2. Préparation des données + $manga = $this->mangaRepository->getById($command->mangaId); + $chapter = $this->chapterRepository->getByMangaIdAndChapterNumber($command->mangaId, $command->chapterNumber); + $source = $this->sourceRepository->getById($command->sourceId); + $this->eventBus->dispatch(new ChapterScrapingStarted($job->getId())); - $job = $this->scraper->scrape($job); - - if($job->status === ScrapingStatus::FAILED) { - $this->eventBus->dispatch(new ChapterScrapingFailed($command->mangaId, $command->chapterNumber, $job->failureReason)); - }elseif ($job->status === ScrapingStatus::COMPLETED) { - $this->eventBus->dispatch(new ChapterScraped($job->getId())); - $chapter = $this->chapterRepository->getByMangaIdAndChapterNumber($command->mangaId, $command->chapterNumber); - $chapter->cbzPath = $job->cbzPath->getPath(); - $this->chapterRepository->save($chapter); - } + // 3. Scraping des URLs + $scrapingRequest = new ScrapingRequest( + 'html', + $source->buildChapterUrl($manga->getSlug(), $command->chapterNumber), + $source->getScrappingParameters(), + $job->getId() + ); + $scrapingResult = $this->scraper->scrape($scrapingRequest); + $job->totalPages = $scrapingResult->getTotalPages(); $this->scrapingJobRepository->save($job); + + // 4. Téléchargement des images + $tempDir = new TempDirectory(); + $downloadResults = $this->imageDownloader->downloadBatch( + $scrapingResult->getImageUrls(), + $tempDir, + $job->getId() + ); + + // 5. Génération du CBZ + $cbzRequest = new CbzGenerationRequest( + $manga->getTitle(), + $manga->getPublicationYear(), + $chapter->volumeNumber, + $command->chapterNumber, + $tempDir, + array_map(fn($r) => $r->getLocalPath(), $downloadResults) + ); + + $cbzPath = $this->cbzGenerator->generate($cbzRequest); + + // 6. Mise à jour et sauvegarde + $job->complete(); + $job->cbzPath = $cbzPath; + $this->scrapingJobRepository->save($job); + + $chapter->cbzPath = $cbzPath->getPath(); + $this->chapterRepository->save($chapter); + + $this->eventBus->dispatch(new ChapterScraped($job->getId())); + + // 7. Nettoyage + $tempDir->cleanup(); } catch (\Exception $e) { + if (isset($job)) { + $job->fail($e->getMessage()); + $this->scrapingJobRepository->save($job); + } $this->eventBus->dispatch(new ChapterScrapingFailed($command->mangaId, $command->chapterNumber, $e->getMessage())); throw $e; } diff --git a/src/Domain/Scraping/Domain/Contract/Service/CbzGeneratorInterface.php b/src/Domain/Scraping/Domain/Contract/Service/CbzGeneratorInterface.php index f1c258d..0f2e28f 100644 --- a/src/Domain/Scraping/Domain/Contract/Service/CbzGeneratorInterface.php +++ b/src/Domain/Scraping/Domain/Contract/Service/CbzGeneratorInterface.php @@ -2,11 +2,10 @@ namespace App\Domain\Scraping\Domain\Contract\Service; -use App\Domain\Scraping\Domain\Model\ScrapingJob; +use App\Domain\Scraping\Domain\Model\ValueObject\CbzGenerationRequest; use App\Domain\Scraping\Domain\Model\ValueObject\CbzPath; -use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory; interface CbzGeneratorInterface { - public function generate(ScrapingJob $job, TempDirectory $tempDirectory): CbzPath; + public function generate(CbzGenerationRequest $request): CbzPath; } diff --git a/src/Domain/Scraping/Domain/Contract/Service/ImageDownloaderInterface.php b/src/Domain/Scraping/Domain/Contract/Service/ImageDownloaderInterface.php index f3a3737..98fa8ae 100644 --- a/src/Domain/Scraping/Domain/Contract/Service/ImageDownloaderInterface.php +++ b/src/Domain/Scraping/Domain/Contract/Service/ImageDownloaderInterface.php @@ -2,7 +2,16 @@ namespace App\Domain\Scraping\Domain\Contract\Service; +use App\Domain\Scraping\Domain\Model\ValueObject\DownloadResult; +use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory; + interface ImageDownloaderInterface { public function download(string $url, string $destination): void; + + /** + * @param array $urls + * @return array + */ + public function downloadBatch(array $urls, TempDirectory $tempDir, string $jobId): array; } diff --git a/src/Domain/Scraping/Domain/Contract/Service/ScraperInterface.php b/src/Domain/Scraping/Domain/Contract/Service/ScraperInterface.php index 3c615f5..7f58522 100644 --- a/src/Domain/Scraping/Domain/Contract/Service/ScraperInterface.php +++ b/src/Domain/Scraping/Domain/Contract/Service/ScraperInterface.php @@ -2,10 +2,11 @@ namespace App\Domain\Scraping\Domain\Contract\Service; -use App\Domain\Scraping\Domain\Model\ScrapingJob; +use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest; +use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingResult; interface ScraperInterface { - public function scrape(ScrapingJob $job): ScrapingJob; + public function scrape(ScrapingRequest $request): ScrapingResult; public function supports(string $sourceType): bool; } diff --git a/src/Domain/Scraping/Domain/Model/Source.php b/src/Domain/Scraping/Domain/Model/Source.php index 9d88f17..1bddef4 100644 --- a/src/Domain/Scraping/Domain/Model/Source.php +++ b/src/Domain/Scraping/Domain/Model/Source.php @@ -2,6 +2,7 @@ namespace App\Domain\Scraping\Domain\Model; +use App\Domain\Scraping\Domain\Model\ValueObject\ChapterUrl; use App\Domain\Scraping\Domain\Model\ValueObject\SourceId; use DateTimeImmutable; @@ -58,4 +59,10 @@ readonly class Source { return $this->updatedAt; } + + public function buildChapterUrl(string $mangaSlug, float $chapterNumber): string + { + $chapterUrl = new ChapterUrl($this->scrappingParameters['chapterUrlFormat'], $mangaSlug, $chapterNumber); + return $chapterUrl->getUrl(); + } } diff --git a/src/Domain/Scraping/Domain/Model/ValueObject/CbzGenerationRequest.php b/src/Domain/Scraping/Domain/Model/ValueObject/CbzGenerationRequest.php new file mode 100644 index 0000000..aac6bbd --- /dev/null +++ b/src/Domain/Scraping/Domain/Model/ValueObject/CbzGenerationRequest.php @@ -0,0 +1,65 @@ +mangaTitle; + } + + public function getPublicationYear(): string + { + return $this->publicationYear; + } + + public function getVolumeNumber(): int + { + return $this->volumeNumber; + } + + public function getChapterNumber(): float + { + return $this->chapterNumber; + } + + public function getSourceDirectory(): TempDirectory + { + return $this->sourceDirectory; + } + + public function getFiles(): array + { + return $this->files; + } +} diff --git a/src/Domain/Scraping/Domain/Model/ValueObject/DownloadResult.php b/src/Domain/Scraping/Domain/Model/ValueObject/DownloadResult.php new file mode 100644 index 0000000..8d71235 --- /dev/null +++ b/src/Domain/Scraping/Domain/Model/ValueObject/DownloadResult.php @@ -0,0 +1,29 @@ +localPath; + } + + public function getOriginalUrl(): string + { + return $this->originalUrl; + } +} diff --git a/src/Domain/Scraping/Domain/Model/ValueObject/ScrapingRequest.php b/src/Domain/Scraping/Domain/Model/ValueObject/ScrapingRequest.php new file mode 100644 index 0000000..e6120cb --- /dev/null +++ b/src/Domain/Scraping/Domain/Model/ValueObject/ScrapingRequest.php @@ -0,0 +1,34 @@ +sourceType; + } + + public function getChapterUrl(): string + { + return $this->chapterUrl; + } + + public function getScrapingParameters(): array + { + return $this->scrapingParameters; + } + + public function getJobId(): string + { + return $this->jobId; + } +} diff --git a/src/Domain/Scraping/Domain/Model/ValueObject/ScrapingResult.php b/src/Domain/Scraping/Domain/Model/ValueObject/ScrapingResult.php new file mode 100644 index 0000000..71654e6 --- /dev/null +++ b/src/Domain/Scraping/Domain/Model/ValueObject/ScrapingResult.php @@ -0,0 +1,31 @@ +imageUrls; + } + + public function getTotalPages(): int + { + return $this->totalPages; + } +} diff --git a/src/Domain/Scraping/Domain/Model/ValueObject/TempDirectory.php b/src/Domain/Scraping/Domain/Model/ValueObject/TempDirectory.php index d3528ab..bafccbe 100644 --- a/src/Domain/Scraping/Domain/Model/ValueObject/TempDirectory.php +++ b/src/Domain/Scraping/Domain/Model/ValueObject/TempDirectory.php @@ -2,12 +2,15 @@ namespace App\Domain\Scraping\Domain\Model\ValueObject; -readonly class TempDirectory +class TempDirectory { - public function __construct(private string $path) + private string $path; + + public function __construct() { - if (!is_dir($path) && !mkdir($path)) { - throw new \RuntimeException("Failed to create directory: $path"); + $this->path = sys_get_temp_dir() . '/' . uniqid('manga_scraper_'); + if (!mkdir($this->path, 0755, true)) { + throw new \RuntimeException('Failed to create temporary directory'); } } @@ -15,4 +18,31 @@ readonly class TempDirectory { return $this->path; } + + public function cleanup(): void + { + if (!is_dir($this->path)) { + return; + } + + $files = new \RecursiveIteratorIterator( + new \RecursiveDirectoryIterator($this->path, \RecursiveDirectoryIterator::SKIP_DOTS), + \RecursiveIteratorIterator::CHILD_FIRST + ); + + foreach ($files as $file) { + if ($file->isDir()) { + rmdir($file->getRealPath()); + } else { + unlink($file->getRealPath()); + } + } + + rmdir($this->path); + } + + public function __destruct() + { + $this->cleanup(); + } } diff --git a/src/Domain/Scraping/Infrastructure/Persistence/DoctrineScrapingJobRepository.php b/src/Domain/Scraping/Infrastructure/Persistence/DoctrineScrapingJobRepository.php index ef6438f..b42a6f8 100644 --- a/src/Domain/Scraping/Infrastructure/Persistence/DoctrineScrapingJobRepository.php +++ b/src/Domain/Scraping/Infrastructure/Persistence/DoctrineScrapingJobRepository.php @@ -17,6 +17,7 @@ readonly class DoctrineScrapingJobRepository implements ScrapingJobRepositoryInt public function save(ScrapingJob $job): void { + /** @var ScrapingJobEntity $existingEntity */ $existingEntity = $this->entityManager->getRepository(ScrapingJobEntity::class)->find($job->getId()); if ($existingEntity) { diff --git a/src/Domain/Scraping/Infrastructure/Persistence/Entity/ScrapingJobEntity.php b/src/Domain/Scraping/Infrastructure/Persistence/Entity/ScrapingJobEntity.php index b06b1d5..53fbc4e 100644 --- a/src/Domain/Scraping/Infrastructure/Persistence/Entity/ScrapingJobEntity.php +++ b/src/Domain/Scraping/Infrastructure/Persistence/Entity/ScrapingJobEntity.php @@ -86,7 +86,7 @@ class ScrapingJobEntity $this->pages = $pages; } - public function setCompletedAt(\DateTimeImmutable $completedAt): void + public function setCompletedAt(?\DateTimeImmutable $completedAt): void { $this->completedAt = $completedAt; } diff --git a/src/Domain/Scraping/Infrastructure/Service/CbzGenerator.php b/src/Domain/Scraping/Infrastructure/Service/CbzGenerator.php index 8a2d900..04cdcdc 100644 --- a/src/Domain/Scraping/Infrastructure/Service/CbzGenerator.php +++ b/src/Domain/Scraping/Infrastructure/Service/CbzGenerator.php @@ -2,103 +2,87 @@ namespace App\Domain\Scraping\Infrastructure\Service; -use App\Domain\Scraping\Domain\Contract\Repository\ChapterRepositoryInterface; -use App\Domain\Scraping\Domain\Contract\Repository\MangaRepositoryInterface; -use App\Domain\Scraping\Domain\Model\Manga; -use App\Domain\Scraping\Domain\Model\ScrapingJob; -use App\Domain\Scraping\Domain\Model\ValueObject\CbzPath; use App\Domain\Scraping\Domain\Contract\Service\CbzGeneratorInterface; -use App\Domain\Scraping\Domain\Model\Chapter; -use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory; -use App\Domain\Scraping\Domain\Exception\CbzGenerationException; -use Exception; +use App\Domain\Scraping\Domain\Model\ValueObject\CbzGenerationRequest; +use App\Domain\Scraping\Domain\Model\ValueObject\CbzPath; readonly class CbzGenerator implements CbzGeneratorInterface { public function __construct( - private string $projectDir, - private MangaRepositoryInterface $mangaRepository, - private ChapterRepositoryInterface $chapterRepository, - ) {} + private string $projectDir + ) { + } - /** - * @throws Exception - */ - public function generate(ScrapingJob $job, TempDirectory $tempDirectory): CbzPath + public function generate(CbzGenerationRequest $request): CbzPath { - $cbzPath = $this->generateCbzPath($job); - $this->createCbzArchive($tempDirectory->getPath(), $cbzPath); - + $cbzPath = $this->generateCbzPath($request); + $this->createCbzArchive($request->getFiles(), $cbzPath); return new CbzPath($cbzPath); } - private function generateCbzPath(ScrapingJob $job): string + private function generateCbzPath(CbzGenerationRequest $request): string { - $manga = $this->mangaRepository->getById($job->getMangaId()); - $chapter = $this->chapterRepository->getByMangaIdAndChapterNumber($job->getMangaId(), $job->getChapterNumber()); - - $baseDir = sprintf( - '%s/public/cbz/%s/%s', - $this->projectDir, - $manga->getTitle() . ' (' . $manga->getPublicationYear() . ')', - sprintf('volume_%02d', $chapter->volumeNumber) + $mangaDir = $this->createMangaDirectory( + $this->slugify($request->getMangaTitle()), + $request->getPublicationYear() ); - try { - if (!is_dir($baseDir)) { - if (!mkdir($baseDir, 0755, true)) { - throw new CbzGenerationException(); - } - } - } catch (Exception $e) { - throw CbzGenerationException::unableToCreateDirectory($baseDir); - } - - $chapterNumber = $job->getChapterNumber(); - $formattedNumber = $chapterNumber == floor($chapterNumber) - ? sprintf('%02d', (int)$chapterNumber) - : sprintf('%04.1f', $chapterNumber); + $volumeDir = $this->createVolumeDirectory($mangaDir, $request->getVolumeNumber()); return sprintf( - '%s/%s_vol%s_ch%s.cbz', - $baseDir, - strtolower($manga->getTitle()), - sprintf('%02d', $chapter->volumeNumber), - $formattedNumber + '%s/%s_vol%d_ch%s.cbz', + $volumeDir, + $this->slugify($request->getMangaTitle()), + $request->getVolumeNumber(), + $request->getChapterNumber() ); } - /** - * @throws Exception - */ - private function createCbzArchive(string $sourceDirectory, string $destinationPath): void + private function createCbzArchive(array $files, string $cbzPath): void { $zip = new \ZipArchive(); - - if ($zip->open($destinationPath, \ZipArchive::CREATE) !== true) { - throw CbzGenerationException::unableToCreateCbz($destinationPath); + if ($zip->open($cbzPath, \ZipArchive::CREATE | \ZipArchive::OVERWRITE) !== true) { + throw new \RuntimeException('Failed to create CBZ archive'); } - try { - $files = new \RecursiveIteratorIterator( - new \RecursiveDirectoryIterator($sourceDirectory), - \RecursiveIteratorIterator::LEAVES_ONLY - ); - - foreach ($files as $file) { - if (!$file->isDir()) { - $filePath = $file->getRealPath(); - $relativePath = substr($filePath, strlen($sourceDirectory) + 1); - if (!$zip->addFile($filePath, $relativePath)) { - throw CbzGenerationException::unableToAddFileToArchive($filePath); - } - } + foreach ($files as $file) { + if (!file_exists($file)) { + throw new \RuntimeException("File not found: $file"); } - } catch (Exception $e) { - $zip->close(); - throw $e; + $zip->addFile($file, basename($file)); } - $zip->close(); + if (!$zip->close()) { + throw new \RuntimeException('Failed to close CBZ archive'); + } + } + + private function createMangaDirectory(string $mangaSlug, string $publicationYear): string + { + $dir = sprintf('%s/public/cbz/%s', $this->projectDir, ucfirst($mangaSlug) . ' (' . $publicationYear . ')'); + if (!is_dir($dir) && !mkdir($dir, 0755, true)) { + throw new \RuntimeException("Failed to create directory: $dir"); + } + return $dir; + } + + private function createVolumeDirectory(string $mangaDir, int $volumeNumber): string + { + $dir = sprintf('%s/volume_%02d', $mangaDir, $volumeNumber); + if (!is_dir($dir) && !mkdir($dir, 0755, true)) { + throw new \RuntimeException("Failed to create directory: $dir"); + } + return $dir; + } + + private function slugify(string $text): string + { + $text = preg_replace('~[^\pL\d]+~u', '-', $text); + $text = iconv('utf-8', 'us-ascii//TRANSLIT', $text); + $text = preg_replace('~[^-\w]+~', '', $text); + $text = trim($text, '-'); + $text = preg_replace('~-+~', '-', $text); + $text = strtolower($text); + return $text ?: 'n-a'; } } diff --git a/src/Domain/Scraping/Infrastructure/Service/ImageDownloader.php b/src/Domain/Scraping/Infrastructure/Service/ImageDownloader.php index d42c435..eafd788 100644 --- a/src/Domain/Scraping/Infrastructure/Service/ImageDownloader.php +++ b/src/Domain/Scraping/Infrastructure/Service/ImageDownloader.php @@ -2,13 +2,19 @@ namespace App\Domain\Scraping\Infrastructure\Service; -use Symfony\Contracts\HttpClient\HttpClientInterface; use App\Domain\Scraping\Domain\Contract\Service\ImageDownloaderInterface; +use App\Domain\Scraping\Domain\Event\PageScrapingProgressed; +use App\Domain\Scraping\Domain\Model\ScrapingProgress; +use App\Domain\Scraping\Domain\Model\ValueObject\DownloadResult; +use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory; +use Symfony\Component\Messenger\MessageBusInterface; +use Symfony\Contracts\HttpClient\HttpClientInterface; readonly class ImageDownloader implements ImageDownloaderInterface { public function __construct( - private HttpClientInterface $httpClient + private HttpClientInterface $httpClient, + private MessageBusInterface $eventBus ) { } @@ -22,4 +28,44 @@ readonly class ImageDownloader implements ImageDownloaderInterface file_put_contents($destination, $response->getContent()); } + + public function downloadBatch(array $urls, TempDirectory $tempDir, string $jobId): array + { + $results = []; + $totalUrls = count($urls); + + foreach ($urls as $index => $url) { + try { + $extension = pathinfo(parse_url($url, PHP_URL_PATH), PATHINFO_EXTENSION) ?: 'jpg'; + $destination = sprintf( + '%s/%03d.%s', + $tempDir->getPath(), + $index + 1, + $extension + ); + + $this->download($url, $destination); + $results[] = new DownloadResult($destination, $url); + + $this->dispatchProgressEvent($jobId, $index + 1, $totalUrls); + } catch (\Exception $e) { + // Log l'erreur mais continue avec les autres images + error_log("Failed to download image {$url}: " . $e->getMessage()); + } + } + + if (empty($results)) { + throw new \RuntimeException('Failed to download any images'); + } + + return $results; + } + + private function dispatchProgressEvent(string $jobId, int $currentPage, int $totalPages): void + { + $this->eventBus->dispatch(new PageScrapingProgressed( + $jobId, + new ScrapingProgress($currentPage, $totalPages) + )); + } } diff --git a/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php b/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php index 9ba3c6e..4b8c4e2 100644 --- a/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php +++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php @@ -8,6 +8,8 @@ use App\Domain\Scraping\Domain\Event\PageScrapingProgressed; use App\Domain\Scraping\Domain\Model\ScrapingJob; use App\Domain\Scraping\Domain\Model\ScrapingProgress; use App\Domain\Scraping\Domain\Model\Source; +use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest; +use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingResult; use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory; use Symfony\Component\Messenger\MessageBusInterface; use Ramsey\Uuid\Uuid; @@ -20,7 +22,7 @@ abstract class AbstractScraper implements ScraperInterface ) { } - abstract public function scrape(ScrapingJob $job): ScrapingJob; + abstract public function scrape(ScrapingRequest $request): ScrapingResult; abstract protected function scrapePages(ScrapingJob $job, Source $source): array; diff --git a/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php b/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php index 8bafdb6..79b6555 100644 --- a/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php +++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php @@ -15,74 +15,49 @@ use App\Domain\Scraping\Domain\Model\ValueObject\ChapterUrl; use Symfony\Component\DomCrawler\Crawler; use Symfony\Contracts\HttpClient\HttpClientInterface; use Symfony\Component\Messenger\MessageBusInterface; +use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface; +use App\Domain\Scraping\Domain\Event\PageScrapingProgressed; +use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest; +use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingResult; +use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory; +use App\Domain\Scraping\Domain\Model\ScrapingProgress; -class HtmlScraper extends AbstractScraper +class HtmlScraper implements ScraperInterface { public function __construct( - ImageDownloaderInterface $imageDownloader, - MessageBusInterface $eventBus, - private readonly CbzGeneratorInterface $cbzGenerator, - private readonly HttpClientInterface $httpClient, - private readonly SourceRepositoryInterface $sourceRepository, - private readonly MangaRepositoryInterface $mangaRepository, + private readonly ImageDownloaderInterface $imageDownloader, + private readonly MessageBusInterface $eventBus, + private readonly HttpClientInterface $httpClient ) { - parent::__construct($imageDownloader, $eventBus); } - public function scrape(ScrapingJob $job): ScrapingJob + public function scrape(ScrapingRequest $request): ScrapingResult { - $sourceConfig = $this->sourceRepository->getById($job->getSourceId()); - $tempDir = $this->createTempDirectory(); + $scrappingParameters = $request->getScrapingParameters(); try { - $pages = $this->scrapePages($job, $sourceConfig); + $pages = !$scrappingParameters['nextPageSelector'] + ? $this->scrapeVerticalReader($request) + : $this->scrapeHorizontalReader($request); - foreach ($pages as $index => $imageUrl) { - $pageNumber = new PageNumber($index + 1); - $extension = pathinfo(parse_url($imageUrl, PHP_URL_PATH), PATHINFO_EXTENSION); - $destination = sprintf( - '%s/%s.%s', - $tempDir->getPath(), - $pageNumber->getFormattedNumber(), - $extension - ); - - $this->downloadImage($imageUrl, $destination); - $job->addPage($pageNumber, new ImageUrl($imageUrl)); - - $this->dispatchProgressEvent($job, $index + 1, count($pages)); - } - - $cbzPath = $this->cbzGenerator->generate($job, $tempDir); - - $job->cbzPath = $cbzPath; - $job->complete(); - return $job; + return new ScrapingResult($pages, count($pages)); } catch (\Exception $e) { - $job->fail($e->getMessage()); - return $job; - } finally { - $this->cleanupTempFiles($tempDir); + throw new \RuntimeException('Scraping failed: ' . $e->getMessage(), 0, $e); } } - protected function scrapePages(ScrapingJob $job, Source $source): array + public function supports(string $sourceType): bool { - $scrappingParameters = $source->getScrappingParameters(); - - if (!$scrappingParameters['nextPageSelector']) { - return $this->scrapeVerticalReader($job, $source); - } - - return $this->scrapeHorizontalReader($job, $source); + return 'html' === $sourceType; } - private function scrapeVerticalReader(ScrapingJob $job, Source $sourceConfig): array + private function scrapeVerticalReader(ScrapingRequest $request): array { - $html = $this->fetchHtml($this->buildChapterUrl($job, $sourceConfig)); + $html = $this->fetchHtml($request->getChapterUrl()); $crawler = new Crawler($html); + $params = $request->getScrapingParameters(); - return $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector']) + return $crawler->filter($params['imageSelector']) ->each(function ($node) { return $this->cleanImageUrl( $node->attr('src') ?: $node->attr('data-src') @@ -90,21 +65,22 @@ class HtmlScraper extends AbstractScraper }); } - private function scrapeHorizontalReader(ScrapingJob $job, Source $sourceConfig): array + private function scrapeHorizontalReader(ScrapingRequest $request): array { $pages = []; - $currentUrl = $this->buildChapterUrl($job, $sourceConfig); + $currentUrl = $request->getChapterUrl(); + $params = $request->getScrapingParameters(); while ($currentUrl) { $html = $this->fetchHtml($currentUrl); $crawler = new Crawler($html); - $imageUrl = $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector']) - ->attr('src') ?: $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector']) + $imageUrl = $crawler->filter($params['imageSelector']) + ->attr('src') ?: $crawler->filter($params['imageSelector']) ->attr('data-src'); if (!preg_match('/^https?:\/\//', $imageUrl)) { - $urlComponents = parse_url($sourceConfig->getScrappingParameters()['chapterUrlFormat']); + $urlComponents = parse_url($params['chapterUrlFormat']); $scheme = $urlComponents['scheme']; $host = $urlComponents['host']; $imageUrl = $scheme.'://'.$host.'/'.ltrim($imageUrl, '/'); @@ -112,8 +88,10 @@ class HtmlScraper extends AbstractScraper $pages[] = $this->cleanImageUrl($imageUrl); - $nextLink = $crawler->filter($sourceConfig->getScrappingParameters()['nextPageSelector']); + $nextLink = $crawler->filter($params['nextPageSelector']); $currentUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null; + + $this->dispatchProgressEvent($request->getJobId(), count($pages), count($pages)); } return $pages; @@ -121,31 +99,30 @@ class HtmlScraper extends AbstractScraper private function fetchHtml(string $url): string { - $response = $this->httpClient->request('GET', $url); + try { + $response = $this->httpClient->request('GET', $url); + $statusCode = $response->getStatusCode(); - if ($response->getStatusCode() >= 400) { - throw new \RuntimeException('Failed to fetch page: ' . $url); + if ($statusCode >= 300 && $statusCode < 400 || $statusCode === 404) { + throw new \RuntimeException('Chapter Not Found at ' . $url); + } + + return $response->getContent(); + } catch (\Exception $e) { + throw new \RuntimeException('Failed to fetch HTML: ' . $e->getMessage(), 0, $e); } - - return $response->getContent(); } private function cleanImageUrl(string $url): string { - // Logique de nettoyage d'URL d'image - return $url; + return preg_replace('/[\x00-\x1F\x7F]/', '', trim($url)); } - - private function buildChapterUrl(ScrapingJob $job, Source $sourceConfig): string + private function dispatchProgressEvent(string $jobId, int $currentPage, int $totalPages): void { - $manga = $this->mangaRepository->getById($job->getMangaId()); - $chapterUrl = new ChapterUrl($sourceConfig->getScrappingParameters()['chapterUrlFormat'], $manga->getSlug(), $job->getChapterNumber()); - return $chapterUrl->getUrl(); - } - - public function supports(string $sourceType): bool - { - return 'html' === $sourceType; + $this->eventBus->dispatch(new PageScrapingProgressed( + $jobId, + new ScrapingProgress($currentPage, $totalPages) + )); } } diff --git a/tests/Domain/Scraping/Adapter/InMemoryCbzGenerator.php b/tests/Domain/Scraping/Adapter/InMemoryCbzGenerator.php index 1a842aa..1418e54 100644 --- a/tests/Domain/Scraping/Adapter/InMemoryCbzGenerator.php +++ b/tests/Domain/Scraping/Adapter/InMemoryCbzGenerator.php @@ -3,9 +3,8 @@ namespace App\Tests\Domain\Scraping\Adapter; use App\Domain\Scraping\Domain\Contract\Service\CbzGeneratorInterface; -use App\Domain\Scraping\Domain\Model\ScrapingJob; +use App\Domain\Scraping\Domain\Model\ValueObject\CbzGenerationRequest; use App\Domain\Scraping\Domain\Model\ValueObject\CbzPath; -use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory; readonly class InMemoryCbzGenerator implements CbzGeneratorInterface { @@ -13,8 +12,8 @@ readonly class InMemoryCbzGenerator implements CbzGeneratorInterface { } - public function generate(ScrapingJob $job, TempDirectory $tempDirectory): CbzPath + public function generate(CbzGenerationRequest $request): CbzPath { - return new CbzPath('test.cbz'); + return new CbzPath('/path/to/test.cbz'); } } diff --git a/tests/Domain/Scraping/Adapter/InMemoryImageDownloader.php b/tests/Domain/Scraping/Adapter/InMemoryImageDownloader.php new file mode 100644 index 0000000..3eec5ab --- /dev/null +++ b/tests/Domain/Scraping/Adapter/InMemoryImageDownloader.php @@ -0,0 +1,48 @@ +shouldThrowException) { + throw $this->shouldThrowException; + } + + $this->downloadedFiles[$url] = $destination; + } + + public function downloadBatch(array $urls, TempDirectory $tempDir, string $jobId): array + { + if ($this->shouldThrowException) { + throw $this->shouldThrowException; + } + + $results = []; + foreach ($urls as $index => $url) { + $destination = sprintf('%s/%03d.jpg', $tempDir->getPath(), $index + 1); + $this->download($url, $destination); + $results[] = new DownloadResult($destination, $url); + } + + return $results; + } + + public function simulateError(\Exception $exception): void + { + $this->shouldThrowException = $exception; + } + + public function getDownloadedFiles(): array + { + return $this->downloadedFiles; + } +} diff --git a/tests/Domain/Scraping/Adapter/InMemoryMangaRepository.php b/tests/Domain/Scraping/Adapter/InMemoryMangaRepository.php new file mode 100644 index 0000000..95384fc --- /dev/null +++ b/tests/Domain/Scraping/Adapter/InMemoryMangaRepository.php @@ -0,0 +1,43 @@ +mangas['test-manga'] = new Manga( + 'test-manga', + 'Test Manga', + 'test-manga', + '2024', + 'Test Author', + 'A test manga description' + ); + } + + public function getById(string $id): Manga + { + if (!isset($this->mangas[$id])) { + throw new \RuntimeException('Manga not found'); + } + + return $this->mangas[$id]; + } + + public function save(Manga $manga): void + { + $this->mangas[$manga->getId()] = $manga; + } + + public function clear(): void + { + $this->mangas = []; + } +} diff --git a/tests/Domain/Scraping/Adapter/InMemoryScraperAdapter.php b/tests/Domain/Scraping/Adapter/InMemoryScraperAdapter.php index bdaf2f7..576d391 100644 --- a/tests/Domain/Scraping/Adapter/InMemoryScraperAdapter.php +++ b/tests/Domain/Scraping/Adapter/InMemoryScraperAdapter.php @@ -3,24 +3,23 @@ namespace App\Tests\Domain\Scraping\Adapter; use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface; -use App\Domain\Scraping\Domain\Model\ScrapingJob; -use App\Domain\Scraping\Domain\Model\ValueObject\CbzPath; -use Ramsey\Uuid\Uuid; +use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest; +use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingResult; class InMemoryScraperAdapter implements ScraperInterface { private ?\Exception $shouldThrowException = null; - public function scrape(ScrapingJob $job): ScrapingJob + public function scrape(ScrapingRequest $request): ScrapingResult { if ($this->shouldThrowException) { - $job->fail($this->shouldThrowException->getMessage()); - return $job; + throw $this->shouldThrowException; } - $job->complete(); - $job->cbzPath = new CbzPath('/path/to/test.cbz'); - return $job; + return new ScrapingResult( + ['http://example.com/image1.jpg', 'http://example.com/image2.jpg'], + 2 + ); } public function simulateError(\Exception $exception): void diff --git a/tests/Domain/Scraping/Adapter/InMemorySourceRepository.php b/tests/Domain/Scraping/Adapter/InMemorySourceRepository.php new file mode 100644 index 0000000..cf55dae --- /dev/null +++ b/tests/Domain/Scraping/Adapter/InMemorySourceRepository.php @@ -0,0 +1,51 @@ +sources['test-source'] = new Source( + new SourceId('test-source'), + 'Test Source', + 'A test source', + 'https://example.com', + [ + 'imageSelector' => 'img.manga-image', + 'nextPageSelector' => null, + 'chapterUrlFormat' => 'https://example.com/manga/{slug}/chapter-{chapterNumber}' + ], + true, + new DateTimeImmutable(), + new DateTimeImmutable() + ); + } + + public function getById(string $id): Source + { + if (!isset($this->sources[$id])) { + throw new \RuntimeException('Source not found'); + } + + return $this->sources[$id]; + } + + public function save(Source $source): void + { + $this->sources[$source->getId()->getValue()] = $source; + } + + public function clear(): void + { + $this->sources = []; + } +} diff --git a/tests/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandlerTest.php b/tests/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandlerTest.php index a9e2461..041e993 100644 --- a/tests/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandlerTest.php +++ b/tests/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandlerTest.php @@ -10,38 +10,54 @@ use App\Domain\Scraping\Domain\Event\ChapterScrapingStarted; use App\Domain\Scraping\Domain\Model\Chapter; use App\Domain\Scraping\Domain\Model\ScrapingStatus; use App\Tests\Domain\Scraping\Adapter\InMemoryChapterRepository; +use App\Tests\Domain\Scraping\Adapter\InMemoryCbzGenerator; use App\Tests\Domain\Scraping\Adapter\InMemoryEventBus; +use App\Tests\Domain\Scraping\Adapter\InMemoryImageDownloader; +use App\Tests\Domain\Scraping\Adapter\InMemoryMangaRepository; use App\Tests\Domain\Scraping\Adapter\InMemoryScraperAdapter; use App\Tests\Domain\Scraping\Adapter\InMemoryScrapingJobRepository; +use App\Tests\Domain\Scraping\Adapter\InMemorySourceRepository; use PHPUnit\Framework\TestCase; class ScrapeChapterHandlerTest extends TestCase { private InMemoryScraperAdapter $scraper; + private InMemoryImageDownloader $imageDownloader; + private InMemoryCbzGenerator $cbzGenerator; private InMemoryScrapingJobRepository $scrapingJobRepository; private InMemoryChapterRepository $chapterRepository; + private InMemoryMangaRepository $mangaRepository; + private InMemorySourceRepository $sourceRepository; private InMemoryEventBus $eventBus; private ScrapeChapterHandler $handler; protected function setUp(): void { $this->scraper = new InMemoryScraperAdapter(); + $this->imageDownloader = new InMemoryImageDownloader(); + $this->cbzGenerator = new InMemoryCbzGenerator('/test/project/dir'); $this->scrapingJobRepository = new InMemoryScrapingJobRepository(); $this->chapterRepository = new InMemoryChapterRepository(); + $this->mangaRepository = new InMemoryMangaRepository(); + $this->sourceRepository = new InMemorySourceRepository(); + $this->eventBus = new InMemoryEventBus(); $this->chapterRepository->save(new Chapter( id: '1', - mangaId: '1', - chapterNumber: '2', + mangaId: 'test-manga', + chapterNumber: 2, volumeNumber: 1, cbzPath: null, )); - $this->eventBus = new InMemoryEventBus(); $this->handler = new ScrapeChapterHandler( $this->scraper, + $this->imageDownloader, + $this->cbzGenerator, $this->scrapingJobRepository, $this->chapterRepository, + $this->mangaRepository, + $this->sourceRepository, $this->eventBus ); } @@ -49,9 +65,9 @@ class ScrapeChapterHandlerTest extends TestCase public function testHandleSuccessfully(): void { $command = new ScrapeChapter( - mangaId: '1', + mangaId: 'test-manga', chapterNumber: '2', - sourceId: '3', + sourceId: 'test-source' ); $this->handler->handle($command); @@ -66,42 +82,45 @@ class ScrapeChapterHandlerTest extends TestCase $this->assertInstanceOf(ChapterScraped::class, $dispatchedMessages[1]); $this->assertEquals($job->getId(), $dispatchedMessages[0]->getJobId()); - $chapter = $this->chapterRepository->getByMangaIdAndChapterNumber('1', '2'); + $chapter = $this->chapterRepository->getByMangaIdAndChapterNumber('test-manga', 2); $this->assertNotNull($chapter->cbzPath); } public function testHandleThrowsException(): void { $command = new ScrapeChapter( - mangaId: '1', + mangaId: 'test-manga', chapterNumber: '2', - sourceId: '3', + sourceId: 'test-source' ); $exception = new \Exception('Scraping failed'); $this->scraper->simulateError($exception); + $this->expectException(\Exception::class); + $this->expectExceptionMessage('Scraping failed'); + $this->handler->handle($command); $dispatchedMessages = $this->eventBus->getDispatchedMessages(); - $this->assertCount(2, $dispatchedMessages); $this->assertInstanceOf(ChapterScrapingStarted::class, $dispatchedMessages[0]); $this->assertInstanceOf(ChapterScrapingFailed::class, $dispatchedMessages[1]); - $this->assertEquals('1', $dispatchedMessages[1]->getMangaId()); + $this->assertEquals('test-manga', $dispatchedMessages[1]->getMangaId()); $this->assertEquals('2', $dispatchedMessages[1]->getChapterNumber()); $this->assertEquals('Scraping failed', $dispatchedMessages[1]->getReason()); $jobs = $this->scrapingJobRepository->getJobs(); - $this->assertCount(1, $jobs); $this->assertEquals(ScrapingStatus::FAILED, $jobs[0]->status); $this->assertEquals('Scraping failed', $jobs[0]->failureReason); } - public function tearDown(): void + protected function tearDown(): void { $this->scrapingJobRepository->clear(); $this->chapterRepository->clear(); + $this->mangaRepository->clear(); + $this->sourceRepository->clear(); } }