feat: refonte du gestionnaire de chapitres pour intégrer la génération de fichiers CBZ, le téléchargement d'images en lot et la gestion des requêtes de scraping, avec mise à jour des interfaces et des modèles associés

2025-03-28 20:42:21 +01:00
parent cdee6f77fc
commit d7088b14c2
22 changed files with 620 additions and 195 deletions
--- a/src/Domain/Scraping/Infrastructure/Persistence/DoctrineScrapingJobRepository.php
+++ b/src/Domain/Scraping/Infrastructure/Persistence/DoctrineScrapingJobRepository.php
@@ -17,6 +17,7 @@ readonly class DoctrineScrapingJobRepository implements ScrapingJobRepositoryInt

    public function save(ScrapingJob $job): void
    {
+        /** @var ScrapingJobEntity $existingEntity */
        $existingEntity = $this->entityManager->getRepository(ScrapingJobEntity::class)->find($job->getId());

        if ($existingEntity) {
--- a/src/Domain/Scraping/Infrastructure/Persistence/Entity/ScrapingJobEntity.php
+++ b/src/Domain/Scraping/Infrastructure/Persistence/Entity/ScrapingJobEntity.php
@@ -86,7 +86,7 @@ class ScrapingJobEntity
        $this->pages = $pages;
    }

-    public function setCompletedAt(\DateTimeImmutable $completedAt): void
+    public function setCompletedAt(?\DateTimeImmutable $completedAt): void
    {
        $this->completedAt = $completedAt;
    }
--- a/src/Domain/Scraping/Infrastructure/Service/CbzGenerator.php
+++ b/src/Domain/Scraping/Infrastructure/Service/CbzGenerator.php
@@ -2,103 +2,87 @@

 namespace App\Domain\Scraping\Infrastructure\Service;

-use App\Domain\Scraping\Domain\Contract\Repository\ChapterRepositoryInterface;
-use App\Domain\Scraping\Domain\Contract\Repository\MangaRepositoryInterface;
-use App\Domain\Scraping\Domain\Model\Manga;
-use App\Domain\Scraping\Domain\Model\ScrapingJob;
-use App\Domain\Scraping\Domain\Model\ValueObject\CbzPath;
 use App\Domain\Scraping\Domain\Contract\Service\CbzGeneratorInterface;
-use App\Domain\Scraping\Domain\Model\Chapter;
-use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory;
-use App\Domain\Scraping\Domain\Exception\CbzGenerationException;
-use Exception;
+use App\Domain\Scraping\Domain\Model\ValueObject\CbzGenerationRequest;
+use App\Domain\Scraping\Domain\Model\ValueObject\CbzPath;

 readonly class CbzGenerator implements CbzGeneratorInterface
 {
    public function __construct(
-        private string                     $projectDir,
-        private MangaRepositoryInterface   $mangaRepository,
-        private ChapterRepositoryInterface $chapterRepository,
-    ) {}
+        private string $projectDir
+    ) {
+    }

-    /**
-     * @throws Exception
-     */
-    public function generate(ScrapingJob $job, TempDirectory $tempDirectory): CbzPath
+    public function generate(CbzGenerationRequest $request): CbzPath
    {
-        $cbzPath = $this->generateCbzPath($job);
-        $this->createCbzArchive($tempDirectory->getPath(), $cbzPath);
-
+        $cbzPath = $this->generateCbzPath($request);
+        $this->createCbzArchive($request->getFiles(), $cbzPath);
        return new CbzPath($cbzPath);
    }

-    private function generateCbzPath(ScrapingJob $job): string
+    private function generateCbzPath(CbzGenerationRequest $request): string
    {
-        $manga = $this->mangaRepository->getById($job->getMangaId());
-        $chapter = $this->chapterRepository->getByMangaIdAndChapterNumber($job->getMangaId(), $job->getChapterNumber());
-
-        $baseDir = sprintf(
-            '%s/public/cbz/%s/%s',
-            $this->projectDir,
-            $manga->getTitle() . ' (' . $manga->getPublicationYear() . ')',
-            sprintf('volume_%02d', $chapter->volumeNumber)
+        $mangaDir = $this->createMangaDirectory(
+            $this->slugify($request->getMangaTitle()),
+            $request->getPublicationYear()
        );

-        try {
-            if (!is_dir($baseDir)) {
-                if (!mkdir($baseDir, 0755, true)) {
-                    throw new CbzGenerationException();
-                }
-            }
-        } catch (Exception $e) {
-            throw CbzGenerationException::unableToCreateDirectory($baseDir);
-        }
-
-        $chapterNumber = $job->getChapterNumber();
-        $formattedNumber = $chapterNumber == floor($chapterNumber)
-            ? sprintf('%02d', (int)$chapterNumber)
-            : sprintf('%04.1f', $chapterNumber);
+        $volumeDir = $this->createVolumeDirectory($mangaDir, $request->getVolumeNumber());

        return sprintf(
-            '%s/%s_vol%s_ch%s.cbz',
-            $baseDir,
-            strtolower($manga->getTitle()),
-            sprintf('%02d', $chapter->volumeNumber),
-            $formattedNumber
+            '%s/%s_vol%d_ch%s.cbz',
+            $volumeDir,
+            $this->slugify($request->getMangaTitle()),
+            $request->getVolumeNumber(),
+            $request->getChapterNumber()
        );
    }

-    /**
-     * @throws Exception
-     */
-    private function createCbzArchive(string $sourceDirectory, string $destinationPath): void
+    /**
+     * Packs the given files into a CBZ (zip) archive written at $cbzPath.
+     *
+     * The archive is opened with CREATE | OVERWRITE, and every file is stored
+     * under its base name only (no directory structure inside the archive).
+     *
+     * @param array  $files   Paths of the files to add.
+     * @param string $cbzPath Destination path of the archive.
+     *
+     * @throws \RuntimeException If the archive cannot be opened or closed, or
+     *                           if a file is missing or cannot be added.
+     */
+    private function createCbzArchive(array $files, string $cbzPath): void
     {
         $zip = new \ZipArchive();
-
-        if ($zip->open($destinationPath, \ZipArchive::CREATE) !== true) {
-            throw CbzGenerationException::unableToCreateCbz($destinationPath);
+        if ($zip->open($cbzPath, \ZipArchive::CREATE | \ZipArchive::OVERWRITE) !== true) {
+            throw new \RuntimeException('Failed to create CBZ archive');
         }

-        try {
-            $files = new \RecursiveIteratorIterator(
-                new \RecursiveDirectoryIterator($sourceDirectory),
-                \RecursiveIteratorIterator::LEAVES_ONLY
-            );
-
-            foreach ($files as $file) {
-                if (!$file->isDir()) {
-                    $filePath = $file->getRealPath();
-                    $relativePath = substr($filePath, strlen($sourceDirectory) + 1);
-                    if (!$zip->addFile($filePath, $relativePath)) {
-                        throw CbzGenerationException::unableToAddFileToArchive($filePath);
-                    }
-                }
+        foreach ($files as $file) {
+            if (!file_exists($file)) {
+                throw new \RuntimeException("File not found: $file");
             }
-        } catch (Exception $e) {
-            $zip->close();
-            throw $e;
+
+            // addFile() reports failure through its return value; ignoring it
+            // would silently produce an archive with missing pages.
+            if (!$zip->addFile($file, basename($file))) {
+                throw new \RuntimeException("Failed to add file to CBZ archive: $file");
+            }
         }

-        $zip->close();
+        if (!$zip->close()) {
+            throw new \RuntimeException('Failed to close CBZ archive');
+        }
+    }
+
+    /**
+     * Ensures the per-manga output directory exists and returns its path.
+     *
+     * The directory lives under "<projectDir>/public/cbz" and is named
+     * "<Slug> (<year>)", with the first letter of the slug upper-cased.
+     *
+     * @param string $mangaSlug       Slug of the manga title.
+     * @param string $publicationYear Publication year appended in parentheses.
+     *
+     * @throws \RuntimeException If the directory does not exist and cannot be created.
+     */
+    private function createMangaDirectory(string $mangaSlug, string $publicationYear): string
+    {
+        $dir = sprintf('%s/public/cbz/%s', $this->projectDir, ucfirst($mangaSlug) . ' (' . $publicationYear . ')');
+
+        // The trailing is_dir() covers the case where another process created
+        // the directory between the first check and mkdir().
+        if (!is_dir($dir) && !mkdir($dir, 0755, true) && !is_dir($dir)) {
+            throw new \RuntimeException("Failed to create directory: $dir");
+        }
+
+        return $dir;
+    }
+
+    /**
+     * Ensures the "volume_NN" sub-directory of a manga directory exists and returns its path.
+     *
+     * @param string $mangaDir     Path of the parent manga directory.
+     * @param int    $volumeNumber Volume number, zero-padded to two digits in the directory name.
+     *
+     * @throws \RuntimeException If the directory does not exist and cannot be created.
+     */
+    private function createVolumeDirectory(string $mangaDir, int $volumeNumber): string
+    {
+        $dir = sprintf('%s/volume_%02d', $mangaDir, $volumeNumber);
+
+        // The trailing is_dir() covers the case where another process created
+        // the directory between the first check and mkdir().
+        if (!is_dir($dir) && !mkdir($dir, 0755, true) && !is_dir($dir)) {
+            throw new \RuntimeException("Failed to create directory: $dir");
+        }
+
+        return $dir;
+    }
+
+    /**
+     * Converts free text into a lowercase slug made of word characters and single hyphens.
+     *
+     * Runs of characters that are neither letters nor digits become a hyphen,
+     * the text is transliterated to ASCII, and leading/trailing hyphens are
+     * removed. Returns "n-a" when nothing is left.
+     */
+    private function slugify(string $text): string
+    {
+        // preg_replace() returns null and iconv() returns false on failure;
+        // in that case keep the previous string instead of passing null/false
+        // on to the next step.
+        $slug = preg_replace('~[^\pL\d]+~u', '-', $text) ?? $text;
+
+        $ascii = iconv('utf-8', 'us-ascii//TRANSLIT', $slug);
+        if ($ascii !== false) {
+            $slug = $ascii;
+        }
+
+        // Keep only hyphens and word characters.
+        $slug = preg_replace('~[^-\w]+~', '', $slug) ?? '';
+        $slug = trim($slug, '-');
+        $slug = preg_replace('~-+~', '-', $slug) ?? $slug;
+        $slug = strtolower($slug);
+
+        // Compare against '' explicitly: "0" is a usable slug but is falsy in PHP.
+        return $slug !== '' ? $slug : 'n-a';
     }
 }
--- a/src/Domain/Scraping/Infrastructure/Service/ImageDownloader.php
+++ b/src/Domain/Scraping/Infrastructure/Service/ImageDownloader.php
@@ -2,13 +2,19 @@

 namespace App\Domain\Scraping\Infrastructure\Service;

-use Symfony\Contracts\HttpClient\HttpClientInterface;
 use App\Domain\Scraping\Domain\Contract\Service\ImageDownloaderInterface;
+use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
+use App\Domain\Scraping\Domain\Model\ScrapingProgress;
+use App\Domain\Scraping\Domain\Model\ValueObject\DownloadResult;
+use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory;
+use Symfony\Component\Messenger\MessageBusInterface;
+use Symfony\Contracts\HttpClient\HttpClientInterface;

 readonly class ImageDownloader implements ImageDownloaderInterface
 {
    public function __construct(
-        private HttpClientInterface $httpClient
+        private HttpClientInterface $httpClient,
+        private MessageBusInterface $eventBus
    ) {
    }

@@ -22,4 +28,44 @@ readonly class ImageDownloader implements ImageDownloaderInterface

        file_put_contents($destination, $response->getContent());
    }
+
+    /**
+     * Downloads a list of image URLs into a temp directory, reporting progress after each success.
+     *
+     * Each file is named after the 1-based position of its URL in $urls
+     * (zero-padded to three digits), with the extension taken from the URL
+     * path, or "jpg" when the path has none. A failed download is logged and
+     * skipped so the remaining images are still fetched.
+     *
+     * @param array         $urls    Image URLs to download.
+     * @param TempDirectory $tempDir Directory receiving the downloaded files.
+     * @param string        $jobId   Job identifier attached to the progress events.
+     *
+     * @return DownloadResult[] One entry per successfully downloaded image.
+     *
+     * @throws \RuntimeException If no image could be downloaded.
+     */
+    public function downloadBatch(array $urls, TempDirectory $tempDir, string $jobId): array
+    {
+        $results = [];
+        $totalUrls = count($urls);
+
+        // Re-index so the page number always comes from the position in the
+        // list, even when $urls has string or non-contiguous keys.
+        foreach (array_values($urls) as $index => $url) {
+            try {
+                // parse_url() returns null or false when the URL has no usable
+                // path; cast so pathinfo() always receives a string.
+                $path = (string) parse_url($url, PHP_URL_PATH);
+                $extension = pathinfo($path, PATHINFO_EXTENSION) ?: 'jpg';
+                $destination = sprintf(
+                    '%s/%03d.%s',
+                    $tempDir->getPath(),
+                    $index + 1,
+                    $extension
+                );
+
+                $this->download($url, $destination);
+                $results[] = new DownloadResult($destination, $url);
+
+                $this->dispatchProgressEvent($jobId, $index + 1, $totalUrls);
+            } catch (\Exception $e) {
+                // Log the error but carry on with the remaining images.
+                error_log("Failed to download image {$url}: " . $e->getMessage());
+            }
+        }
+
+        if (empty($results)) {
+            throw new \RuntimeException('Failed to download any images');
+        }
+
+        return $results;
+    }
+
+    /**
+     * Dispatches a PageScrapingProgressed event for the given job on the event bus.
+     *
+     * @param string $jobId       Job identifier carried by the event.
+     * @param int    $currentPage First value passed to ScrapingProgress.
+     * @param int    $totalPages  Second value passed to ScrapingProgress.
+     */
+    private function dispatchProgressEvent(string $jobId, int $currentPage, int $totalPages): void
+    {
+        $progress = new ScrapingProgress($currentPage, $totalPages);
+        $event = new PageScrapingProgressed($jobId, $progress);
+
+        $this->eventBus->dispatch($event);
+    }
 }
--- a/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php
+++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php
@@ -8,6 +8,8 @@ use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
 use App\Domain\Scraping\Domain\Model\ScrapingJob;
 use App\Domain\Scraping\Domain\Model\ScrapingProgress;
 use App\Domain\Scraping\Domain\Model\Source;
+use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest;
+use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingResult;
 use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory;
 use Symfony\Component\Messenger\MessageBusInterface;
 use Ramsey\Uuid\Uuid;
@@ -20,7 +22,7 @@ abstract class AbstractScraper implements ScraperInterface
    ) {
    }

-    abstract public function scrape(ScrapingJob $job): ScrapingJob;
+    abstract public function scrape(ScrapingRequest $request): ScrapingResult;

    abstract protected function scrapePages(ScrapingJob $job, Source $source): array;

--- a/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php
+++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php
@@ -15,74 +15,49 @@ use App\Domain\Scraping\Domain\Model\ValueObject\ChapterUrl;
 use Symfony\Component\DomCrawler\Crawler;
 use Symfony\Contracts\HttpClient\HttpClientInterface;
 use Symfony\Component\Messenger\MessageBusInterface;
+use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
+use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
+use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest;
+use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingResult;
+use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory;
+use App\Domain\Scraping\Domain\Model\ScrapingProgress;

-class HtmlScraper extends AbstractScraper
+class HtmlScraper implements ScraperInterface
 {
    public function __construct(
-        ImageDownloaderInterface                   $imageDownloader,
-        MessageBusInterface                        $eventBus,
-        private readonly CbzGeneratorInterface     $cbzGenerator,
-        private readonly HttpClientInterface       $httpClient,
-        private readonly SourceRepositoryInterface $sourceRepository,
-        private readonly MangaRepositoryInterface  $mangaRepository,
+        private readonly ImageDownloaderInterface $imageDownloader,
+        private readonly MessageBusInterface $eventBus,
+        private readonly HttpClientInterface $httpClient
    ) {
-        parent::__construct($imageDownloader, $eventBus);
    }

-    public function scrape(ScrapingJob $job): ScrapingJob
+    /**
+     * Collects the image URLs of a chapter and returns them with their count.
+     *
+     * The vertical reader is used when the request's scraping parameters
+     * have no (or an empty) "nextPageSelector"; otherwise the horizontal
+     * reader is used.
+     *
+     * @throws \RuntimeException Wrapping any exception raised while scraping.
+     */
+    public function scrape(ScrapingRequest $request): ScrapingResult
     {
-        $sourceConfig = $this->sourceRepository->getById($job->getSourceId());
-        $tempDir = $this->createTempDirectory();
+        $scrapingParameters = $request->getScrapingParameters();

         try {
-            $pages = $this->scrapePages($job, $sourceConfig);
+            // empty() has the same truthiness as the negation it replaces but
+            // does not raise an "undefined array key" warning when the
+            // selector is not configured at all.
+            $pages = empty($scrapingParameters['nextPageSelector'])
+                ? $this->scrapeVerticalReader($request)
+                : $this->scrapeHorizontalReader($request);

-            foreach ($pages as $index => $imageUrl) {
-                $pageNumber = new PageNumber($index + 1);
-                $extension = pathinfo(parse_url($imageUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
-                $destination = sprintf(
-                    '%s/%s.%s',
-                    $tempDir->getPath(),
-                    $pageNumber->getFormattedNumber(),
-                    $extension
-                );
-
-                $this->downloadImage($imageUrl, $destination);
-                $job->addPage($pageNumber, new ImageUrl($imageUrl));
-
-                $this->dispatchProgressEvent($job, $index + 1, count($pages));
-            }
-
-            $cbzPath = $this->cbzGenerator->generate($job, $tempDir);
-
-            $job->cbzPath = $cbzPath;
-            $job->complete();
-            return $job;
+            return new ScrapingResult($pages, count($pages));
         } catch (\Exception $e) {
-            $job->fail($e->getMessage());
-            return $job;
-        } finally {
-            $this->cleanupTempFiles($tempDir);
+            throw new \RuntimeException('Scraping failed: ' . $e->getMessage(), 0, $e);
         }
     }

-    protected function scrapePages(ScrapingJob $job, Source $source): array
+    public function supports(string $sourceType): bool
    {
-        $scrappingParameters = $source->getScrappingParameters();
-
-        if (!$scrappingParameters['nextPageSelector']) {
-            return $this->scrapeVerticalReader($job, $source);
-        }
-
-        return $this->scrapeHorizontalReader($job, $source);
+        return 'html' === $sourceType;
    }

-    private function scrapeVerticalReader(ScrapingJob $job, Source $sourceConfig): array
+    private function scrapeVerticalReader(ScrapingRequest $request): array
    {
-        $html = $this->fetchHtml($this->buildChapterUrl($job, $sourceConfig));
+        $html = $this->fetchHtml($request->getChapterUrl());
        $crawler = new Crawler($html);
+        $params = $request->getScrapingParameters();

-        return $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector'])
+        return $crawler->filter($params['imageSelector'])
            ->each(function ($node) {
                return $this->cleanImageUrl(
                    $node->attr('src') ?: $node->attr('data-src')
@@ -90,21 +65,22 @@ class HtmlScraper extends AbstractScraper
            });
    }

-    private function scrapeHorizontalReader(ScrapingJob $job, Source $sourceConfig): array
+    private function scrapeHorizontalReader(ScrapingRequest $request): array
    {
        $pages = [];
-        $currentUrl = $this->buildChapterUrl($job, $sourceConfig);
+        $currentUrl = $request->getChapterUrl();
+        $params = $request->getScrapingParameters();

        while ($currentUrl) {
            $html = $this->fetchHtml($currentUrl);
            $crawler = new Crawler($html);

-            $imageUrl = $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector'])
-                ->attr('src') ?: $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector'])
+            $imageUrl = $crawler->filter($params['imageSelector'])
+                ->attr('src') ?: $crawler->filter($params['imageSelector'])
                ->attr('data-src');

            if (!preg_match('/^https?:\/\//', $imageUrl)) {
-                $urlComponents = parse_url($sourceConfig->getScrappingParameters()['chapterUrlFormat']);
+                $urlComponents = parse_url($params['chapterUrlFormat']);
                $scheme = $urlComponents['scheme'];
                $host = $urlComponents['host'];
                $imageUrl = $scheme.'://'.$host.'/'.ltrim($imageUrl, '/');
@@ -112,8 +88,10 @@ class HtmlScraper extends AbstractScraper

            $pages[] = $this->cleanImageUrl($imageUrl);

-            $nextLink = $crawler->filter($sourceConfig->getScrappingParameters()['nextPageSelector']);
+            $nextLink = $crawler->filter($params['nextPageSelector']);
            $currentUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;
+
+            $this->dispatchProgressEvent($request->getJobId(), count($pages), count($pages));
        }

        return $pages;
@@ -121,31 +99,30 @@ class HtmlScraper extends AbstractScraper

    private function fetchHtml(string $url): string
    {
-        $response = $this->httpClient->request('GET', $url);
+        // Fetches $url with a GET request and returns the response body.
+        // Throws \RuntimeException on a 3xx/404 status or on any HTTP client failure.
+        try {
+            $response = $this->httpClient->request('GET', $url);
+            $statusCode = $response->getStatusCode();

-        if ($response->getStatusCode() >= 400) {
-            throw new \RuntimeException('Failed to fetch page: ' . $url);
+            $isRedirect = $statusCode >= 300 && $statusCode < 400;
+            if ($isRedirect || $statusCode === 404) {
+                throw new \RuntimeException('Chapter Not Found at ' . $url);
+            }
+
+            return $response->getContent();
+        } catch (\Symfony\Contracts\HttpClient\Exception\ExceptionInterface $e) {
+            // Only HTTP client failures are wrapped here, so the
+            // "Chapter Not Found" error raised above reaches the caller as is
+            // instead of being re-labelled as a fetch failure.
+            throw new \RuntimeException('Failed to fetch HTML: ' . $e->getMessage(), 0, $e);
        }
-
-        return $response->getContent();
    }

    private function cleanImageUrl(string $url): string
    {
-        // Logique de nettoyage d'URL d'image
-        return $url;
+        // Drop surrounding whitespace first, then strip the ASCII control
+        // characters (0x00-0x1F and DEL) from what remains.
+        $trimmed = trim($url);
+
+        return preg_replace('/[\x00-\x1F\x7F]/', '', $trimmed);
    }

-
-    private function buildChapterUrl(ScrapingJob $job, Source $sourceConfig): string
+    /**
+     * Dispatches a PageScrapingProgressed event for the given job on the event bus.
+     *
+     * @param string $jobId       Job identifier carried by the event.
+     * @param int    $currentPage First value passed to ScrapingProgress.
+     * @param int    $totalPages  Second value passed to ScrapingProgress.
+     */
+    private function dispatchProgressEvent(string $jobId, int $currentPage, int $totalPages): void
     {
-        $manga = $this->mangaRepository->getById($job->getMangaId());
-        $chapterUrl = new ChapterUrl($sourceConfig->getScrappingParameters()['chapterUrlFormat'], $manga->getSlug(), $job->getChapterNumber());
-        return $chapterUrl->getUrl();
-    }
-
-    public function supports(string $sourceType): bool
-    {
-        return 'html' === $sourceType;
+        $progress = new ScrapingProgress($currentPage, $totalPages);
+        $event = new PageScrapingProgressed($jobId, $progress);
+
+        $this->eventBus->dispatch($event);
     }
 }