feat: first unit tests for ScrapeChapterHandler.php

2025-02-03 10:38:53 +01:00
parent 21fcdd1084
commit 89570ad951
31 changed files with 1105 additions and 291 deletions
--- a/src/Domain/Scraping/Infrastructure/Persistence/DoctrineMangaRepository.php
+++ b/src/Domain/Scraping/Infrastructure/Persistence/DoctrineMangaRepository.php
@@ -0,0 +1,22 @@
+<?php
+
+namespace App\Domain\Scraping\Infrastructure\Persistence;
+
+use App\Domain\Scraping\Domain\Contract\Repository\MangaRepositoryInterface;
+use App\Domain\Scraping\Domain\Model\Manga;
+use App\Domain\Scraping\Infrastructure\Persistence\Entity\MangaEntity;
+use Doctrine\ORM\EntityManagerInterface;
+
+/**
+ * Doctrine-backed implementation of MangaRepositoryInterface.
+ *
+ * Loads MangaEntity rows through the entity manager and converts them to
+ * domain Manga objects.
+ */
+class DoctrineMangaRepository implements MangaRepositoryInterface
+{
+    public function __construct(
+        private readonly EntityManagerInterface $entityManager
+    ) {}
+
+    /**
+     * Looks up a manga by its identifier.
+     *
+     * @param string $id Primary key of the MangaEntity to load.
+     *
+     * @return Manga|null The domain object, or null when no entity has this id.
+     */
+    public function getById(string $id): ?Manga
+    {
+        $repository = $this->entityManager->getRepository(MangaEntity::class);
+
+        // find() yields the entity or null; the nullsafe call forwards the null.
+        return $repository->find($id)?->toDomain();
+    }
+}
--- a/src/Domain/Scraping/Infrastructure/Persistence/DoctrineSourceRepository.php
+++ b/src/Domain/Scraping/Infrastructure/Persistence/DoctrineSourceRepository.php
@@ -0,0 +1,26 @@
+<?php
+
+namespace App\Domain\Scraping\Infrastructure\Persistence;
+
+use App\Domain\Scraping\Domain\Model\Source;
+use App\Domain\Scraping\Domain\Repository\SourceRepositoryInterface;
+use Doctrine\ORM\EntityManagerInterface;
+use App\Domain\Scraping\Infrastructure\Persistence\Entity\SourceEntity as SourceEntityEntity;
+
+/**
+ * Doctrine-backed implementation of SourceRepositoryInterface.
+ *
+ * Loads SourceEntity rows through the entity manager and converts them to
+ * domain Source objects.
+ */
+class DoctrineSourceRepository implements SourceRepositoryInterface
+{
+    public function __construct(
+        private readonly EntityManagerInterface $entityManager
+    ) {}
+
+    /**
+     * Looks up a source by its identifier.
+     *
+     * @param string $id Primary key of the SourceEntity to load.
+     *
+     * @return Source|null The domain object, or null when no entity has this id.
+     */
+    public function getById(string $id): ?Source
+    {
+        $found = $this->entityManager
+            ->getRepository(SourceEntityEntity::class)
+            ->find($id);
+
+        return $found ? $found->toDomain() : null;
+    }
+}
--- a/src/Domain/Scraping/Infrastructure/Persistence/Entity/MangaEntity.php
+++ b/src/Domain/Scraping/Infrastructure/Persistence/Entity/MangaEntity.php
@@ -0,0 +1,75 @@
+<?php
+
+namespace App\Domain\Scraping\Infrastructure\Persistence\Entity;
+
+use App\Domain\Scraping\Domain\Model\Manga;
+use Doctrine\DBAL\Types\Types;
+use Doctrine\ORM\Mapping as ORM;
+
+/**
+ * Doctrine mapping of a manga, stored in the `mangas` table.
+ *
+ * Only id, title, slug, description and author travel to and from the
+ * domain Manga model; the remaining columns exist on the entity alone.
+ */
+#[ORM\Entity]
+#[ORM\Table(name: 'mangas')]
+class MangaEntity
+{
+    #[ORM\Id]
+    #[ORM\Column(type: 'string', length: 36)]
+    private string $id;
+
+    #[ORM\Column(length: 255)]
+    private string $title;
+
+    #[ORM\Column(length: 255, unique: true)]
+    private string $slug;
+
+    #[ORM\Column(length: 255, nullable: true)]
+    private ?string $imageUrl = null;
+
+    #[ORM\Column(nullable: true)]
+    private ?int $publicationYear = null;
+
+    #[ORM\Column(type: Types::TEXT, nullable: true)]
+    private ?string $description = null;
+
+    // NOTE(review): the DBAL "array" type is deprecated in DBAL 3 and gone in
+    // DBAL 4 — consider migrating this column to JSON.
+    #[ORM\Column(type: Types::ARRAY, nullable: true)]
+    private ?array $genres = null;
+
+    #[ORM\Column(type: 'datetime_immutable')]
+    private \DateTimeImmutable $createdAt;
+
+    #[ORM\Column(nullable: true)]
+    private ?float $rating = null;
+
+    #[ORM\Column(length: 255, nullable: true)]
+    private ?string $author = null;
+
+    #[ORM\Column(length: 255, nullable: true)]
+    private ?string $status = null;
+
+    // Defaulted so an entity built by fromDomain() has a value for this
+    // non-nullable column. NOTE(review): "false" is an assumed default — confirm.
+    #[ORM\Column]
+    private bool $monitored = false;
+
+    /**
+     * Creates an entity from a domain Manga.
+     *
+     * Copies id, title, slug, description and author. The domain model does
+     * not expose a creation date here, so createdAt is stamped with the
+     * current time; without it the non-nullable column would stay
+     * uninitialised and the entity could not be persisted.
+     *
+     * NOTE(review): if this is ever used to rebuild an already-stored manga,
+     * createdAt will be reset — confirm the intended usage.
+     *
+     * @param Manga $manga Domain object to map.
+     *
+     * @return self A new, unmanaged entity.
+     */
+    public static function fromDomain(Manga $manga): self
+    {
+        $entity = new self();
+        $entity->id = $manga->getId();
+        $entity->title = $manga->getTitle();
+        $entity->slug = $manga->getSlug();
+        $entity->description = $manga->getDescription();
+        $entity->author = $manga->getAuthor();
+        $entity->createdAt = new \DateTimeImmutable();
+
+        return $entity;
+    }
+
+    /**
+     * Rebuilds the domain Manga from id, title, slug, description and author.
+     *
+     * @return Manga The domain representation of this row.
+     */
+    public function toDomain(): Manga
+    {
+        return new Manga(
+            $this->id,
+            $this->title,
+            $this->slug,
+            $this->description,
+            $this->author
+        );
+    }
+}
--- a/src/Domain/Scraping/Infrastructure/Persistence/Entity/ScrapingJobEntity.php
+++ b/src/Domain/Scraping/Infrastructure/Persistence/Entity/ScrapingJobEntity.php
@@ -3,7 +3,6 @@
 namespace App\Domain\Scraping\Infrastructure\Persistence\Entity;

 use App\Domain\Scraping\Domain\Model\ScrapingJob;
-use App\Domain\Scraping\Domain\Model\ScrapingStatus;
 use Doctrine\ORM\Mapping as ORM;

 #[ORM\Entity]
@@ -59,25 +58,6 @@ class ScrapingJobEntity
            $this->sourceId
        );

-        // Reconstruire l'état du job à partir des données persistées
-        $reflection = new \ReflectionClass(ScrapingJob::class);
-        
-        $pagesProperty = $reflection->getProperty('pages');
-        $pagesProperty->setAccessible(true);
-        $pagesProperty->setValue($job, $this->pages);
-        
-        $statusProperty = $reflection->getProperty('status');
-        $statusProperty->setAccessible(true);
-        $statusProperty->setValue($job, ScrapingStatus::from($this->status));
-        
-        $createdAtProperty = $reflection->getProperty('createdAt');
-        $createdAtProperty->setAccessible(true);
-        $createdAtProperty->setValue($job, $this->createdAt);
-        
-        $completedAtProperty = $reflection->getProperty('completedAt');
-        $completedAtProperty->setAccessible(true);
-        $completedAtProperty->setValue($job, $this->completedAt);
-
        return $job;
    }
 } 
--- a/src/Domain/Scraping/Infrastructure/Persistence/Entity/SourceEntity.php
+++ b/src/Domain/Scraping/Infrastructure/Persistence/Entity/SourceEntity.php
@@ -0,0 +1,65 @@
+<?php
+
+namespace App\Domain\Scraping\Infrastructure\Persistence\Entity;
+
+use App\Domain\Scraping\Domain\Model\Source;
+use Doctrine\ORM\Mapping as ORM;
+
+/**
+ * Doctrine mapping of a scraping source, stored in the `sources` table.
+ *
+ * Converts to and from the domain Source model.
+ */
+#[ORM\Entity]
+#[ORM\Table(name: 'sources')]
+class SourceEntity
+{
+    #[ORM\Id]
+    #[ORM\Column(type: 'string', length: 36)]
+    private string $id;
+
+    #[ORM\Column(type: 'string', nullable: true)]
+    private ?string $name = null;
+
+    #[ORM\Column(type: 'text', nullable: true)]
+    private ?string $description = null;
+
+    #[ORM\Column(type: 'string')]
+    private string $baseUrl;
+
+    #[ORM\Column(type: 'json')]
+    private array $scrappingParameters = [];
+
+    #[ORM\Column(type: 'boolean')]
+    private bool $isActive;
+
+    #[ORM\Column(type: 'datetime_immutable')]
+    private \DateTimeImmutable $createdAt;
+
+    #[ORM\Column(type: 'datetime_immutable')]
+    private \DateTimeImmutable $updatedAt;
+
+    /**
+     * Creates an entity carrying every mapped field of the given domain Source.
+     *
+     * @param Source $source Domain object to map.
+     *
+     * @return self A new, unmanaged entity.
+     */
+    public static function fromDomain(Source $source): self
+    {
+        $mapped = new self();
+
+        $mapped->id = $source->getId();
+        $mapped->name = $source->getName();
+        $mapped->description = $source->getDescription();
+        $mapped->baseUrl = $source->getBaseUrl();
+        $mapped->scrappingParameters = $source->getScrappingParameters();
+        $mapped->isActive = $source->isActive();
+        $mapped->createdAt = $source->getCreatedAt();
+        $mapped->updatedAt = $source->getUpdatedAt();
+
+        return $mapped;
+    }
+
+    /**
+     * Rebuilds the domain Source from this row.
+     *
+     * A null name or description is passed to the domain model as ''.
+     *
+     * @return Source The domain representation of this row.
+     */
+    public function toDomain(): Source
+    {
+        $name = $this->name ?? '';
+        $description = $this->description ?? '';
+
+        return new Source(
+            $this->id,
+            $name,
+            $description,
+            $this->baseUrl,
+            $this->scrappingParameters,
+            $this->isActive,
+            $this->createdAt,
+            $this->updatedAt
+        );
+    }
+}
--- a/src/Domain/Scraping/Infrastructure/Service/ImageDownloader.php
+++ b/src/Domain/Scraping/Infrastructure/Service/ImageDownloader.php
@@ -0,0 +1,23 @@
+<?php
+
+namespace App\Domain\Scraping\Infrastructure\Service;
+
+use Symfony\Contracts\HttpClient\HttpClientInterface;
+
+/**
+ * Downloads remote images to the local filesystem.
+ */
+class ImageDownloader
+{
+    public function __construct(
+        private readonly HttpClientInterface $httpClient
+    ) {}
+
+    /**
+     * Fetches $url and writes the response body to $destination.
+     *
+     * @param string $url         Address of the image to fetch.
+     * @param string $destination Local file path the body is written to.
+     *
+     * @throws \RuntimeException When the response is not declared as an image
+     *                           or the file cannot be written.
+     */
+    public function download(string $url, string $destination): void
+    {
+        $response = $this->httpClient->request('GET', $url);
+
+        // A response may carry no Content-Type header at all; treat that as
+        // "not an image" instead of reading an undefined array offset.
+        $contentType = $response->getHeaders()['content-type'][0] ?? '';
+
+        // Media types are case-insensitive, so normalise before comparing.
+        if (!str_starts_with(strtolower($contentType), 'image/')) {
+            throw new \RuntimeException('Invalid content type');
+        }
+
+        // file_put_contents() signals failure by returning false.
+        if (false === file_put_contents($destination, $response->getContent())) {
+            throw new \RuntimeException('Failed to write image to: ' . $destination);
+        }
+    }
+}
--- a/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php
+++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php
@@ -3,67 +3,37 @@
 namespace App\Domain\Scraping\Infrastructure\Service\Scraper;

 use App\Domain\Scraping\Domain\Contract\ScraperInterface;
-use App\Domain\Scraping\Domain\Model\ScrapingJob;
-use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
 use App\Domain\Scraping\Domain\Event\ChapterScrapingCompleted;
 use App\Domain\Scraping\Domain\Event\ChapterScrapingStarted;
+use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
+use App\Domain\Scraping\Domain\Model\ScrapingJob;
 use App\Domain\Scraping\Domain\Model\ScrapingProgress;
-use Symfony\Component\EventDispatcher\EventDispatcherInterface;
-use Symfony\Contracts\HttpClient\HttpClientInterface;
+use App\Domain\Scraping\Domain\Model\Source;
+use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory;
+use App\Domain\Scraping\Infrastructure\Service\ImageDownloader;
+use Symfony\Component\Messenger\MessageBusInterface;
+use Ramsey\Uuid\Uuid;

 abstract class AbstractScraper implements ScraperInterface
 {
    public function __construct(
-        protected readonly HttpClientInterface $httpClient,
-        protected readonly EventDispatcherInterface $eventDispatcher,
-        protected readonly string $tempDir
+        protected readonly ImageDownloader $imageDownloader,
+        protected readonly MessageBusInterface $eventBus
    ) {}

-    public function createScrapingJob(string $chapterId, string $sourceId): ScrapingJob
+    /**
+     * Builds a new ScrapingJob for the given manga chapter and source.
+     *
+     * @param string $mangaId   Identifier of the manga the chapter belongs to.
+     * @param string $chapterId Identifier of the chapter to scrape.
+     * @param string $sourceId  Identifier of the source to scrape from.
+     *
+     * @return ScrapingJob A job whose id is a freshly generated UUID v4 string.
+     */
+    public function createScrapingJob(string $mangaId, string $chapterId, string $sourceId): ScrapingJob
    {
+        $jobId = Uuid::uuid4()->toString();
+
        return new ScrapingJob(
-            uniqid('scraping_'),
+            $jobId,
+            $mangaId,
            $chapterId,
-            $sourceId
+            $sourceId,
        );
    }

-    public function scrape(ScrapingJob $job): void
-    {
-        try {
-            $this->eventDispatcher->dispatch(new ChapterScrapingStarted($job->getId()));
-            
-            $tempDir = $this->createTempDirectory($job);
-            $pageData = $this->scrapePages($job);
-            
-            foreach ($pageData as $page) {
-                $this->downloadPage($job, $page, $tempDir);
-            }
-            
-            $job->complete();
-            
-            $this->eventDispatcher->dispatch(
-                new ChapterScrapingCompleted($job->getId(), $job->getPages())
-            );
-            
-            $this->cleanupTempDirectory($tempDir);
-            
-        } catch (\Exception $e) {
-            $job->fail();
-            throw $e;
-        }
-    }
-
-    abstract protected function scrapePages(ScrapingJob $job): array;
-
-    protected function createTempDirectory(ScrapingJob $job): string
-    {
-        $tempDir = $this->tempDir . '/' . uniqid('scraping_' . $job->getId() . '_');
-        if (!mkdir($tempDir) && !is_dir($tempDir)) {
-            throw new \RuntimeException("Failed to create temporary directory: $tempDir");
-        }
-        return $tempDir;
-    }
+    /**
+     * Runs the given scraping job; each concrete scraper supplies the implementation.
+     */
+    abstract public function scrape(ScrapingJob $job): void;
+    
+    /**
+     * Collects the pages of the job's chapter from the given source.
+     *
+     * @return array Page list. NOTE(review): HtmlScraper returns image URL
+     *               strings; confirm this is the expected element type.
+     */
+    abstract protected function scrapePages(ScrapingJob $job, Source $source): array;

    protected function cleanupTempDirectory(string $tempDir): void
    {
@@ -84,11 +54,32 @@ abstract class AbstractScraper implements ScraperInterface
        }
    }

-    protected function dispatchProgressEvent(ScrapingJob $job, int $current, int $total): void
+    /**
+     * Publishes a PageScrapingProgressed event for the job on the event bus.
+     *
+     * @param ScrapingJob $job         Job whose id is attached to the event.
+     * @param int         $currentPage Current position passed to ScrapingProgress.
+     * @param int         $totalPages  Total passed to ScrapingProgress.
+     */
+    protected function dispatchProgressEvent(ScrapingJob $job, int $currentPage, int $totalPages): void
    {
-        $progress = new ScrapingProgress($current, $total);
-        $this->eventDispatcher->dispatch(
-            new PageScrapingProgressed($job->getId(), $progress)
-        );
+        $event = new PageScrapingProgressed(
+            $job->getId(),
+            new ScrapingProgress($currentPage, $totalPages)
+        );
+
+        $this->eventBus->dispatch($event);
    }
+
+    /**
+     * Delegates to the injected ImageDownloader to store an image locally.
+     *
+     * @param string $imageUrl    Address of the image to fetch.
+     * @param string $destination Local file path to write to.
+     */
+    protected function downloadImage(string $imageUrl, string $destination): void
+    {
+        $downloader = $this->imageDownloader;
+        $downloader->download($imageUrl, $destination);
+    }
+
+    /**
+     * Builds a TempDirectory for a uniquely named path under the system
+     * temporary directory.
+     *
+     * NOTE(review): nothing in this method creates the directory on disk;
+     * presumably TempDirectory does — confirm.
+     */
+    protected function createTempDirectory(): TempDirectory
+    {
+        $path = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
+
+        return new TempDirectory($path);
+    }
+
+    /**
+     * Deletes the regular files directly inside the temp directory, then the
+     * directory itself.
+     *
+     * Written to be safe inside a `finally`: a directory that does not exist
+     * is ignored, so clean-up cannot raise a warning that hides the error
+     * that triggered it.
+     *
+     * @param TempDirectory $tempDirectory Directory to remove.
+     */
+    protected function cleanupTempFiles(TempDirectory $tempDirectory): void
+    {
+        $path = $tempDirectory->getPath();
+
+        // Nothing to clean when the directory was never created or is already gone.
+        if (!is_dir($path)) {
+            return;
+        }
+
+        // glob() returns false on error; fall back to an empty list.
+        foreach (glob($path . '/*') ?: [] as $file) {
+            if (is_file($file)) {
+                unlink($file);
+            }
+        }
+
+        rmdir($path);
+    }
+
+    /**
+     * Tells whether this scraper handles the given source type.
+     */
+    abstract public function supports(string $sourceType): bool;
 } 
--- a/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php
+++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php
@@ -3,61 +3,131 @@
 namespace App\Domain\Scraping\Infrastructure\Service\Scraper;

 use App\Domain\Scraping\Domain\Model\ScrapingJob;
+use App\Domain\Scraping\Domain\Model\Source;
 use App\Domain\Scraping\Domain\Model\ValueObject\ImageUrl;
 use App\Domain\Scraping\Domain\Model\ValueObject\PageNumber;
+use App\Domain\Scraping\Domain\Repository\SourceRepositoryInterface;
 use Symfony\Component\DomCrawler\Crawler;
+use Symfony\Contracts\HttpClient\HttpClientInterface;
+use Symfony\Component\Messenger\MessageBusInterface;
+use App\Domain\Scraping\Infrastructure\Service\ImageDownloader;

 class HtmlScraper extends AbstractScraper
 {
-    protected function scrapePages(ScrapingJob $job): array
+    /**
+     * @param ImageDownloader           $imageDownloader  Forwarded to AbstractScraper.
+     * @param MessageBusInterface       $eventBus         Forwarded to AbstractScraper.
+     * @param HttpClientInterface       $httpClient       Used by fetchHtml() to load reader pages.
+     * @param SourceRepositoryInterface $sourceRepository Used by scrape() to load the job's source.
+     */
+    public function __construct(
+        ImageDownloader $imageDownloader,
+        MessageBusInterface $eventBus,
+        private readonly HttpClientInterface $httpClient,
+        private readonly SourceRepositoryInterface $sourceRepository
+    ) {
+        parent::__construct($imageDownloader, $eventBus);
+    }
+
+    /**
+     * Scrapes every page of the job's chapter into a temporary directory.
+     *
+     * Loads the job's source, collects the page image URLs, downloads each
+     * image, records it on the job and dispatches a progress event. The job
+     * is marked complete on success and failed on any error (which is then
+     * rethrown). The temporary directory is always cleaned up.
+     *
+     * @param ScrapingJob $job Job to run; mutated via addPage()/complete()/fail().
+     *
+     * @throws \RuntimeException When the job's source cannot be found.
+     * @throws \Throwable        Any error raised while scraping, after the job
+     *                           has been marked failed.
+     */
+    public function scrape(ScrapingJob $job): void
    {
-        $url = $this->buildUrl($job);
-        $response = $this->httpClient->request('GET', $url);
+        $tempDir = $this->createTempDirectory();
+
+        try {
+            // Looked up inside the try so a missing source marks the job failed
+            // instead of surfacing as a TypeError further down.
+            $sourceConfig = $this->sourceRepository->getById($job->getSourceId());
+            if (null === $sourceConfig) {
+                throw new \RuntimeException('Source not found: ' . $job->getSourceId());
+            }
+
+            $pages = $this->scrapePages($job, $sourceConfig);
+            $totalPages = count($pages);
+
+            foreach ($pages as $index => $imageUrl) {
+                $pageNumber = new PageNumber($index + 1);
+
+                // parse_url() yields null/false when the URL has no usable path.
+                $urlPath = parse_url($imageUrl, PHP_URL_PATH);
+                $extension = is_string($urlPath) ? pathinfo($urlPath, PATHINFO_EXTENSION) : '';
+                $destination = sprintf(
+                    '%s/%s.%s',
+                    $tempDir->getPath(),
+                    $pageNumber->getFormattedNumber(),
+                    $extension
+                );
+
+                $this->downloadImage($imageUrl, $destination);
+                $job->addPage($pageNumber, new ImageUrl($imageUrl));
+
+                $this->dispatchProgressEvent($job, $index + 1, $totalPages);
+            }
+
+            $job->complete();
+        } catch (\Throwable $e) {
+            // \Throwable rather than \Exception: engine errors such as
+            // TypeError must also leave the job in the failed state.
+            $job->fail();
+            throw $e;
+        } finally {
+            $this->cleanupTempFiles($tempDir);
+        }
+    }
+
+    /**
+     * Chooses the reader strategy for the source and returns the page image URLs.
+     *
+     * A source without a "next_page_selector" scraping parameter is read as a
+     * single page holding every image; otherwise pages are followed one by one.
+     *
+     * NOTE(review): the selectors are read from Source::getScrappingParameters();
+     * this assumes that is where "next_page_selector" lives — confirm.
+     *
+     * @return array List of image URL strings, in reading order.
+     */
+    protected function scrapePages(ScrapingJob $job, Source $sourceConfig): array
+    {
+        // Source is an object, so the parameters must be fetched through its
+        // getter rather than with array access on the object itself.
+        $parameters = $sourceConfig->getScrappingParameters();
+
+        if (empty($parameters['next_page_selector'])) {
+            return $this->scrapeVerticalReader($job, $sourceConfig);
+        }
        
-        $crawler = new Crawler($response->getContent());
-        $images = $crawler->filter('img.manga-page'); // Adapter selon le site
+        return $this->scrapeHorizontalReader($job, $sourceConfig);
+    }
+
+    /**
+     * Reads a chapter whose images all sit on one page.
+     *
+     * NOTE(review): assumes "image_selector" is a key of
+     * Source::getScrappingParameters() — confirm.
+     *
+     * @return array List of image URL strings, in document order.
+     *
+     * @throws \RuntimeException When the selector is not configured or a
+     *                           matched element has no image URL.
+     */
+    private function scrapeVerticalReader(ScrapingJob $job, Source $sourceConfig): array
+    {
+        // Source is an object: read the selector through its parameters getter.
+        $imageSelector = $sourceConfig->getScrappingParameters()['image_selector'] ?? null;
+        if (!is_string($imageSelector) || '' === $imageSelector) {
+            throw new \RuntimeException('Missing "image_selector" scraping parameter');
+        }
+
+        $html = $this->fetchHtml($this->buildChapterUrl($job, $sourceConfig));
+        $crawler = new Crawler($html);
        
+        return $crawler->filter($imageSelector)
+            ->each(function (Crawler $node): string {
+                // Prefer "src", fall back to "data-src".
+                $imageUrl = $node->attr('src') ?: $node->attr('data-src');
+                if (!is_string($imageUrl) || '' === $imageUrl) {
+                    throw new \RuntimeException('Image element has neither "src" nor "data-src"');
+                }
+
+                return $this->cleanImageUrl($imageUrl);
+            });
+    }
+
+    /**
+     * Reads a chapter that shows one image per page, following the "next" link.
+     *
+     * Stops when a page has no next link or when the next link points to a
+     * page already visited, so a cyclic link cannot loop forever.
+     *
+     * NOTE(review): assumes "image_selector" and "next_page_selector" are keys
+     * of Source::getScrappingParameters(), and that next-link hrefs are
+     * absolute URLs (they are fetched as-is) — confirm.
+     *
+     * @return array List of image URL strings, in reading order.
+     *
+     * @throws \RuntimeException When a selector is not configured or a page's
+     *                           image element has no URL.
+     */
+    private function scrapeHorizontalReader(ScrapingJob $job, Source $sourceConfig): array
+    {
+        // Source is an object: read the selectors through its parameters getter.
+        $parameters = $sourceConfig->getScrappingParameters();
+        $imageSelector = $parameters['image_selector'] ?? null;
+        $nextPageSelector = $parameters['next_page_selector'] ?? null;
+        if (!is_string($imageSelector) || '' === $imageSelector) {
+            throw new \RuntimeException('Missing "image_selector" scraping parameter');
+        }
+        if (!is_string($nextPageSelector) || '' === $nextPageSelector) {
+            throw new \RuntimeException('Missing "next_page_selector" scraping parameter');
+        }
+
        $pages = [];
-        $images->each(function (Crawler $image) use (&$pages) {
-            $pages[] = [
-                'url' => $image->attr('src'),
-                'number' => count($pages) + 1
-            ];
-        });
-        
+        $visited = [];
+        $currentUrl = $this->buildChapterUrl($job, $sourceConfig);
+
+        while ($currentUrl && !isset($visited[$currentUrl])) {
+            $visited[$currentUrl] = true;
+
+            $crawler = new Crawler($this->fetchHtml($currentUrl));
+
+            // Query the image node once; prefer "src", fall back to "data-src".
+            $image = $crawler->filter($imageSelector);
+            $imageUrl = $image->attr('src') ?: $image->attr('data-src');
+            if (!is_string($imageUrl) || '' === $imageUrl) {
+                throw new \RuntimeException('Image element has neither "src" nor "data-src"');
+            }
+
+            $pages[] = $this->cleanImageUrl($imageUrl);
+
+            $nextLink = $crawler->filter($nextPageSelector);
+            $currentUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;
+        }
+
        return $pages;
    }

-    protected function downloadPage(ScrapingJob $job, array $page, string $tempDir): void
+    /**
+     * Performs a GET request and returns the response body.
+     *
+     * @param string $url Address of the page to load.
+     *
+     * @return string Raw response body.
+     *
+     * @throws \RuntimeException When the status code is 400 or above.
+     */
+    private function fetchHtml(string $url): string
    {
-        $imageUrl = new ImageUrl($page['url']);
-        $pageNumber = new PageNumber($page['number']);
+        $reply = $this->httpClient->request('GET', $url);
        
-        $fileName = sprintf('%s/%03d.%s', 
-            $tempDir,
-            $pageNumber->getValue(),
-            $imageUrl->getExtension()
+        if ($reply->getStatusCode() < 400) {
+            return $reply->getContent();
+        }
+
+        throw new \RuntimeException('Failed to fetch page: ' . $url);
+    }
+
+    /**
+     * Hook for normalising a scraped image URL; currently returns it unchanged.
+     *
+     * @param string $url Image URL as found in the page.
+     *
+     * @return string The same URL.
+     */
+    private function cleanImageUrl(string $url): string
+    {
+        // Image URL clean-up logic goes here.
+        return $url;
+    }
+
+
+    /**
+     * Builds the chapter URL by using the source's base URL as an sprintf()
+     * format string with the job's chapter id as its only argument.
+     *
+     * NOTE(review): the base URL is therefore expected to contain a single
+     * placeholder such as "%s"; any other "%" in it would be interpreted by
+     * sprintf() — confirm how sources are configured.
+     *
+     * @return string URL of the chapter's first reader page.
+     */
+    private function buildChapterUrl(ScrapingJob $job, Source $sourceConfig): string
+    {
+        $urlTemplate = $sourceConfig->getBaseUrl();
+        $chapterId = $job->getChapterId();
+
+        return sprintf(
+            $urlTemplate,
+            $chapterId
        );
-        
-        $response = $this->httpClient->request('GET', $imageUrl->getValue());
-        file_put_contents($fileName, $response->getContent());
-        
-        $job->addPage($pageNumber, $imageUrl);
-        $this->dispatchProgressEvent($job, $page['number'], count($pages));
    }

    public function supports(string $sourceType): bool
    {
-        return $sourceType === 'html';
-    }
-
-    private function buildUrl(ScrapingJob $job): string
-    {
-        // À implémenter selon votre logique de construction d'URL
-        // Vous aurez probablement besoin d'injecter un service pour récupérer les informations du chapitre
-        return sprintf('https://example.com/manga/%s/chapter/%s', 
-            $job->getMangaId(), 
-            $job->getChapterId()
-        );
+        return 'html' === $sourceType;
    }
 }
--- a/src/Domain/Scraping/Infrastructure/Service/Scraper/JavascriptScraper.php
+++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/JavascriptScraper.php
@@ -1,38 +0,0 @@
-<?php
-
-namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
-
-use App\Domain\Scraping\Domain\Model\ScrapingJob;
-use Symfony\Component\Panther\Client as PantherClient;
-
-class JavascriptScraper extends AbstractScraper
-{
-    protected function scrapePages(ScrapingJob $job): array
-    {
-        $client = PantherClient::createChromeClient();
-        try {
-            $url = $this->buildUrl($job);
-            $crawler = $client->request('GET', $url);
-            
-            // Attendre que les images soient chargées
-            $crawler->waitFor('img.manga-page');
-            
-            $pages = [];
-            $crawler->filter('img.manga-page')->each(function ($image) use (&$pages) {
-                $pages[] = [
-                    'url' => $image->attr('src'),
-                    'number' => count($pages) + 1
-                ];
-            });
-            
-            return $pages;
-        } finally {
-            $client->quit();
-        }
-    }
-
-    public function supports(string $sourceType): bool
-    {
-        return $sourceType === 'javascript';
-    }
-}