From 0a8e6786a88699b816e8b82ac4035ddf6088dbd7 Mon Sep 17 00:00:00 2001 From: "ext.jeremy.guillot@maxicoffee.domains" Date: Sat, 1 Feb 2025 13:59:37 +0100 Subject: [PATCH] feat: suite du passage en DDD de Scraping --- .../DoctrineScrapingJobRepository.php | 26 +++-- .../Service/Scraper/AbstractScraper.php | 94 +++++++++++++++++++ .../Service/Scraper/HtmlScraper.php | 76 +++++++-------- .../Service/Scraper/JavascriptScraper.php | 38 ++++++++ 4 files changed, 190 insertions(+), 44 deletions(-) create mode 100644 src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php create mode 100644 src/Domain/Scraping/Infrastructure/Service/Scraper/JavascriptScraper.php diff --git a/src/Domain/Scraping/Infrastructure/Persistence/DoctrineScrapingJobRepository.php b/src/Domain/Scraping/Infrastructure/Persistence/DoctrineScrapingJobRepository.php index 20ee0b1..0b569f0 100644 --- a/src/Domain/Scraping/Infrastructure/Persistence/DoctrineScrapingJobRepository.php +++ b/src/Domain/Scraping/Infrastructure/Persistence/DoctrineScrapingJobRepository.php @@ -3,7 +3,9 @@ namespace App\Domain\Scraping\Infrastructure\Persistence; use App\Domain\Scraping\Domain\Model\ScrapingJob; +use App\Domain\Scraping\Domain\Model\ScrapingStatus; use App\Domain\Scraping\Domain\Repository\ScrapingJobRepositoryInterface; +use App\Domain\Scraping\Infrastructure\Persistence\Entity\ScrapingJobEntity; use Doctrine\ORM\EntityManagerInterface; class DoctrineScrapingJobRepository implements ScrapingJobRepositoryInterface @@ -14,38 +16,48 @@ class DoctrineScrapingJobRepository implements ScrapingJobRepositoryInterface public function save(ScrapingJob $job): void { - $this->entityManager->persist($job); + $entity = ScrapingJobEntity::fromDomain($job); + $this->entityManager->persist($entity); $this->entityManager->flush(); } public function findById(string $id): ?ScrapingJob { - return $this->entityManager->getRepository(ScrapingJob::class)->find($id); + $entity = $this->entityManager->getRepository(ScrapingJobEntity::class) + ->find($id); + + return $entity?->toDomain(); } public function findByChapterId(string $chapterId): ?ScrapingJob { - return $this->entityManager->getRepository(ScrapingJob::class) + $entity = $this->entityManager->getRepository(ScrapingJobEntity::class) ->findOneBy(['chapterId' => $chapterId]); + + return $entity?->toDomain(); } public function findPendingJobs(): array { - return $this->entityManager->getRepository(ScrapingJob::class) + $entities = $this->entityManager->getRepository(ScrapingJobEntity::class) ->createQueryBuilder('sj') ->where('sj.status = :status') - ->setParameter('status', 'pending') + ->setParameter('status', ScrapingStatus::PENDING->value) ->getQuery() ->getResult(); + + return array_map(fn(ScrapingJobEntity $entity) => $entity->toDomain(), $entities); } public function findInProgressJobs(): array { - return $this->entityManager->getRepository(ScrapingJob::class) + $entities = $this->entityManager->getRepository(ScrapingJobEntity::class) ->createQueryBuilder('sj') ->where('sj.status = :status') - ->setParameter('status', 'in_progress') + ->setParameter('status', ScrapingStatus::IN_PROGRESS->value) ->getQuery() ->getResult(); + + return array_map(fn(ScrapingJobEntity $entity) => $entity->toDomain(), $entities); } } \ No newline at end of file diff --git a/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php b/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php new file mode 100644 index 0000000..75cef16 --- /dev/null +++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php @@ -0,0 +1,94 @@ +eventDispatcher->dispatch(new ChapterScrapingStarted($job->getId())); + + $tempDir = $this->createTempDirectory($job); + $pageData = $this->scrapePages($job); + + foreach ($pageData as $page) { + $this->downloadPage($job, $page, $tempDir); + } + + $job->complete(); + + $this->eventDispatcher->dispatch( + new ChapterScrapingCompleted($job->getId(), $job->getPages()) + ); + + $this->cleanupTempDirectory($tempDir); + + } catch (\Exception $e) { + $job->fail(); + throw $e; + } + } + + abstract protected function scrapePages(ScrapingJob $job): array; + + protected function createTempDirectory(ScrapingJob $job): string + { + $tempDir = $this->tempDir . '/' . uniqid('scraping_' . $job->getId() . '_'); + if (!mkdir($tempDir) && !is_dir($tempDir)) { + throw new \RuntimeException("Failed to create temporary directory: $tempDir"); + } + return $tempDir; + } + + protected function cleanupTempDirectory(string $tempDir): void + { + if (is_dir($tempDir)) { + $files = new \RecursiveIteratorIterator( + new \RecursiveDirectoryIterator($tempDir, \RecursiveDirectoryIterator::SKIP_DOTS), + \RecursiveIteratorIterator::CHILD_FIRST + ); + + foreach ($files as $file) { + if ($file->isDir()) { + rmdir($file->getRealPath()); + } else { + unlink($file->getRealPath()); + } + } + rmdir($tempDir); + } + } + + protected function dispatchProgressEvent(ScrapingJob $job, int $current, int $total): void + { + $progress = new ScrapingProgress($current, $total); + $this->eventDispatcher->dispatch( + new PageScrapingProgressed($job->getId(), $progress) + ); + } +} \ No newline at end of file diff --git a/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php b/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php index 6939472..385563c 100644 --- a/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php +++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php @@ -2,60 +2,62 @@ namespace App\Domain\Scraping\Infrastructure\Service\Scraper; -use App\Domain\Scraping\Domain\Contract\ScraperInterface as ContractScraperInterface; -use App\Domain\Scraping\Domain\Service\ScraperInterface; use App\Domain\Scraping\Domain\Model\ScrapingJob; use App\Domain\Scraping\Domain\Model\ValueObject\ImageUrl; use App\Domain\Scraping\Domain\Model\ValueObject\PageNumber; -use App\Domain\Scraping\Domain\Event\PageScrapingProgressed; -use App\Domain\Scraping\Domain\Event\ChapterScrapingCompleted; use Symfony\Component\DomCrawler\Crawler; -use Symfony\Component\EventDispatcher\EventDispatcherInterface; -use Symfony\Contracts\HttpClient\HttpClientInterface; -class HtmlScraper implements ContractScraperInterface +class HtmlScraper extends AbstractScraper { - public function __construct( - private readonly HttpClientInterface $httpClient, - private readonly EventDispatcherInterface $eventDispatcher - ) {} - - public function createScrapingJob(string $chapterId, string $sourceId): ScrapingJob + protected function scrapePages(ScrapingJob $job): array { - return new ScrapingJob( - uniqid('scraping_'), - $chapterId, - $sourceId - ); - } - - public function scrape(ScrapingJob $job): void - { - $url = $this->buildUrl($job); // À implémenter selon votre logique + $url = $this->buildUrl($job); $response = $this->httpClient->request('GET', $url); $crawler = new Crawler($response->getContent()); - $images = $crawler->filter('img.manga-page'); // Adapter selon le site cible + $images = $crawler->filter('img.manga-page'); // Adapter selon le site - $pageNumber = 1; - $images->each(function (Crawler $image) use ($job, $pageNumber) { - $imageUrl = new ImageUrl($image->attr('src')); - $job->addPage(new PageNumber($pageNumber), $imageUrl); - - $this->eventDispatcher->dispatch( - new PageScrapingProgressed($job->getId(), $job->getProgress()) - ); - - $pageNumber++; + $pages = []; + $images->each(function (Crawler $image) use (&$pages) { + $pages[] = [ + 'url' => $image->attr('src'), + 'number' => count($pages) + 1 + ]; }); - $this->eventDispatcher->dispatch( - new ChapterScrapingCompleted($job->getId(), $job->getPages()) + return $pages; + } + + protected function downloadPage(ScrapingJob $job, array $page, string $tempDir): void + { + $imageUrl = new ImageUrl($page['url']); + $pageNumber = new PageNumber($page['number']); + + $fileName = sprintf('%s/%03d.%s', + $tempDir, + $pageNumber->getValue(), + $imageUrl->getExtension() ); + + $response = $this->httpClient->request('GET', $imageUrl->getValue()); + file_put_contents($fileName, $response->getContent()); + + $job->addPage($pageNumber, $imageUrl); + $this->dispatchProgressEvent($job, $page['number'], count($pages)); } public function supports(string $sourceType): bool { return $sourceType === 'html'; } -} \ No newline at end of file + + private function buildUrl(ScrapingJob $job): string + { + // À implémenter selon votre logique de construction d'URL + // Vous aurez probablement besoin d'injecter un service pour récupérer les informations du chapitre + return sprintf('https://example.com/manga/%s/chapter/%s', + $job->getMangaId(), + $job->getChapterId() + ); + } +} \ No newline at end of file diff --git a/src/Domain/Scraping/Infrastructure/Service/Scraper/JavascriptScraper.php b/src/Domain/Scraping/Infrastructure/Service/Scraper/JavascriptScraper.php new file mode 100644 index 0000000..69dedc7 --- /dev/null +++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/JavascriptScraper.php @@ -0,0 +1,38 @@ +buildUrl($job); + $crawler = $client->request('GET', $url); + + // Attendre que les images soient chargées + $crawler->waitFor('img.manga-page'); + + $pages = []; + $crawler->filter('img.manga-page')->each(function ($image) use (&$pages) { + $pages[] = [ + 'url' => $image->attr('src'), + 'number' => count($pages) + 1 + ]; + }); + + return $pages; + } finally { + $client->quit(); + } + } + + public function supports(string $sourceType): bool + { + return $sourceType === 'javascript'; + } +} \ No newline at end of file