Mangarr/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php

<?php

namespace App\Domain\Scraping\Infrastructure\Service\Scraper;

use App\Domain\Scraping\Domain\Contract\ScraperInterface as ContractScraperInterface;
use App\Domain\Scraping\Domain\Service\ScraperInterface;
use App\Domain\Scraping\Domain\Model\ScrapingJob;
use App\Domain\Scraping\Domain\Model\ValueObject\ImageUrl;
use App\Domain\Scraping\Domain\Model\ValueObject\PageNumber;
use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
use App\Domain\Scraping\Domain\Event\ChapterScrapingCompleted;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
use Symfony\Contracts\HttpClient\HttpClientInterface;

/**
 * Scraper for sources whose chapter pages are exposed as <img> elements
 * inside a plain HTML document.
 *
 * The document is fetched through the injected HTTP client, page images are
 * located with a CSS selector, and progress/completion events are published
 * through the injected event dispatcher.
 */
class HtmlScraper implements ContractScraperInterface
{
    public function __construct(
        private readonly HttpClientInterface $httpClient,
        private readonly EventDispatcherInterface $eventDispatcher
    ) {}

    /**
     * Creates a new scraping job for the given chapter and source.
     *
     * The job identifier is produced by uniqid() with the "scraping_" prefix.
     *
     * @param string $chapterId Identifier of the chapter to scrape.
     * @param string $sourceId  Identifier of the source the chapter belongs to.
     */
    public function createScrapingJob(string $chapterId, string $sourceId): ScrapingJob
    {
        return new ScrapingJob(
            uniqid('scraping_'),
            $chapterId,
            $sourceId
        );
    }

    /**
     * Downloads the chapter document and registers every page image on the job.
     *
     * Pages are numbered from 1 in document order. A PageScrapingProgressed
     * event is dispatched after each page is added, and a single
     * ChapterScrapingCompleted event is dispatched once all images have been
     * processed. Exceptions raised by the HTTP client are not caught here.
     *
     * @throws \LogicException When buildUrl() has not been implemented.
     */
    public function scrape(ScrapingJob $job): void
    {
        $url = $this->buildUrl($job);
        $response = $this->httpClient->request('GET', $url);

        $crawler = new Crawler($response->getContent());
        // Adapt this selector to the markup of the target site.
        $images = $crawler->filter('img.manga-page');

        // The counter is captured by reference: a by-value capture would hand
        // the closure a fresh copy on every call and number every page "1".
        $pageNumber = 0;
        $images->each(function (Crawler $image) use ($job, &$pageNumber): void {
            $src = $image->attr('src');
            if ($src === null || $src === '') {
                // No usable image address on this element: skip it without
                // consuming a page number, so numbering stays contiguous.
                return;
            }

            $pageNumber++;
            $job->addPage(new PageNumber($pageNumber), new ImageUrl($src));

            $this->eventDispatcher->dispatch(
                new PageScrapingProgressed($job->getId(), $job->getProgress())
            );
        });

        $this->eventDispatcher->dispatch(
            new ChapterScrapingCompleted($job->getId(), $job->getPages())
        );
    }

    /**
     * Tells whether this scraper handles the given source type.
     *
     * @return bool True only for the exact string "html".
     */
    public function supports(string $sourceType): bool
    {
        return $sourceType === 'html';
    }

    /**
     * Resolves the URL of the HTML document that lists the pages of the job's chapter.
     *
     * Placeholder: the mapping from a job to a URL is source-specific and has
     * not been written yet. It fails with an explicit exception instead of an
     * "undefined method" error; implement it here or override it in a subclass.
     *
     * @throws \LogicException Always, until a real implementation is provided.
     */
    protected function buildUrl(ScrapingJob $job): string
    {
        throw new \LogicException(sprintf(
            '%s::buildUrl() is not implemented; cannot resolve the URL for scraping job "%s".',
            static::class,
            $job->getId()
        ));
    }
}