feat: suite du passage en DDD de Scraping
This commit is contained in:
committed by
ThysTips
parent
0e3d72cc5e
commit
97d7bcf061
@@ -3,7 +3,9 @@
|
|||||||
namespace App\Domain\Scraping\Infrastructure\Persistence;
|
namespace App\Domain\Scraping\Infrastructure\Persistence;
|
||||||
|
|
||||||
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
||||||
|
use App\Domain\Scraping\Domain\Model\ScrapingStatus;
|
||||||
use App\Domain\Scraping\Domain\Repository\ScrapingJobRepositoryInterface;
|
use App\Domain\Scraping\Domain\Repository\ScrapingJobRepositoryInterface;
|
||||||
|
use App\Domain\Scraping\Infrastructure\Persistence\Entity\ScrapingJobEntity;
|
||||||
use Doctrine\ORM\EntityManagerInterface;
|
use Doctrine\ORM\EntityManagerInterface;
|
||||||
|
|
||||||
class DoctrineScrapingJobRepository implements ScrapingJobRepositoryInterface
|
class DoctrineScrapingJobRepository implements ScrapingJobRepositoryInterface
|
||||||
@@ -14,38 +16,48 @@ class DoctrineScrapingJobRepository implements ScrapingJobRepositoryInterface
|
|||||||
|
|
||||||
/**
 * Stores a scraping job through Doctrine.
 *
 * The domain object is converted to its persistence entity first, then the
 * entity is persisted and the unit of work is flushed immediately.
 *
 * NOTE(review): a new entity instance is built on every call; confirm that
 * saving a job that is already stored results in an update and not in a
 * second insert.
 */
public function save(ScrapingJob $job): void
{
    $this->entityManager->persist(ScrapingJobEntity::fromDomain($job));
    $this->entityManager->flush();
}
|
||||||
|
|
||||||
/**
 * Looks up a scraping job by its identifier.
 *
 * @return ScrapingJob|null The domain job, or null when no entity matches.
 */
public function findById(string $id): ?ScrapingJob
{
    $repository = $this->entityManager->getRepository(ScrapingJobEntity::class);
    $found = $repository->find($id);

    if ($found === null) {
        return null;
    }

    return $found->toDomain();
}
|
||||||
|
|
||||||
/**
 * Looks up the scraping job attached to a chapter.
 *
 * @return ScrapingJob|null The domain job, or null when no entity has this chapter id.
 */
public function findByChapterId(string $chapterId): ?ScrapingJob
{
    $criteria = ['chapterId' => $chapterId];
    $found = $this->entityManager
        ->getRepository(ScrapingJobEntity::class)
        ->findOneBy($criteria);

    if ($found === null) {
        return null;
    }

    return $found->toDomain();
}
|
||||||
|
|
||||||
/**
 * Returns every job whose stored status equals the PENDING enum value.
 *
 * @return ScrapingJob[] Domain jobs converted from the matching entities.
 */
public function findPendingJobs(): array
{
    $queryBuilder = $this->entityManager
        ->getRepository(ScrapingJobEntity::class)
        ->createQueryBuilder('sj');

    $queryBuilder
        ->where('sj.status = :status')
        ->setParameter('status', ScrapingStatus::PENDING->value);

    $rows = $queryBuilder->getQuery()->getResult();
    $toDomain = static fn(ScrapingJobEntity $row) => $row->toDomain();

    return array_map($toDomain, $rows);
}
|
||||||
|
|
||||||
/**
 * Returns every job whose stored status equals the IN_PROGRESS enum value.
 *
 * @return ScrapingJob[] Domain jobs converted from the matching entities.
 */
public function findInProgressJobs(): array
{
    $queryBuilder = $this->entityManager
        ->getRepository(ScrapingJobEntity::class)
        ->createQueryBuilder('sj');

    $queryBuilder
        ->where('sj.status = :status')
        ->setParameter('status', ScrapingStatus::IN_PROGRESS->value);

    $rows = $queryBuilder->getQuery()->getResult();
    $toDomain = static fn(ScrapingJobEntity $row) => $row->toDomain();

    return array_map($toDomain, $rows);
}
|
||||||
}
|
}
|
||||||
@@ -0,0 +1,94 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
|
||||||
|
|
||||||
|
use App\Domain\Scraping\Domain\Contract\ScraperInterface;
|
||||||
|
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
||||||
|
use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
|
||||||
|
use App\Domain\Scraping\Domain\Event\ChapterScrapingCompleted;
|
||||||
|
use App\Domain\Scraping\Domain\Event\ChapterScrapingStarted;
|
||||||
|
use App\Domain\Scraping\Domain\Model\ScrapingProgress;
|
||||||
|
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
|
||||||
|
use Symfony\Contracts\HttpClient\HttpClientInterface;
|
||||||
|
|
||||||
|
/**
 * Shared workflow for chapter scrapers.
 *
 * scrape() drives the whole run: it announces the start, creates a working
 * directory, asks the concrete scraper for the page list (scrapePages()),
 * downloads each page, marks the job complete, announces completion and
 * finally removes the working directory. Subclasses only have to provide
 * scrapePages() and supports().
 */
abstract class AbstractScraper implements ScraperInterface
{
    /**
     * @param string $tempDir Base directory under which per-job working directories are created.
     */
    public function __construct(
        protected readonly HttpClientInterface $httpClient,
        protected readonly EventDispatcherInterface $eventDispatcher,
        protected readonly string $tempDir
    ) {}

    /**
     * Builds a new job for the given chapter and source, with a generated
     * identifier prefixed by "scraping_".
     */
    public function createScrapingJob(string $chapterId, string $sourceId): ScrapingJob
    {
        return new ScrapingJob(
            uniqid('scraping_'),
            $chapterId,
            $sourceId
        );
    }

    /**
     * Runs the full scraping workflow for a job.
     *
     * On any failure the job is marked failed and the original error is
     * rethrown. The working directory is removed whether the run succeeds
     * or fails.
     *
     * @throws \Throwable Whatever the underlying steps throw.
     */
    public function scrape(ScrapingJob $job): void
    {
        $tempDir = null;

        try {
            $this->eventDispatcher->dispatch(new ChapterScrapingStarted($job->getId()));

            $tempDir = $this->createTempDirectory($job);
            $pageData = $this->scrapePages($job);
            $total = count($pageData);

            foreach ($pageData as $page) {
                // Array union: adds 'total' only when the scraper did not set it,
                // so downloadPage() can report progress without extra state.
                $this->downloadPage($job, $page + ['total' => $total], $tempDir);
            }

            $job->complete();

            $this->eventDispatcher->dispatch(
                new ChapterScrapingCompleted($job->getId(), $job->getPages())
            );
        } catch (\Throwable $e) {
            // \Throwable rather than \Exception so that engine errors
            // (TypeError, ...) also leave the job in a failed state.
            $job->fail();
            throw $e;
        } finally {
            // Previously the cleanup only ran on success, leaking the
            // directory whenever a page failed to download.
            if ($tempDir !== null) {
                $this->cleanupTempDirectory($tempDir);
            }
        }
    }

    /**
     * Collects the pages of the chapter.
     *
     * @return array<int, array{url: string, number: int}> One entry per page,
     *         in reading order.
     */
    abstract protected function scrapePages(ScrapingJob $job): array;

    /**
     * Downloads one page image into the working directory, registers it on
     * the job and dispatches a progress event.
     *
     * Default implementation so that every concrete scraper can be driven by
     * scrape(); subclasses may override it with the same signature.
     *
     * @param array{url: string, number: int, total?: int} $page
     *
     * @throws \RuntimeException When the image cannot be written to disk.
     */
    protected function downloadPage(ScrapingJob $job, array $page, string $tempDir): void
    {
        // Fully-qualified names: this file does not import the value objects.
        $imageUrl = new \App\Domain\Scraping\Domain\Model\ValueObject\ImageUrl($page['url']);
        $pageNumber = new \App\Domain\Scraping\Domain\Model\ValueObject\PageNumber($page['number']);

        $fileName = sprintf(
            '%s/%03d.%s',
            $tempDir,
            $pageNumber->getValue(),
            $imageUrl->getExtension()
        );

        $response = $this->httpClient->request('GET', $imageUrl->getValue());

        if (file_put_contents($fileName, $response->getContent()) === false) {
            throw new \RuntimeException("Failed to write page image: $fileName");
        }

        $job->addPage($pageNumber, $imageUrl);
        $this->dispatchProgressEvent($job, $page['number'], $page['total'] ?? $page['number']);
    }

    /**
     * Creates a uniquely named working directory for the job under the
     * configured base directory and returns its path.
     *
     * @throws \RuntimeException When the directory cannot be created.
     */
    protected function createTempDirectory(ScrapingJob $job): string
    {
        $tempDir = $this->tempDir . '/' . uniqid('scraping_' . $job->getId() . '_');

        // Recursive so a missing base directory does not make the run fail;
        // the is_dir() re-check tolerates a concurrent creation.
        if (!mkdir($tempDir, 0777, true) && !is_dir($tempDir)) {
            throw new \RuntimeException("Failed to create temporary directory: $tempDir");
        }

        return $tempDir;
    }

    /**
     * Recursively deletes the working directory. Does nothing when the path
     * is not a directory.
     */
    protected function cleanupTempDirectory(string $tempDir): void
    {
        if (!is_dir($tempDir)) {
            return;
        }

        // CHILD_FIRST yields directory contents before the directory itself,
        // so each rmdir() runs on an already emptied directory.
        $files = new \RecursiveIteratorIterator(
            new \RecursiveDirectoryIterator($tempDir, \RecursiveDirectoryIterator::SKIP_DOTS),
            \RecursiveIteratorIterator::CHILD_FIRST
        );

        foreach ($files as $file) {
            // getPathname() instead of getRealPath(): the latter resolves
            // symlinks (and returns false for broken ones), which would delete
            // the link target rather than the entry inside the directory.
            if ($file->isDir() && !$file->isLink()) {
                rmdir($file->getPathname());
            } else {
                unlink($file->getPathname());
            }
        }

        rmdir($tempDir);
    }

    /**
     * Dispatches a PageScrapingProgressed event carrying the current page
     * index and the total page count for the job.
     */
    protected function dispatchProgressEvent(ScrapingJob $job, int $current, int $total): void
    {
        $progress = new ScrapingProgress($current, $total);

        $this->eventDispatcher->dispatch(
            new PageScrapingProgressed($job->getId(), $progress)
        );
    }
}
|
||||||
@@ -2,60 +2,62 @@
|
|||||||
|
|
||||||
namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
|
namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
|
||||||
|
|
||||||
use App\Domain\Scraping\Domain\Contract\ScraperInterface as ContractScraperInterface;
|
|
||||||
use App\Domain\Scraping\Domain\Service\ScraperInterface;
|
|
||||||
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
||||||
use App\Domain\Scraping\Domain\Model\ValueObject\ImageUrl;
|
use App\Domain\Scraping\Domain\Model\ValueObject\ImageUrl;
|
||||||
use App\Domain\Scraping\Domain\Model\ValueObject\PageNumber;
|
use App\Domain\Scraping\Domain\Model\ValueObject\PageNumber;
|
||||||
use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
|
|
||||||
use App\Domain\Scraping\Domain\Event\ChapterScrapingCompleted;
|
|
||||||
use Symfony\Component\DomCrawler\Crawler;
|
use Symfony\Component\DomCrawler\Crawler;
|
||||||
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
|
|
||||||
use Symfony\Contracts\HttpClient\HttpClientInterface;
|
|
||||||
|
|
||||||
/**
 * Scraper for sources whose chapter pages are present in the static HTML.
 *
 * The chapter page is fetched with the HTTP client and every
 * `img.manga-page` element is treated as one page of the chapter.
 */
class HtmlScraper extends AbstractScraper
{
    /**
     * Fetches the chapter HTML and lists its page images in document order.
     *
     * Images without a usable `src` attribute are skipped so that no invalid
     * URL reaches the download step; numbering covers the kept images only.
     *
     * @return array<int, array{url: string, number: int, total: int}>
     */
    protected function scrapePages(ScrapingJob $job): array
    {
        $url = $this->buildUrl($job);
        $response = $this->httpClient->request('GET', $url);

        $crawler = new Crawler($response->getContent());
        $images = $crawler->filter('img.manga-page'); // Adjust to the target site

        $sources = array_values(array_filter(
            $images->each(static fn (Crawler $image): ?string => $image->attr('src')),
            static fn (?string $src): bool => $src !== null && $src !== ''
        ));

        // The total travels with each page so downloadPage() can report
        // progress; it has no access to the full page list.
        $total = count($sources);
        $pages = [];
        foreach ($sources as $index => $src) {
            $pages[] = [
                'url' => $src,
                'number' => $index + 1,
                'total' => $total,
            ];
        }

        return $pages;
    }

    /**
     * Downloads one page image into the working directory, registers it on
     * the job and dispatches a progress event.
     *
     * @param array{url: string, number: int, total?: int} $page
     *
     * @throws \RuntimeException When the image cannot be written to disk.
     */
    protected function downloadPage(ScrapingJob $job, array $page, string $tempDir): void
    {
        $imageUrl = new ImageUrl($page['url']);
        $pageNumber = new PageNumber($page['number']);

        $fileName = sprintf(
            '%s/%03d.%s',
            $tempDir,
            $pageNumber->getValue(),
            $imageUrl->getExtension()
        );

        $response = $this->httpClient->request('GET', $imageUrl->getValue());

        if (file_put_contents($fileName, $response->getContent()) === false) {
            throw new \RuntimeException("Failed to write page image: $fileName");
        }

        $job->addPage($pageNumber, $imageUrl);

        // The previous code called count($pages) here, but $pages is not
        // defined in this method; the total now comes from the page entry.
        $this->dispatchProgressEvent($job, $page['number'], $page['total'] ?? $page['number']);
    }

    /**
     * This scraper handles sources of type "html".
     */
    public function supports(string $sourceType): bool
    {
        return $sourceType === 'html';
    }

    /**
     * Builds the chapter URL for a job.
     *
     * Placeholder: to be implemented according to the real URL scheme. A
     * service providing chapter information will probably need to be injected.
     *
     * NOTE(review): relies on ScrapingJob::getMangaId(), which is not visible
     * here — confirm the job exposes it.
     */
    private function buildUrl(ScrapingJob $job): string
    {
        return sprintf(
            'https://example.com/manga/%s/chapter/%s',
            $job->getMangaId(),
            $job->getChapterId()
        );
    }
}
|
||||||
@@ -0,0 +1,38 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
|
||||||
|
|
||||||
|
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
||||||
|
use Symfony\Component\Panther\Client as PantherClient;
|
||||||
|
|
||||||
|
/**
 * Scraper for sources that render their pages with JavaScript.
 *
 * The chapter page is loaded in a Chrome browser driven by Panther, and the
 * `img.manga-page` elements present once the page has rendered are treated
 * as the pages of the chapter.
 *
 * NOTE(review): the parent's scrape() calls downloadPage() for every page;
 * this class does not define it, so confirm the parent class provides one.
 */
class JavascriptScraper extends AbstractScraper
{
    /**
     * Opens the chapter in a browser and lists its page images in document
     * order. The browser is always closed, even when scraping fails.
     *
     * Images without a usable `src` attribute are skipped; numbering covers
     * the kept images only.
     *
     * @return array<int, array{url: string, number: int, total: int}>
     */
    protected function scrapePages(ScrapingJob $job): array
    {
        $client = PantherClient::createChromeClient();

        try {
            $client->request('GET', $this->buildUrl($job));

            // Wait until the images are present. waitFor() is a method of the
            // Panther client (not of the crawler) and returns a fresh crawler.
            $crawler = $client->waitFor('img.manga-page');

            $sources = array_values(array_filter(
                $crawler->filter('img.manga-page')->each(
                    static fn ($image): ?string => $image->attr('src')
                ),
                static fn (?string $src): bool => $src !== null && $src !== ''
            ));

            $total = count($sources);
            $pages = [];
            foreach ($sources as $index => $src) {
                $pages[] = [
                    'url' => $src,
                    'number' => $index + 1,
                    'total' => $total,
                ];
            }

            return $pages;
        } finally {
            $client->quit();
        }
    }

    /**
     * This scraper handles sources of type "javascript".
     */
    public function supports(string $sourceType): bool
    {
        return $sourceType === 'javascript';
    }

    /**
     * Builds the chapter URL for a job.
     *
     * Defined here because scrapePages() needs it and the only other
     * definition is private to HtmlScraper. Placeholder scheme, kept identical
     * to HtmlScraper's.
     *
     * NOTE(review): relies on ScrapingJob::getMangaId(), which is not visible
     * here — confirm the job exposes it.
     */
    private function buildUrl(ScrapingJob $job): string
    {
        return sprintf(
            'https://example.com/manga/%s/chapter/%s',
            $job->getMangaId(),
            $job->getChapterId()
        );
    }
}
|
||||||
Reference in New Issue
Block a user