feat: first unit tests for ScrapeChapterHandler.php
This commit is contained in:
parent
21fcdd1084
commit
89570ad951
@@ -1,11 +1,13 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Application\Command\ScrapeChapter;
|
||||
namespace App\Domain\Scraping\Application\Command;
|
||||
|
||||
/**
 * Command carrying the identifiers needed to scrape one chapter.
 *
 * The class is declared readonly, so the three public properties can only be
 * written by the constructor.
 */
readonly class ScrapeChapter
{
    /**
     * @param string $chapterId identifier of the chapter to scrape
     * @param string $sourceId  identifier of the source to scrape from
     * @param string $mangaId   identifier of the manga the chapter belongs to
     */
    public function __construct(
        public string $chapterId,
        public string $sourceId,
        public string $mangaId,
    ) {
    }
}
|
||||
|
||||
@@ -1,31 +1,40 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Application\Command\ScrapeChapter;
|
||||
namespace App\Domain\Scraping\Application\CommandHandler;
|
||||
|
||||
use App\Domain\Scraping\Domain\Contract\ScraperInterface;
|
||||
use App\Domain\Scraping\Domain\Repository\ScrapingJobRepositoryInterface;
|
||||
use App\Domain\Scraping\Application\Command\ScrapeChapter;
|
||||
use App\Domain\Scraping\Domain\Contract\Repository\ScrapingJobRepositoryInterface;
|
||||
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
|
||||
use App\Domain\Scraping\Domain\Event\ChapterScrapingFailed;
|
||||
use App\Domain\Scraping\Domain\Event\ChapterScrapingStarted;
|
||||
use Symfony\Component\Messenger\MessageBusInterface;
|
||||
|
||||
/**
 * Handles the ScrapeChapter command: creates a scraping job, stores it,
 * announces that scraping started and then runs the scraper.
 */
readonly class ScrapeChapterHandler
{
    public function __construct(
        private ScraperInterface $scraper,
        private ScrapingJobRepositoryInterface $scrapingJobRepository,
        private MessageBusInterface $eventBus
    ) {
    }

    /**
     * Runs the scraping workflow for the chapter described by the command.
     *
     * Any failure (job creation, persistence, event dispatch or scraping)
     * results in a ChapterScrapingFailed event carrying the chapter id and the
     * failure message, after which the original throwable is rethrown.
     *
     * @throws \Throwable whatever the collaborators threw, rethrown unchanged
     */
    public function handle(ScrapeChapter $command): void
    {
        try {
            $job = $this->scraper->createScrapingJob(
                $command->mangaId,
                $command->chapterId,
                $command->sourceId,
            );

            $this->scrapingJobRepository->save($job);

            $this->eventBus->dispatch(new ChapterScrapingStarted($job->getId()));

            // NOTE(review): the job is saved before scraping only; any state the
            // scraper changes on it afterwards is not saved here — confirm that
            // this is handled elsewhere.
            $this->scraper->scrape($job);
        } catch (\Throwable $e) {
            // \Throwable rather than \Exception: engine errors such as TypeError
            // must also produce a failure event instead of bypassing it.
            $this->eventBus->dispatch(new ChapterScrapingFailed($command->chapterId, $e->getMessage()));

            throw $e;
        }
    }
}
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Domain\Contract\Repository;
|
||||
|
||||
use App\Domain\Scraping\Domain\Model\Manga;
|
||||
|
||||
/**
 * Lookup contract for Manga domain objects.
 */
interface MangaRepositoryInterface
{
    /**
     * Fetches a manga by its identifier.
     *
     * @param string $id manga identifier
     *
     * @return Manga|null the matching manga; the nullable return type presumably
     *                    signals "not found" (the Doctrine implementation returns
     *                    null when no entity matches)
     */
    public function getById(string $id): ?Manga;
}
|
||||
@@ -1,6 +1,6 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Domain\Repository;
|
||||
namespace App\Domain\Scraping\Domain\Contract\Repository;
|
||||
|
||||
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
||||
|
||||
@@ -9,4 +9,4 @@ interface ScrapingJobRepositoryInterface
|
||||
public function save(ScrapingJob $job): void;
|
||||
public function findById(string $id): ?ScrapingJob;
|
||||
public function findByChapterId(string $chapterId): ?ScrapingJob;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Domain\Repository;
|
||||
|
||||
use App\Domain\Scraping\Domain\Model\Source;
|
||||
|
||||
/**
 * Lookup contract for Source domain objects.
 *
 * NOTE(review): this interface lives in Domain\Repository while the other
 * repository contracts use Domain\Contract\Repository — confirm whether it
 * should move as well.
 */
interface SourceRepositoryInterface
{
    /**
     * Fetches a source by its identifier.
     *
     * @param string $id source identifier
     *
     * @return Source|null the matching source; the nullable return type
     *                     presumably signals "not found" — callers must handle null
     */
    public function getById(string $id): ?Source;
}
|
||||
@@ -1,12 +1,12 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Domain\Contract;
|
||||
namespace App\Domain\Scraping\Domain\Contract\Service;
|
||||
|
||||
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
||||
|
||||
/**
 * Contract implemented by every scraper strategy.
 */
interface ScraperInterface
{
    /**
     * Builds a new scraping job for the given manga, chapter and source.
     *
     * @param string $mangaId   identifier of the manga
     * @param string $chapterId identifier of the chapter
     * @param string $sourceId  identifier of the source
     */
    public function createScrapingJob(string $mangaId, string $chapterId, string $sourceId): ScrapingJob;

    /**
     * Executes the given scraping job.
     */
    public function scrape(ScrapingJob $job): void;

    /**
     * Tells whether this scraper handles the given source type.
     *
     * @param string $sourceType source type name (HtmlScraper, for instance, answers to 'html')
     */
    public function supports(string $sourceType): bool;
}
|
||||
21
src/Domain/Scraping/Domain/Event/ChapterScrapingFailed.php
Normal file
21
src/Domain/Scraping/Domain/Event/ChapterScrapingFailed.php
Normal file
@@ -0,0 +1,21 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Domain\Event;
|
||||
|
||||
/**
 * Event describing a chapter scraping failure: which chapter, and why.
 */
class ChapterScrapingFailed
{
    /**
     * @param string $chapterId identifier of the chapter whose scraping failed
     * @param string $reason    failure description
     */
    public function __construct(
        private readonly string $chapterId,
        private readonly string $reason,
    ) {
    }

    /**
     * @return string identifier of the chapter whose scraping failed
     */
    public function getChapterId(): string
    {
        return $this->chapterId;
    }

    /**
     * @return string failure description given at construction
     */
    public function getReason(): string
    {
        return $this->reason;
    }
}
|
||||
39
src/Domain/Scraping/Domain/Model/Manga.php
Normal file
39
src/Domain/Scraping/Domain/Model/Manga.php
Normal file
@@ -0,0 +1,39 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Domain\Model;
|
||||
|
||||
/**
 * Immutable domain model of a manga as seen by the scraping domain.
 */
class Manga
{
    /**
     * @param string $id          manga identifier
     * @param string $title       display title
     * @param string $slug        slug of the manga
     * @param string $description description text
     * @param string $author      author name
     */
    public function __construct(
        private readonly string $id,
        private readonly string $title,
        private readonly string $slug,
        private readonly string $description,
        private readonly string $author,
    ) {
    }

    /** @return string manga identifier */
    public function getId(): string
    {
        return $this->id;
    }

    /** @return string display title */
    public function getTitle(): string
    {
        return $this->title;
    }

    /** @return string slug of the manga */
    public function getSlug(): string
    {
        return $this->slug;
    }

    /** @return string description text */
    public function getDescription(): string
    {
        return $this->description;
    }

    /** @return string author name */
    public function getAuthor(): string
    {
        return $this->author;
    }
}
|
||||
@@ -14,8 +14,8 @@ class ScrapingJob
|
||||
|
||||
public function __construct(
|
||||
private readonly string $id,
|
||||
private readonly string $chapterId,
|
||||
private readonly string $mangaId,
|
||||
private readonly string $chapterId,
|
||||
private readonly string $sourceId
|
||||
) {
|
||||
$this->status = ScrapingStatus::PENDING;
|
||||
|
||||
59
src/Domain/Scraping/Domain/Model/Source.php
Normal file
59
src/Domain/Scraping/Domain/Model/Source.php
Normal file
@@ -0,0 +1,59 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Domain\Model;
|
||||
|
||||
use DateTimeImmutable;
|
||||
|
||||
/**
 * Immutable domain model of a scraping source (a site chapters are read from).
 */
class Source
{
    /**
     * @param string               $id                  source identifier
     * @param string               $name                source name
     * @param string               $description         description text
     * @param string               $baseUrl             base URL of the source (HtmlScraper uses it as a sprintf format)
     * @param array<string, mixed> $scrappingParameters scraper settings; presumably keyed by
     *                                                  parameter name such as 'image_selector' — confirm
     * @param bool                 $isActive            active flag
     * @param DateTimeImmutable    $createdAt           creation timestamp
     * @param DateTimeImmutable    $updatedAt           last-update timestamp
     */
    public function __construct(
        private readonly string $id,
        private readonly string $name,
        private readonly string $description,
        private readonly string $baseUrl,
        private readonly array $scrappingParameters,
        private readonly bool $isActive,
        private readonly DateTimeImmutable $createdAt,
        private readonly DateTimeImmutable $updatedAt,
    ) {
    }

    /** @return string source identifier */
    public function getId(): string
    {
        return $this->id;
    }

    /** @return string source name */
    public function getName(): string
    {
        return $this->name;
    }

    /** @return string description text */
    public function getDescription(): string
    {
        return $this->description;
    }

    /** @return string base URL of the source */
    public function getBaseUrl(): string
    {
        return $this->baseUrl;
    }

    /** @return array<string, mixed> scraper settings exactly as given at construction */
    public function getScrappingParameters(): array
    {
        return $this->scrappingParameters;
    }

    /** @return bool active flag */
    public function isActive(): bool
    {
        return $this->isActive;
    }

    /** @return DateTimeImmutable creation timestamp */
    public function getCreatedAt(): DateTimeImmutable
    {
        return $this->createdAt;
    }

    /** @return DateTimeImmutable last-update timestamp */
    public function getUpdatedAt(): DateTimeImmutable
    {
        return $this->updatedAt;
    }
}
|
||||
18
src/Domain/Scraping/Domain/Model/ValueObject/ChapterId.php
Normal file
18
src/Domain/Scraping/Domain/Model/ValueObject/ChapterId.php
Normal file
@@ -0,0 +1,18 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Domain\Model\ValueObject;
|
||||
|
||||
/**
 * Value object wrapping a non-empty chapter identifier.
 */
class ChapterId
{
    /**
     * @param string $value raw chapter identifier
     *
     * @throws \InvalidArgumentException when $value is the empty string
     */
    public function __construct(private readonly string $value)
    {
        // Strict comparison instead of empty(): empty('0') is true, which would
        // wrongly reject the legitimate identifier "0".
        if ('' === $value) {
            throw new \InvalidArgumentException('Chapter ID cannot be empty');
        }
    }

    /**
     * @return string the identifier given at construction
     */
    public function getValue(): string
    {
        return $this->value;
    }
}
|
||||
18
src/Domain/Scraping/Domain/Model/ValueObject/SourceId.php
Normal file
18
src/Domain/Scraping/Domain/Model/ValueObject/SourceId.php
Normal file
@@ -0,0 +1,18 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Domain\Model\ValueObject;
|
||||
|
||||
/**
 * Value object wrapping a non-empty source identifier.
 */
class SourceId
{
    /**
     * @param string $value raw source identifier
     *
     * @throws \InvalidArgumentException when $value is the empty string
     */
    public function __construct(private readonly string $value)
    {
        // Strict comparison instead of empty(): empty('0') is true, which would
        // wrongly reject the legitimate identifier "0".
        if ('' === $value) {
            throw new \InvalidArgumentException('Source ID cannot be empty');
        }
    }

    /**
     * @return string the identifier given at construction
     */
    public function getValue(): string
    {
        return $this->value;
    }
}
|
||||
@@ -0,0 +1,18 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Domain\Model\ValueObject;
|
||||
|
||||
/**
 * Value object for a directory path that is guaranteed to exist once constructed.
 *
 * Construction has a filesystem side effect: the directory (and any missing
 * parent directories) is created when it does not exist yet.
 */
class TempDirectory
{
    /**
     * @param string $path directory path; created if missing
     *
     * @throws \RuntimeException when the directory does not exist and cannot be created
     */
    public function __construct(private readonly string $path)
    {
        // Recursive mkdir so nested paths work. The final is_dir() re-check covers
        // the race where another process creates the directory between the first
        // is_dir() and mkdir(): mkdir() then reports failure although the
        // directory is present.
        if (!is_dir($path) && !mkdir($path, 0777, true) && !is_dir($path)) {
            throw new \RuntimeException("Failed to create directory: $path");
        }
    }

    /**
     * @return string the directory path given at construction
     */
    public function getPath(): string
    {
        return $this->path;
    }
}
|
||||
@@ -0,0 +1,22 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Persistence;
|
||||
|
||||
use App\Domain\Scraping\Domain\Contract\Repository\MangaRepositoryInterface;
|
||||
use App\Domain\Scraping\Domain\Model\Manga;
|
||||
use App\Domain\Scraping\Infrastructure\Persistence\Entity\MangaEntity;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
|
||||
/**
 * Doctrine-backed implementation of MangaRepositoryInterface.
 */
class DoctrineMangaRepository implements MangaRepositoryInterface
{
    public function __construct(
        private readonly EntityManagerInterface $entityManager,
    ) {
    }

    /**
     * Finds the MangaEntity with the given identifier and converts it to the
     * domain model.
     *
     * @return Manga|null the domain object, or null when no entity was found
     */
    public function getById(string $id): ?Manga
    {
        $entity = $this->entityManager
            ->getRepository(MangaEntity::class)
            ->find($id);

        // Nullsafe call: a missing entity short-circuits to null.
        return $entity?->toDomain();
    }
}
|
||||
@@ -0,0 +1,26 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Persistence;
|
||||
|
||||
use App\Domain\Scraping\Domain\Model\Source;
|
||||
use App\Domain\Scraping\Domain\Repository\SourceRepositoryInterface;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
use App\Domain\Scraping\Infrastructure\Persistence\Entity\SourceEntity as SourceEntityEntity;
|
||||
|
||||
/**
 * Doctrine-backed implementation of SourceRepositoryInterface.
 */
class DoctrineSourceRepository implements SourceRepositoryInterface
{
    public function __construct(
        private readonly EntityManagerInterface $entityManager,
    ) {
    }

    /**
     * Finds the source entity with the given identifier and converts it to the
     * domain model.
     *
     * @return Source|null the domain object, or null when no entity was found
     */
    public function getById(string $id): ?Source
    {
        $repository = $this->entityManager->getRepository(SourceEntityEntity::class);
        $found = $repository->find($id);

        // Nullsafe call: a missing entity short-circuits to null.
        return $found?->toDomain();
    }
}
|
||||
@@ -0,0 +1,75 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Persistence\Entity;
|
||||
|
||||
use App\Domain\Scraping\Domain\Model\Manga;
|
||||
use Doctrine\DBAL\Types\Types;
|
||||
use Doctrine\ORM\Mapping as ORM;
|
||||
|
||||
/**
 * Doctrine mapping of the `mangas` table, with converters to and from the
 * Manga domain model.
 */
#[ORM\Entity]
#[ORM\Table(name: 'mangas')]
class MangaEntity
{
    #[ORM\Id]
    #[ORM\Column(type: 'string', length: 36)]
    private string $id;

    #[ORM\Column(length: 255)]
    private string $title;

    #[ORM\Column(length: 255, unique: true)]
    private string $slug;

    #[ORM\Column(length: 255, nullable: true)]
    private ?string $imageUrl = null;

    #[ORM\Column(nullable: true)]
    private ?int $publicationYear = null;

    #[ORM\Column(type: Types::TEXT, nullable: true)]
    private ?string $description = null;

    #[ORM\Column(type: Types::ARRAY, nullable: true)]
    private ?array $genres = null;

    #[ORM\Column(type: 'datetime_immutable')]
    private \DateTimeImmutable $createdAt;

    #[ORM\Column(nullable: true)]
    private ?float $rating = null;

    #[ORM\Column(length: 255, nullable: true)]
    private ?string $author = null;

    #[ORM\Column(length: 255, nullable: true)]
    private ?string $status = null;

    #[ORM\Column]
    private bool $monitored;

    /**
     * Builds an entity from the domain model, copying id, title, slug,
     * description and author.
     *
     * NOTE(review): $createdAt and $monitored are non-nullable, have no default
     * and are not assigned here, so they stay uninitialized on the returned
     * entity — presumably it cannot be persisted until they are set; confirm.
     */
    public static function fromDomain(Manga $manga): self
    {
        $entity = new self();
        $entity->id = $manga->getId();
        $entity->title = $manga->getTitle();
        $entity->slug = $manga->getSlug();
        $entity->description = $manga->getDescription();
        $entity->author = $manga->getAuthor();

        return $entity;
    }

    /**
     * Converts this entity to the Manga domain model.
     *
     * The description and author columns are nullable while Manga requires
     * strings, so NULL values are mapped to the empty string (the same
     * convention SourceEntity::toDomain() uses) instead of raising a TypeError.
     */
    public function toDomain(): Manga
    {
        return new Manga(
            $this->id,
            $this->title,
            $this->slug,
            $this->description ?? '',
            $this->author ?? '',
        );
    }
}
|
||||
@@ -3,7 +3,6 @@
|
||||
namespace App\Domain\Scraping\Infrastructure\Persistence\Entity;
|
||||
|
||||
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
||||
use App\Domain\Scraping\Domain\Model\ScrapingStatus;
|
||||
use Doctrine\ORM\Mapping as ORM;
|
||||
|
||||
#[ORM\Entity]
|
||||
@@ -59,25 +58,6 @@ class ScrapingJobEntity
|
||||
$this->sourceId
|
||||
);
|
||||
|
||||
// Reconstruire l'état du job à partir des données persistées
|
||||
$reflection = new \ReflectionClass(ScrapingJob::class);
|
||||
|
||||
$pagesProperty = $reflection->getProperty('pages');
|
||||
$pagesProperty->setAccessible(true);
|
||||
$pagesProperty->setValue($job, $this->pages);
|
||||
|
||||
$statusProperty = $reflection->getProperty('status');
|
||||
$statusProperty->setAccessible(true);
|
||||
$statusProperty->setValue($job, ScrapingStatus::from($this->status));
|
||||
|
||||
$createdAtProperty = $reflection->getProperty('createdAt');
|
||||
$createdAtProperty->setAccessible(true);
|
||||
$createdAtProperty->setValue($job, $this->createdAt);
|
||||
|
||||
$completedAtProperty = $reflection->getProperty('completedAt');
|
||||
$completedAtProperty->setAccessible(true);
|
||||
$completedAtProperty->setValue($job, $this->completedAt);
|
||||
|
||||
return $job;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,65 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Persistence\Entity;
|
||||
|
||||
use App\Domain\Scraping\Domain\Model\Source;
|
||||
use Doctrine\ORM\Mapping as ORM;
|
||||
|
||||
/**
 * Doctrine mapping of the `sources` table, with converters to and from the
 * Source domain model.
 */
#[ORM\Entity]
#[ORM\Table(name: 'sources')]
class SourceEntity
{
    #[ORM\Id]
    #[ORM\Column(type: 'string', length: 36)]
    private string $id;

    #[ORM\Column(type: 'string', nullable: true)]
    private ?string $name = null;

    #[ORM\Column(type: 'text', nullable: true)]
    private ?string $description = null;

    #[ORM\Column(type: 'string')]
    private string $baseUrl;

    #[ORM\Column(type: 'json')]
    private array $scrappingParameters = [];

    #[ORM\Column(type: 'boolean')]
    private bool $isActive;

    #[ORM\Column(type: 'datetime_immutable')]
    private \DateTimeImmutable $createdAt;

    #[ORM\Column(type: 'datetime_immutable')]
    private \DateTimeImmutable $updatedAt;

    /**
     * Builds an entity holding a copy of every field of the given domain source.
     */
    public static function fromDomain(Source $source): self
    {
        $entity = new self();

        $entity->id = $source->getId();
        $entity->name = $source->getName();
        $entity->description = $source->getDescription();
        $entity->baseUrl = $source->getBaseUrl();
        $entity->scrappingParameters = $source->getScrappingParameters();
        $entity->isActive = $source->isActive();
        $entity->createdAt = $source->getCreatedAt();
        $entity->updatedAt = $source->getUpdatedAt();

        return $entity;
    }

    /**
     * Converts this entity to the Source domain model.
     *
     * The nullable name and description columns are mapped to the empty string
     * because Source requires plain strings.
     */
    public function toDomain(): Source
    {
        $name = $this->name ?? '';
        $description = $this->description ?? '';

        return new Source(
            $this->id,
            $name,
            $description,
            $this->baseUrl,
            $this->scrappingParameters,
            $this->isActive,
            $this->createdAt,
            $this->updatedAt,
        );
    }
}
|
||||
@@ -0,0 +1,23 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Service;
|
||||
|
||||
use Symfony\Contracts\HttpClient\HttpClientInterface;
|
||||
|
||||
/**
 * Downloads a remote image to a local file through the injected HTTP client.
 */
class ImageDownloader
{
    public function __construct(
        private readonly HttpClientInterface $httpClient
    ) {
    }

    /**
     * Fetches $url and writes the response body to $destination.
     *
     * @param string $url         address of the image
     * @param string $destination path of the file to write
     *
     * @throws \RuntimeException when the response is not declared as an image
     *                           (including a missing Content-Type header) or the
     *                           file cannot be written
     */
    public function download(string $url, string $destination): void
    {
        $response = $this->httpClient->request('GET', $url);

        // A response without a Content-Type header is treated as "not an image"
        // instead of triggering an undefined-index warning and passing null on.
        $contentType = $response->getHeaders()['content-type'][0] ?? '';

        // Media types are case-insensitive, hence the lower-casing.
        if (!str_starts_with(strtolower($contentType), 'image/')) {
            throw new \RuntimeException('Invalid content type');
        }

        // file_put_contents() returns false on failure; surface that rather than
        // silently leaving the page missing.
        if (false === file_put_contents($destination, $response->getContent())) {
            throw new \RuntimeException("Failed to write image to: $destination");
        }
    }
}
|
||||
@@ -3,67 +3,37 @@
|
||||
namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
|
||||
|
||||
use App\Domain\Scraping\Domain\Contract\ScraperInterface;
|
||||
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
||||
use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
|
||||
use App\Domain\Scraping\Domain\Event\ChapterScrapingCompleted;
|
||||
use App\Domain\Scraping\Domain\Event\ChapterScrapingStarted;
|
||||
use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
|
||||
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
||||
use App\Domain\Scraping\Domain\Model\ScrapingProgress;
|
||||
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
|
||||
use Symfony\Contracts\HttpClient\HttpClientInterface;
|
||||
use App\Domain\Scraping\Domain\Model\Source;
|
||||
use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory;
|
||||
use App\Domain\Scraping\Infrastructure\Service\ImageDownloader;
|
||||
use Symfony\Component\Messenger\MessageBusInterface;
|
||||
use Ramsey\Uuid\Uuid;
|
||||
|
||||
abstract class AbstractScraper implements ScraperInterface
|
||||
{
|
||||
public function __construct(
|
||||
protected readonly HttpClientInterface $httpClient,
|
||||
protected readonly EventDispatcherInterface $eventDispatcher,
|
||||
protected readonly string $tempDir
|
||||
protected readonly ImageDownloader $imageDownloader,
|
||||
protected readonly MessageBusInterface $eventBus
|
||||
) {}
|
||||
|
||||
public function createScrapingJob(string $chapterId, string $sourceId): ScrapingJob
|
||||
public function createScrapingJob(string $mangaId, string $chapterId, string $sourceId): ScrapingJob
|
||||
{
|
||||
return new ScrapingJob(
|
||||
uniqid('scraping_'),
|
||||
Uuid::uuid4()->toString(),
|
||||
$mangaId,
|
||||
$chapterId,
|
||||
$sourceId
|
||||
$sourceId,
|
||||
);
|
||||
}
|
||||
|
||||
public function scrape(ScrapingJob $job): void
|
||||
{
|
||||
try {
|
||||
$this->eventDispatcher->dispatch(new ChapterScrapingStarted($job->getId()));
|
||||
|
||||
$tempDir = $this->createTempDirectory($job);
|
||||
$pageData = $this->scrapePages($job);
|
||||
|
||||
foreach ($pageData as $page) {
|
||||
$this->downloadPage($job, $page, $tempDir);
|
||||
}
|
||||
|
||||
$job->complete();
|
||||
|
||||
$this->eventDispatcher->dispatch(
|
||||
new ChapterScrapingCompleted($job->getId(), $job->getPages())
|
||||
);
|
||||
|
||||
$this->cleanupTempDirectory($tempDir);
|
||||
|
||||
} catch (\Exception $e) {
|
||||
$job->fail();
|
||||
throw $e;
|
||||
}
|
||||
}
|
||||
|
||||
abstract protected function scrapePages(ScrapingJob $job): array;
|
||||
|
||||
protected function createTempDirectory(ScrapingJob $job): string
|
||||
{
|
||||
$tempDir = $this->tempDir . '/' . uniqid('scraping_' . $job->getId() . '_');
|
||||
if (!mkdir($tempDir) && !is_dir($tempDir)) {
|
||||
throw new \RuntimeException("Failed to create temporary directory: $tempDir");
|
||||
}
|
||||
return $tempDir;
|
||||
}
|
||||
abstract public function scrape(ScrapingJob $job): void;
|
||||
|
||||
abstract protected function scrapePages(ScrapingJob $job, Source $source): array;
|
||||
|
||||
protected function cleanupTempDirectory(string $tempDir): void
|
||||
{
|
||||
@@ -84,11 +54,32 @@ abstract class AbstractScraper implements ScraperInterface
|
||||
}
|
||||
}
|
||||
|
||||
protected function dispatchProgressEvent(ScrapingJob $job, int $current, int $total): void
|
||||
protected function dispatchProgressEvent(ScrapingJob $job, int $currentPage, int $totalPages): void
|
||||
{
|
||||
$progress = new ScrapingProgress($current, $total);
|
||||
$this->eventDispatcher->dispatch(
|
||||
new PageScrapingProgressed($job->getId(), $progress)
|
||||
);
|
||||
$progress = new ScrapingProgress($currentPage, $totalPages);
|
||||
$this->eventBus->dispatch(new PageScrapingProgressed($job->getId(), $progress));
|
||||
}
|
||||
|
||||
protected function downloadImage(string $imageUrl, string $destination): void
|
||||
{
|
||||
$this->imageDownloader->download($imageUrl, $destination);
|
||||
}
|
||||
|
||||
protected function createTempDirectory(): TempDirectory
|
||||
{
|
||||
return new TempDirectory(sys_get_temp_dir() . '/' . uniqid('manga_scraper_'));
|
||||
}
|
||||
|
||||
protected function cleanupTempFiles(TempDirectory $tempDirectory): void
|
||||
{
|
||||
$files = glob($tempDirectory->getPath() . '/*');
|
||||
foreach ($files as $file) {
|
||||
if (is_file($file)) {
|
||||
unlink($file);
|
||||
}
|
||||
}
|
||||
rmdir($tempDirectory->getPath());
|
||||
}
|
||||
|
||||
abstract public function supports(string $sourceType): bool;
|
||||
}
|
||||
@@ -3,61 +3,131 @@
|
||||
namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
|
||||
|
||||
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
||||
use App\Domain\Scraping\Domain\Model\Source;
|
||||
use App\Domain\Scraping\Domain\Model\ValueObject\ImageUrl;
|
||||
use App\Domain\Scraping\Domain\Model\ValueObject\PageNumber;
|
||||
use App\Domain\Scraping\Domain\Repository\SourceRepositoryInterface;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
use Symfony\Contracts\HttpClient\HttpClientInterface;
|
||||
use Symfony\Component\Messenger\MessageBusInterface;
|
||||
use App\Domain\Scraping\Infrastructure\Service\ImageDownloader;
|
||||
|
||||
/**
 * Scraper for sources whose chapter pages are plain HTML.
 *
 * The image and pagination CSS selectors are read from the source's scraping
 * parameters ('image_selector' and 'next_page_selector').
 */
class HtmlScraper extends AbstractScraper
{
    public function __construct(
        ImageDownloader $imageDownloader,
        MessageBusInterface $eventBus,
        private readonly HttpClientInterface $httpClient,
        private readonly SourceRepositoryInterface $sourceRepository
    ) {
        parent::__construct($imageDownloader, $eventBus);
    }

    /**
     * Scrapes every page of the job's chapter: downloads each image into a
     * temporary directory, records it on the job and dispatches a progress
     * event. The job is marked complete on success and failed on exception;
     * the temporary files are removed either way.
     *
     * @throws \RuntimeException when the job's source does not exist
     * @throws \Exception        any scraping or download failure, rethrown after the job is marked failed
     */
    public function scrape(ScrapingJob $job): void
    {
        $sourceConfig = $this->sourceRepository->getById($job->getSourceId());

        // getById() is nullable; fail explicitly rather than passing null into
        // scrapePages(), which requires a Source.
        if (null === $sourceConfig) {
            $job->fail();

            throw new \RuntimeException('Source not found: ' . $job->getSourceId());
        }

        $tempDir = $this->createTempDirectory();

        try {
            $pages = $this->scrapePages($job, $sourceConfig);
            $totalPages = count($pages);

            foreach ($pages as $index => $imageUrl) {
                $pageNumber = new PageNumber($index + 1);
                // parse_url() can return null/false for URLs without a path; cast
                // so pathinfo() always receives a string.
                $extension = pathinfo((string) parse_url($imageUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
                $destination = sprintf(
                    '%s/%s.%s',
                    $tempDir->getPath(),
                    $pageNumber->getFormattedNumber(),
                    $extension
                );

                $this->downloadImage($imageUrl, $destination);
                $job->addPage($pageNumber, new ImageUrl($imageUrl));

                $this->dispatchProgressEvent($job, $index + 1, $totalPages);
            }

            $job->complete();
        } catch (\Exception $e) {
            $job->fail();

            throw $e;
        } finally {
            $this->cleanupTempFiles($tempDir);
        }
    }

    /**
     * Collects the image URLs of the chapter, in reading order.
     *
     * A source without a 'next_page_selector' parameter is read as a single
     * page holding all images; otherwise pages are followed one by one.
     *
     * @return list<string> image URLs
     */
    protected function scrapePages(ScrapingJob $job, Source $sourceConfig): array
    {
        // Source is not array-accessible; the selectors live in its parameters.
        $parameters = $sourceConfig->getScrappingParameters();

        // empty() keeps the original "missing or falsy" test without a warning
        // when the key is absent.
        if (empty($parameters['next_page_selector'])) {
            return $this->scrapeVerticalReader($job, $sourceConfig);
        }

        return $this->scrapeHorizontalReader($job, $sourceConfig);
    }

    /**
     * Reads all image URLs from the single chapter page.
     *
     * @return list<string> image URLs
     */
    private function scrapeVerticalReader(ScrapingJob $job, Source $sourceConfig): array
    {
        $imageSelector = $this->imageSelector($sourceConfig);
        $crawler = new Crawler($this->fetchHtml($this->buildChapterUrl($job, $sourceConfig)));

        return $crawler
            ->filter($imageSelector)
            ->each(fn (Crawler $node): string => $this->extractImageUrl($node));
    }

    /**
     * Reads one image per page, following the "next page" link until none is left.
     *
     * @return list<string> image URLs
     */
    private function scrapeHorizontalReader(ScrapingJob $job, Source $sourceConfig): array
    {
        $imageSelector = $this->imageSelector($sourceConfig);
        $nextPageSelector = $sourceConfig->getScrappingParameters()['next_page_selector'];

        $pages = [];
        $visited = [];
        $currentUrl = $this->buildChapterUrl($job, $sourceConfig);

        // The visited set stops the loop if a "next" link points back to a page
        // already scraped, which would otherwise never terminate.
        while ($currentUrl && !isset($visited[$currentUrl])) {
            $visited[$currentUrl] = true;

            $crawler = new Crawler($this->fetchHtml($currentUrl));
            $pages[] = $this->extractImageUrl($crawler->filter($imageSelector));

            // NOTE(review): the href is used as-is; relative links are not
            // resolved against the current URL — confirm sources use absolute ones.
            $nextLink = $crawler->filter($nextPageSelector);
            $currentUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;
        }

        return $pages;
    }

    /**
     * Returns the source's image CSS selector.
     *
     * @throws \RuntimeException when the 'image_selector' parameter is missing or not a non-empty string
     */
    private function imageSelector(Source $sourceConfig): string
    {
        $selector = $sourceConfig->getScrappingParameters()['image_selector'] ?? null;

        if (!is_string($selector) || '' === $selector) {
            throw new \RuntimeException(
                sprintf('Source "%s" has no "image_selector" scraping parameter', $sourceConfig->getId())
            );
        }

        return $selector;
    }

    /**
     * Reads the image URL from the first node: "src", falling back to "data-src".
     *
     * @throws \RuntimeException when neither attribute holds a value
     */
    private function extractImageUrl(Crawler $node): string
    {
        $url = $node->attr('src') ?: $node->attr('data-src');

        if (null === $url || '' === $url) {
            throw new \RuntimeException('Image element has neither a "src" nor a "data-src" attribute');
        }

        return $this->cleanImageUrl($url);
    }

    /**
     * Fetches a page and returns its body.
     *
     * @throws \RuntimeException when the server answers with a status of 400 or above
     */
    private function fetchHtml(string $url): string
    {
        $response = $this->httpClient->request('GET', $url);

        if ($response->getStatusCode() >= 400) {
            throw new \RuntimeException('Failed to fetch page: ' . $url);
        }

        return $response->getContent();
    }

    /**
     * Image URL clean-up hook; currently returns the URL unchanged.
     */
    private function cleanImageUrl(string $url): string
    {
        return $url;
    }

    /**
     * Builds the chapter URL by using the source's base URL as a sprintf format
     * that receives the chapter id.
     */
    private function buildChapterUrl(ScrapingJob $job, Source $sourceConfig): string
    {
        return sprintf(
            $sourceConfig->getBaseUrl(),
            $job->getChapterId()
        );
    }

    /**
     * This scraper handles the 'html' source type only.
     */
    public function supports(string $sourceType): bool
    {
        return 'html' === $sourceType;
    }
}
|
||||
@@ -1,38 +0,0 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
|
||||
|
||||
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
||||
use Symfony\Component\Panther\Client as PantherClient;
|
||||
|
||||
class JavascriptScraper extends AbstractScraper
|
||||
{
|
||||
protected function scrapePages(ScrapingJob $job): array
|
||||
{
|
||||
$client = PantherClient::createChromeClient();
|
||||
try {
|
||||
$url = $this->buildUrl($job);
|
||||
$crawler = $client->request('GET', $url);
|
||||
|
||||
// Attendre que les images soient chargées
|
||||
$crawler->waitFor('img.manga-page');
|
||||
|
||||
$pages = [];
|
||||
$crawler->filter('img.manga-page')->each(function ($image) use (&$pages) {
|
||||
$pages[] = [
|
||||
'url' => $image->attr('src'),
|
||||
'number' => count($pages) + 1
|
||||
];
|
||||
});
|
||||
|
||||
return $pages;
|
||||
} finally {
|
||||
$client->quit();
|
||||
}
|
||||
}
|
||||
|
||||
public function supports(string $sourceType): bool
|
||||
{
|
||||
return $sourceType === 'javascript';
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user