feat: scraping endpoints, job persistence, first unit tests, legacy entities usage

This commit is contained in:
ext.jeremy.guillot@maxicoffee.domains
2025-02-07 11:56:51 +01:00
parent c55cd62ec7
commit 0374ab0e46
34 changed files with 348 additions and 326 deletions

1
.gitignore vendored
View File

@@ -23,6 +23,7 @@
###> phpunit/phpunit ### ###> phpunit/phpunit ###
/phpunit.xml /phpunit.xml
.phpunit.result.cache .phpunit.result.cache
.phpunit.cache/*
###< phpunit/phpunit ### ###< phpunit/phpunit ###
###> symfony/webpack-encore-bundle ### ###> symfony/webpack-encore-bundle ###

View File

@@ -138,8 +138,8 @@ twig-extension: ## Create a new twig extension
stimulus: ## Create a new stimulus controller stimulus: ## Create a new stimulus controller
@$(SYMFONY) make:stimulus-controller @$(SYMFONY) make:stimulus-controller
consume: ## Consume messages consume:
@$(SYMFONY) messenger:consume async -vv @$(SYMFONY) messenger:consume commands events -vv
consume-schedule: ## Consume schedule messages consume-schedule: ## Consume schedule messages
@$(SYMFONY) messenger:consume async -vv scheduler_default @$(SYMFONY) messenger:consume async -vv scheduler_default

View File

@@ -1,22 +1,40 @@
framework: framework:
messenger: messenger:
# Uncomment this (and the failed transport below) to send failed messages to this transport for later handling. default_bus: command.bus
# failure_transport: failed # Transports
transports: transports:
# https://symfony.com/doc/current/messenger.html#transport-configuration commands:
async:
dsn: '%env(MESSENGER_TRANSPORT_DSN)%' dsn: '%env(MESSENGER_TRANSPORT_DSN)%'
retry_strategy: options:
max_retries: 0 queue_name: commands
# failed: 'doctrine://default?queue_name=failed' events:
# sync: 'sync://' dsn: '%env(MESSENGER_TRANSPORT_DSN)%'
options:
queue_name: events
# Buses configuration
buses:
command.bus:
middleware:
- validation
- doctrine_transaction
event.bus:
default_middleware: allow_no_handlers
# Message routing
routing: routing:
# Route your messages to the transports # Commands
'App\Message\DownloadChapter': async 'App\Domain\Scraping\Application\Command\ScrapeChapter': commands
'App\Message\RefreshMetadata': async
App\Message\RefreshAndDownloadChapters: async # Events
'App\Domain\Scraping\Domain\Event\ChapterScrapingStarted': events
'App\Domain\Scraping\Domain\Event\ChapterScrapingCompleted': events
'App\Domain\Scraping\Domain\Event\ChapterScrapingFailed': events
# Legacy messages (à garder si nécessaire)
'App\Message\DownloadChapter': commands
'App\Message\RefreshMetadata': commands
'App\Message\RefreshAndDownloadChapters': commands
# when@test: # when@test:
# framework: # framework:

View File

@@ -92,3 +92,7 @@ services:
App\Service\Scraper\MangaScraperService: App\Service\Scraper\MangaScraperService:
arguments: arguments:
$scraperFactory: '@App\Service\Scraper\ScraperFactory' $scraperFactory: '@App\Service\Scraper\ScraperFactory'
App\Domain\Scraping\Infrastructure\Handler\SymfonyScrapeChapterHandler:
tags:
- { name: messenger.message_handler, bus: command.bus }

View File

@@ -0,0 +1,34 @@
<?php
declare(strict_types=1);
namespace DoctrineMigrations;
use Doctrine\DBAL\Schema\Schema;
use Doctrine\Migrations\AbstractMigration;
/**
 * Creates the `scraping_jobs` table used to persist scraping jobs.
 *
 * The statements use PostgreSQL syntax (`TIMESTAMP(0) WITHOUT TIME ZONE`,
 * `COMMENT ON COLUMN`), so this migration targets a PostgreSQL platform.
 */
final class Version20250205231923 extends AbstractMigration
{
    /**
     * Short label shown by the migrations tooling when listing versions.
     */
    public function getDescription(): string
    {
        return 'Create the scraping_jobs table';
    }

    /**
     * Creates `scraping_jobs` and tags its two timestamp columns as
     * `datetime_immutable` for Doctrine's type mapping.
     */
    public function up(Schema $schema): void
    {
        $this->addSql('CREATE TABLE scraping_jobs (id VARCHAR(36) NOT NULL, chapter_number VARCHAR(255) NOT NULL, manga_id VARCHAR(255) NOT NULL, source_id VARCHAR(255) NOT NULL, pages JSON NOT NULL, status VARCHAR(255) NOT NULL, created_at TIMESTAMP(0) WITHOUT TIME ZONE NOT NULL, completed_at TIMESTAMP(0) WITHOUT TIME ZONE DEFAULT NULL, PRIMARY KEY(id))');
        $this->addSql('COMMENT ON COLUMN scraping_jobs.created_at IS \'(DC2Type:datetime_immutable)\'');
        $this->addSql('COMMENT ON COLUMN scraping_jobs.completed_at IS \'(DC2Type:datetime_immutable)\'');
    }

    /**
     * Reverts up() by dropping the table.
     */
    public function down(Schema $schema): void
    {
        // The auto-generated `CREATE SCHEMA public` statement was removed on purpose:
        // the `public` schema already exists on PostgreSQL, so that statement makes
        // the rollback fail before the table is ever dropped.
        $this->addSql('DROP TABLE scraping_jobs');
    }
}

View File

@@ -5,9 +5,9 @@ namespace App\Domain\Scraping\Application\Command;
readonly class ScrapeChapter readonly class ScrapeChapter
{ {
public function __construct( public function __construct(
public string $chapterId, public string $mangaId,
public string $sourceId, public string $chapterNumber,
public string $mangaId public string $sourceId
) { ) {
} }
} }

View File

@@ -7,6 +7,8 @@ use App\Domain\Scraping\Domain\Contract\Repository\ScrapingJobRepositoryInterfac
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface; use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
use App\Domain\Scraping\Domain\Event\ChapterScrapingFailed; use App\Domain\Scraping\Domain\Event\ChapterScrapingFailed;
use App\Domain\Scraping\Domain\Event\ChapterScrapingStarted; use App\Domain\Scraping\Domain\Event\ChapterScrapingStarted;
use App\Domain\Scraping\Domain\Model\ScrapingJob;
use Ramsey\Uuid\Uuid;
use Symfony\Component\Messenger\MessageBusInterface; use Symfony\Component\Messenger\MessageBusInterface;
readonly class ScrapeChapterHandler readonly class ScrapeChapterHandler
@@ -21,10 +23,11 @@ readonly class ScrapeChapterHandler
public function handle(ScrapeChapter $command): void public function handle(ScrapeChapter $command): void
{ {
try { try {
$job = $this->scraper->createScrapingJob( $job = new ScrapingJob(
Uuid::uuid4(),
$command->mangaId, $command->mangaId,
$command->chapterId, $command->chapterNumber,
$command->sourceId, $command->sourceId
); );
$this->scrapingJobRepository->save($job); $this->scrapingJobRepository->save($job);
@@ -33,7 +36,7 @@ readonly class ScrapeChapterHandler
$this->scraper->scrape($job); $this->scraper->scrape($job);
} catch (\Exception $e) { } catch (\Exception $e) {
$this->eventBus->dispatch(new ChapterScrapingFailed($command->chapterId, $e->getMessage())); $this->eventBus->dispatch(new ChapterScrapingFailed($command->mangaId, $command->chapterNumber, $e->getMessage()));
throw $e; throw $e;
} }
} }

View File

@@ -0,0 +1,8 @@
<?php
namespace App\Domain\Scraping\Domain\Contract\Service;
/**
 * Domain-side contract for fetching an image.
 *
 * Implementations live in the infrastructure layer so the domain does not
 * depend on a concrete HTTP client.
 */
interface ImageDownloader
{
/**
 * Downloads the resource located at $url to $destination.
 *
 * NOTE(review): $destination is presumably a local filesystem path, and
 * failures are presumably reported by exception since the method returns
 * void — confirm against the implementation.
 *
 * @param string $url         location of the image to fetch
 * @param string $destination where the downloaded content should be stored
 */
public function download(string $url, string $destination): void;
}

View File

@@ -6,7 +6,6 @@ use App\Domain\Scraping\Domain\Model\ScrapingJob;
interface ScraperInterface interface ScraperInterface
{ {
public function createScrapingJob(string $mangaId, string $chapterId, string $sourceId): ScrapingJob;
public function scrape(ScrapingJob $job): void; public function scrape(ScrapingJob $job): void;
public function supports(string $sourceType): bool; public function supports(string $sourceType): bool;
} }

View File

@@ -2,17 +2,23 @@
namespace App\Domain\Scraping\Domain\Event; namespace App\Domain\Scraping\Domain\Event;
class ChapterScrapingFailed readonly class ChapterScrapingFailed
{ {
public function __construct( public function __construct(
private readonly string $chapterId, private string $mangaId,
private readonly string $reason private string $chapterNumber,
private string $reason
) { ) {
} }
public function getChapterId(): string public function getMangaId(): string
{ {
return $this->chapterId; return $this->mangaId;
}
public function getChapterNumber(): string
{
return $this->chapterNumber;
} }
public function getReason(): string public function getReason(): string

View File

@@ -4,11 +4,11 @@ namespace App\Domain\Scraping\Domain\Event;
use App\Domain\Scraping\Domain\Model\ScrapingProgress; use App\Domain\Scraping\Domain\Model\ScrapingProgress;
class PageScrapingProgressed readonly class PageScrapingProgressed
{ {
public function __construct( public function __construct(
private readonly string $jobId, private string $jobId,
private readonly ScrapingProgress $progress private ScrapingProgress $progress
) { ) {
} }

View File

@@ -0,0 +1,9 @@
<?php
namespace App\Domain\Scraping\Domain\Exception;
use Exception;
/**
 * Signals that a requested scraping source does not exist.
 *
 * Adds no behaviour of its own; it exists so callers can catch this specific
 * failure instead of the generic Exception.
 */
class SourceNotFoundException extends Exception
{
}

View File

@@ -16,7 +16,7 @@ class ScrapingJob
public function __construct( public function __construct(
private readonly string $id, private readonly string $id,
private readonly string $mangaId, private readonly string $mangaId,
private readonly string $chapterId, private readonly float $chapterNumber,
private readonly string $sourceId private readonly string $sourceId
) { ) {
$this->status = ScrapingStatus::PENDING; $this->status = ScrapingStatus::PENDING;
@@ -48,9 +48,9 @@ class ScrapingJob
return $this->id; return $this->id;
} }
public function getChapterId(): string public function getChapterNumber(): float
{ {
return $this->chapterId; return $this->chapterNumber;
} }
public function getMangaId(): string public function getMangaId(): string

View File

@@ -2,11 +2,11 @@
namespace App\Domain\Scraping\Domain\Model; namespace App\Domain\Scraping\Domain\Model;
class ScrapingProgress readonly class ScrapingProgress
{ {
public function __construct( public function __construct(
private readonly int $pagesScraped, private int $pagesScraped,
private readonly int $totalPages private int $totalPages
) { ) {
} }

View File

@@ -2,23 +2,24 @@
namespace App\Domain\Scraping\Domain\Model; namespace App\Domain\Scraping\Domain\Model;
use App\Domain\Scraping\Domain\Model\ValueObject\SourceId;
use DateTimeImmutable; use DateTimeImmutable;
class Source readonly class Source
{ {
public function __construct( public function __construct(
private readonly string $id, private SourceId $id,
private readonly string $name, private string $name,
private readonly string $description, private string $description,
private readonly string $baseUrl, private string $baseUrl,
private readonly array $scrappingParameters, private array $scrappingParameters,
private readonly bool $isActive, private bool $isActive,
private readonly DateTimeImmutable $createdAt, private DateTimeImmutable $createdAt,
private readonly DateTimeImmutable $updatedAt private DateTimeImmutable $updatedAt
) { ) {
} }
public function getId(): string public function getId(): SourceId
{ {
return $this->id; return $this->id;
} }

View File

@@ -0,0 +1,42 @@
<?php
namespace App\Domain\Scraping\Domain\Model\ValueObject;
use Symfony\Component\HttpClient\Exception\InvalidArgumentException;
/**
 * Immutable value object that builds a chapter URL from a URL template.
 *
 * The template must contain the `{slug}` and `{chapterNumber}` placeholders,
 * which are substituted with the manga slug and the chapter number.
 *
 * Declared `readonly` so it is immutable like the other value objects of this
 * namespace (ImageUrl, SourceId, TempDirectory).
 */
readonly class ChapterUrl
{
    private string $chapterUrlFormat;
    private string $mangaSlug;
    private float $chapterNumber;

    /**
     * @param string $chapterUrlFormat URL template containing `{slug}` and `{chapterNumber}`
     * @param string $mangaSlug        value substituted for `{slug}`
     * @param float  $chapterNumber    value substituted for `{chapterNumber}`
     *
     * @throws InvalidArgumentException when one of the placeholders is missing from the template
     */
    public function __construct(
        string $chapterUrlFormat,
        string $mangaSlug,
        float $chapterNumber
    ) {
        $this->chapterUrlFormat = $this->validateUrlFormat($chapterUrlFormat);
        $this->mangaSlug = $mangaSlug;
        $this->chapterNumber = $chapterNumber;
    }

    /**
     * Returns the template with both placeholders substituted.
     *
     * The chapter number goes through PHP's float-to-string conversion, so a
     * whole number such as 12.0 is rendered as "12" and 10.5 as "10.5".
     */
    public function getUrl(): string
    {
        // strtr() substitutes every placeholder in a single pass, so text that
        // has just been inserted is never scanned for placeholders again.
        return strtr($this->chapterUrlFormat, [
            '{chapterNumber}' => (string) $this->chapterNumber,
            '{slug}' => $this->mangaSlug,
        ]);
    }

    /**
     * Returns $format unchanged once both placeholders are confirmed present.
     *
     * NOTE(review): the exception class is whatever the file imports under the
     * name InvalidArgumentException (currently the Symfony HttpClient one); the
     * sibling value objects throw the global \InvalidArgumentException — consider
     * aligning once callers are checked.
     *
     * @throws InvalidArgumentException when `{slug}` or `{chapterNumber}` is missing
     */
    private function validateUrlFormat(string $format): string
    {
        $hasSlug = str_contains($format, '{slug}');
        $hasChapterNumber = str_contains($format, '{chapterNumber}');

        if (!$hasSlug || !$hasChapterNumber) {
            throw new InvalidArgumentException('The URL format must contain both {slug} and {chapterNumber} placeholders.');
        }

        return $format;
    }
}

View File

@@ -2,10 +2,10 @@
namespace App\Domain\Scraping\Domain\Model\ValueObject; namespace App\Domain\Scraping\Domain\Model\ValueObject;
class ImageUrl readonly class ImageUrl
{ {
public function __construct( public function __construct(
private readonly string $url private string $url
) { ) {
if (!filter_var($url, FILTER_VALIDATE_URL)) { if (!filter_var($url, FILTER_VALIDATE_URL)) {
throw new \InvalidArgumentException('Invalid image URL provided'); throw new \InvalidArgumentException('Invalid image URL provided');

View File

@@ -2,9 +2,9 @@
namespace App\Domain\Scraping\Domain\Model\ValueObject; namespace App\Domain\Scraping\Domain\Model\ValueObject;
class SourceId readonly class SourceId
{ {
public function __construct(private readonly string $value) public function __construct(private string $value)
{ {
if (empty($value)) { if (empty($value)) {
throw new \InvalidArgumentException('Source ID cannot be empty'); throw new \InvalidArgumentException('Source ID cannot be empty');

View File

@@ -2,9 +2,9 @@
namespace App\Domain\Scraping\Domain\Model\ValueObject; namespace App\Domain\Scraping\Domain\Model\ValueObject;
class TempDirectory readonly class TempDirectory
{ {
public function __construct(private readonly string $path) public function __construct(private string $path)
{ {
if (!is_dir($path) && !mkdir($path)) { if (!is_dir($path) && !mkdir($path)) {
throw new \RuntimeException("Failed to create directory: $path"); throw new \RuntimeException("Failed to create directory: $path");

View File

@@ -0,0 +1,21 @@
<?php
namespace App\Domain\Scraping\Infrastructure\Handler;
use App\Domain\Scraping\Application\Command\ScrapeChapter;
use App\Domain\Scraping\Application\CommandHandler\ScrapeChapterHandler;
use Symfony\Component\Messenger\Attribute\AsMessageHandler;
/**
 * Symfony Messenger adapter around the application-level ScrapeChapterHandler.
 *
 * Messenger invokes handlers through __invoke(); this class forwards the
 * command to ScrapeChapterHandler::handle() so the application handler does
 * not have to know about Messenger.
 *
 * The attribute is scoped to `command.bus`, matching the service tag declared
 * for this class, so both declarations agree on where ScrapeChapter is handled.
 */
#[AsMessageHandler(bus: 'command.bus')]
class SymfonyScrapeChapterHandler
{
    /**
     * @param ScrapeChapterHandler $handler application handler that does the actual work
     */
    public function __construct(
        private readonly ScrapeChapterHandler $handler
    ) {
    }

    /**
     * Forwards the command to the wrapped handler; anything it throws propagates.
     */
    public function __invoke(ScrapeChapter $command): void
    {
        $this->handler->handle($command);
    }
}

View File

@@ -1,23 +0,0 @@
<?php
namespace App\Domain\Scraping\Infrastructure\Persistence;
use App\Domain\Scraping\Domain\Contract\Repository\MangaRepositoryInterface;
use App\Domain\Scraping\Domain\Model\Manga;
use App\Domain\Scraping\Infrastructure\Persistence\Entity\MangaEntity;
use Doctrine\ORM\EntityManagerInterface;
/**
 * Looks mangas up through the Doctrine MangaEntity mapping and returns them
 * as domain Manga objects.
 */
class DoctrineMangaRepository implements MangaRepositoryInterface
{
    public function __construct(
        private readonly EntityManagerInterface $entityManager
    ) {
    }

    /**
     * Finds the entity by primary key and converts it with toDomain().
     *
     * @return Manga|null null when no entity is found for $id
     */
    public function getById(string $id): ?Manga
    {
        // Nullsafe call: a missing entity short-circuits to null.
        return $this->entityManager
            ->getRepository(MangaEntity::class)
            ->find($id)
            ?->toDomain();
    }
}

View File

@@ -1,27 +0,0 @@
<?php
namespace App\Domain\Scraping\Infrastructure\Persistence;
use App\Domain\Scraping\Domain\Model\Source;
use App\Domain\Scraping\Domain\Repository\SourceRepositoryInterface;
use Doctrine\ORM\EntityManagerInterface;
use App\Domain\Scraping\Infrastructure\Persistence\Entity\SourceEntity as SourceEntityEntity;
/**
 * Looks sources up through the Doctrine SourceEntity mapping and returns them
 * as domain Source objects.
 */
class DoctrineSourceRepository implements SourceRepositoryInterface
{
    public function __construct(
        private readonly EntityManagerInterface $entityManager
    ) {
    }

    /**
     * Finds the entity by primary key and converts it with toDomain().
     *
     * @return Source|null null when no entity is found for $id
     */
    public function getById(string $id): ?Source
    {
        $repository = $this->entityManager->getRepository(SourceEntityEntity::class);
        $found = $repository->find($id);

        return null === $found ? null : $found->toDomain();
    }
}

View File

@@ -1,75 +0,0 @@
<?php
namespace App\Domain\Scraping\Infrastructure\Persistence\Entity;
use App\Domain\Scraping\Domain\Model\Manga;
use Doctrine\DBAL\Types\Types;
use Doctrine\ORM\Mapping as ORM;
/**
 * Doctrine mapping of the `mangas` table for the scraping domain, with
 * converters to and from the domain Manga model.
 *
 * Only id, title, slug, description and author travel between the entity and
 * the domain model; the remaining columns are mapped but never copied by
 * fromDomain()/toDomain().
 */
#[ORM\Entity]
#[ORM\Table(name: 'mangas')]
class MangaEntity
{
// Primary key, stored as a string of at most 36 characters.
#[ORM\Id]
#[ORM\Column(type: 'string', length: 36)]
private string $id;
#[ORM\Column(length: 255)]
private string $title;
// Unique per manga at the database level.
#[ORM\Column(length: 255, unique: true)]
private string $slug;
#[ORM\Column(length: 255, nullable: true)]
private ?string $imageUrl = null;
#[ORM\Column(nullable: true)]
private ?int $publicationYear = null;
#[ORM\Column(type: Types::TEXT, nullable: true)]
private ?string $description = null;
#[ORM\Column(type: Types::ARRAY, nullable: true)]
private ?array $genres = null;
// Non-nullable and without a default value.
#[ORM\Column(type: 'datetime_immutable')]
private \DateTimeImmutable $createdAt;
#[ORM\Column(nullable: true)]
private ?float $rating = null;
#[ORM\Column(length: 255, nullable: true)]
private ?string $author = null;
#[ORM\Column(length: 255, nullable: true)]
private ?string $status = null;
// Non-nullable and without a default value.
#[ORM\Column]
private bool $monitored;
/**
 * Builds an entity from a domain Manga, copying id, title, slug, description
 * and author.
 *
 * NOTE(review): $createdAt and $monitored are not assigned here and have no
 * default, so they remain uninitialized on the returned entity; reading them
 * (which persisting presumably does) would raise an Error — confirm whether
 * this factory is ever used for writes.
 */
public static function fromDomain(Manga $manga): self
{
$entity = new self();
$entity->id = $manga->getId();
$entity->title = $manga->getTitle();
$entity->slug = $manga->getSlug();
$entity->description = $manga->getDescription();
$entity->author = $manga->getAuthor();
return $entity;
}
/**
 * Converts this entity to a domain Manga built from id, title, slug,
 * description and author, in that argument order.
 */
public function toDomain(): Manga
{
$manga = new Manga(
$this->id,
$this->title,
$this->slug,
$this->description,
$this->author
);
return $manga;
}
}

View File

@@ -14,7 +14,7 @@ class ScrapingJobEntity
private string $id; private string $id;
#[ORM\Column(type: 'string')] #[ORM\Column(type: 'string')]
private string $chapterId; private string $chapterNumber;
#[ORM\Column(type: 'string')] #[ORM\Column(type: 'string')]
private string $mangaId; private string $mangaId;
@@ -38,7 +38,7 @@ class ScrapingJobEntity
{ {
$entity = new self(); $entity = new self();
$entity->id = $job->getId(); $entity->id = $job->getId();
$entity->chapterId = $job->getChapterId(); $entity->chapterNumber = $job->getChapterNumber();
$entity->mangaId = $job->getMangaId(); $entity->mangaId = $job->getMangaId();
$entity->sourceId = $job->getSourceId(); $entity->sourceId = $job->getSourceId();
$entity->pages = $job->getPages(); $entity->pages = $job->getPages();
@@ -52,10 +52,10 @@ class ScrapingJobEntity
public function toDomain(): ScrapingJob public function toDomain(): ScrapingJob
{ {
$job = new ScrapingJob( $job = new ScrapingJob(
$this->id, id: $this->id,
$this->chapterId, mangaId: $this->mangaId,
$this->mangaId, chapterNumber: $this->chapterNumber,
$this->sourceId sourceId: $this->sourceId
); );
return $job; return $job;

View File

@@ -1,65 +0,0 @@
<?php
namespace App\Domain\Scraping\Infrastructure\Persistence\Entity;
use App\Domain\Scraping\Domain\Model\Source;
use Doctrine\ORM\Mapping as ORM;
/**
 * Doctrine mapping of the `sources` table for the scraping domain, with
 * converters to and from the domain Source model.
 */
#[ORM\Entity]
#[ORM\Table(name: 'sources')]
class SourceEntity
{
// Primary key, stored as a string of at most 36 characters.
#[ORM\Id]
#[ORM\Column(type: 'string', length: 36)]
private string $id;
#[ORM\Column(type: 'string', nullable: true)]
private ?string $name = null;
#[ORM\Column(type: 'text', nullable: true)]
private ?string $description = null;
#[ORM\Column(type: 'string')]
private string $baseUrl;
// Stored as JSON; defaults to an empty array.
#[ORM\Column(type: 'json')]
private array $scrappingParameters = [];
#[ORM\Column(type: 'boolean')]
private bool $isActive;
#[ORM\Column(type: 'datetime_immutable')]
private \DateTimeImmutable $createdAt;
#[ORM\Column(type: 'datetime_immutable')]
private \DateTimeImmutable $updatedAt;
/**
 * Builds an entity from a domain Source by copying every mapped field
 * through the Source getters.
 */
public static function fromDomain(Source $source): self
{
$entity = new self();
$entity->id = $source->getId();
$entity->name = $source->getName();
$entity->description = $source->getDescription();
$entity->baseUrl = $source->getBaseUrl();
$entity->scrappingParameters = $source->getScrappingParameters();
$entity->isActive = $source->isActive();
$entity->createdAt = $source->getCreatedAt();
$entity->updatedAt = $source->getUpdatedAt();
return $entity;
}
/**
 * Converts this entity to a domain Source.
 *
 * A null name or description is passed on as an empty string; every other
 * field is passed through unchanged.
 */
public function toDomain(): Source
{
return new Source(
$this->id,
$this->name ?? '',
$this->description ?? '',
$this->baseUrl,
$this->scrappingParameters,
$this->isActive,
$this->createdAt,
$this->updatedAt
);
}
}

View File

@@ -0,0 +1,30 @@
<?php
namespace App\Domain\Scraping\Infrastructure\Persistence;
use App\Domain\Scraping\Domain\Contract\Repository\MangaRepositoryInterface;
use App\Domain\Scraping\Domain\Model\Manga;
use App\Entity\Manga as EntityManga;
use Doctrine\ORM\EntityManagerInterface;
/**
 * Serves the scraping domain's manga lookups from the legacy App\Entity\Manga
 * Doctrine entity, mapping it to the domain Manga model.
 */
readonly class LegacyMangaRepository implements MangaRepositoryInterface
{
    public function __construct(
        private EntityManagerInterface $entityManager
    ) {
    }

    /**
     * Finds the legacy entity by primary key and maps it to a domain Manga
     * built from its id, title, slug, description and author.
     *
     * @return Manga|null null when no legacy entity is found for $id
     */
    public function getById(string $id): ?Manga
    {
        /** @var EntityManga|null $legacyManga */
        $legacyManga = $this->entityManager
            ->getRepository(EntityManga::class)
            ->find($id);

        if (null === $legacyManga) {
            return null;
        }

        return new Manga(
            $legacyManga->getId(),
            $legacyManga->getTitle(),
            $legacyManga->getSlug(),
            $legacyManga->getDescription(),
            $legacyManga->getAuthor(),
        );
    }
}

View File

@@ -0,0 +1,49 @@
<?php
namespace App\Domain\Scraping\Infrastructure\Persistence;
use App\Domain\Scraping\Domain\Contract\Repository\SourceRepositoryInterface;
use App\Domain\Scraping\Domain\Exception\SourceNotFoundException;
use App\Domain\Scraping\Domain\Model\Source;
use App\Domain\Scraping\Domain\Model\ValueObject\SourceId;
use App\Entity\ContentSource;
use DateTimeImmutable;
use Doctrine\ORM\EntityManagerInterface;
/**
 * Serves the scraping domain's source lookups from the legacy ContentSource
 * Doctrine entity, mapping it to the domain Source model.
 */
readonly class LegacySourceRepository implements SourceRepositoryInterface
{
    public function __construct(
        private EntityManagerInterface $entityManager
    ) {
    }

    /**
     * Finds the legacy source by primary key and maps it to a domain Source.
     *
     * The domain name is the legacy "clean" base URL, the description is the
     * base URL prefixed with "Legacy Source: ", the source is always reported
     * as active, and both timestamps are set to the moment of the call.
     *
     * @throws SourceNotFoundException when no legacy source is found for $id
     */
    public function getById(string $id): Source
    {
        /** @var ContentSource|null $legacySource */
        $legacySource = $this->entityManager
            ->getRepository(ContentSource::class)
            ->find($id);

        if (null === $legacySource) {
            throw new SourceNotFoundException('Source not found');
        }

        return new Source(
            id: new SourceId($legacySource->getId()),
            name: $legacySource->getCleanBaseUrl(),
            description: 'Legacy Source: ' . $legacySource->getBaseUrl(),
            baseUrl: $legacySource->getBaseUrl(),
            scrappingParameters: $this->scrapingParametersOf($legacySource),
            isActive: true,
            createdAt: new DateTimeImmutable(),
            updatedAt: new DateTimeImmutable()
        );
    }

    /**
     * Collects the legacy selector and format settings into the parameter map
     * carried by the domain Source.
     */
    private function scrapingParametersOf(ContentSource $legacySource): array
    {
        return [
            'imageSelector' => $legacySource->getImageSelector(),
            'nextPageSelector' => $legacySource->getNextPageSelector(),
            'chapterUrlFormat' => $legacySource->getChapterUrlFormat(),
            'scrapingType' => $legacySource->getScrapingType(),
            'chapterSelector' => $legacySource->getChapterSelector()
        ];
    }
}

View File

@@ -3,11 +3,12 @@
namespace App\Domain\Scraping\Infrastructure\Service; namespace App\Domain\Scraping\Infrastructure\Service;
use Symfony\Contracts\HttpClient\HttpClientInterface; use Symfony\Contracts\HttpClient\HttpClientInterface;
use App\Domain\Scraping\Domain\Contract\Service\ImageDownloader as ImageDownloaderInterface;
class ImageDownloader readonly class ImageDownloader implements ImageDownloaderInterface
{ {
public function __construct( public function __construct(
private readonly HttpClientInterface $httpClient private HttpClientInterface $httpClient
) { ) {
} }

View File

@@ -2,9 +2,7 @@
namespace App\Domain\Scraping\Infrastructure\Service\Scraper; namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
use App\Domain\Scraping\Domain\Contract\ScraperInterface; use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
use App\Domain\Scraping\Domain\Event\ChapterScrapingCompleted;
use App\Domain\Scraping\Domain\Event\ChapterScrapingStarted;
use App\Domain\Scraping\Domain\Event\PageScrapingProgressed; use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
use App\Domain\Scraping\Domain\Model\ScrapingJob; use App\Domain\Scraping\Domain\Model\ScrapingJob;
use App\Domain\Scraping\Domain\Model\ScrapingProgress; use App\Domain\Scraping\Domain\Model\ScrapingProgress;
@@ -22,16 +20,6 @@ abstract class AbstractScraper implements ScraperInterface
) { ) {
} }
public function createScrapingJob(string $mangaId, string $chapterId, string $sourceId): ScrapingJob
{
return new ScrapingJob(
Uuid::uuid4()->toString(),
$mangaId,
$chapterId,
$sourceId,
);
}
abstract public function scrape(ScrapingJob $job): void; abstract public function scrape(ScrapingJob $job): void;
abstract protected function scrapePages(ScrapingJob $job, Source $source): array; abstract protected function scrapePages(ScrapingJob $job, Source $source): array;

View File

@@ -2,11 +2,14 @@
namespace App\Domain\Scraping\Infrastructure\Service\Scraper; namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
use App\Domain\Scraping\Domain\Contract\Repository\MangaRepositoryInterface;
use App\Domain\Scraping\Domain\Contract\Repository\ScrapingJobRepositoryInterface;
use App\Domain\Scraping\Domain\Model\ScrapingJob; use App\Domain\Scraping\Domain\Model\ScrapingJob;
use App\Domain\Scraping\Domain\Model\Source; use App\Domain\Scraping\Domain\Model\Source;
use App\Domain\Scraping\Domain\Model\ValueObject\ImageUrl; use App\Domain\Scraping\Domain\Model\ValueObject\ImageUrl;
use App\Domain\Scraping\Domain\Model\ValueObject\PageNumber; use App\Domain\Scraping\Domain\Model\ValueObject\PageNumber;
use App\Domain\Scraping\Domain\Repository\SourceRepositoryInterface; use App\Domain\Scraping\Domain\Contract\Repository\SourceRepositoryInterface;
use App\Domain\Scraping\Domain\Model\ValueObject\ChapterUrl;
use Symfony\Component\DomCrawler\Crawler; use Symfony\Component\DomCrawler\Crawler;
use Symfony\Contracts\HttpClient\HttpClientInterface; use Symfony\Contracts\HttpClient\HttpClientInterface;
use Symfony\Component\Messenger\MessageBusInterface; use Symfony\Component\Messenger\MessageBusInterface;
@@ -18,7 +21,9 @@ class HtmlScraper extends AbstractScraper
ImageDownloader $imageDownloader, ImageDownloader $imageDownloader,
MessageBusInterface $eventBus, MessageBusInterface $eventBus,
private readonly HttpClientInterface $httpClient, private readonly HttpClientInterface $httpClient,
private readonly SourceRepositoryInterface $sourceRepository private readonly SourceRepositoryInterface $sourceRepository,
private readonly MangaRepositoryInterface $mangaRepository,
private readonly ScrapingJobRepositoryInterface $scrapingJobRepository,
) { ) {
parent::__construct($imageDownloader, $eventBus); parent::__construct($imageDownloader, $eventBus);
} }
@@ -48,21 +53,25 @@ class HtmlScraper extends AbstractScraper
} }
$job->complete(); $job->complete();
$this->scrapingJobRepository->save($job);
} catch (\Exception $e) { } catch (\Exception $e) {
$job->fail(); $job->fail();
$this->scrapingJobRepository->save($job);
throw $e; throw $e;
} finally { } finally {
$this->cleanupTempFiles($tempDir); $this->cleanupTempFiles($tempDir);
} }
} }
protected function scrapePages(ScrapingJob $job, Source $sourceConfig): array protected function scrapePages(ScrapingJob $job, Source $source): array
{ {
if (!$sourceConfig['next_page_selector']) { $scrappingParameters = $source->getScrappingParameters();
return $this->scrapeVerticalReader($job, $sourceConfig);
if (!$scrappingParameters['nextPageSelector']) {
return $this->scrapeVerticalReader($job, $source);
} }
return $this->scrapeHorizontalReader($job, $sourceConfig); return $this->scrapeHorizontalReader($job, $source);
} }
private function scrapeVerticalReader(ScrapingJob $job, Source $sourceConfig): array private function scrapeVerticalReader(ScrapingJob $job, Source $sourceConfig): array
@@ -70,7 +79,7 @@ class HtmlScraper extends AbstractScraper
$html = $this->fetchHtml($this->buildChapterUrl($job, $sourceConfig)); $html = $this->fetchHtml($this->buildChapterUrl($job, $sourceConfig));
$crawler = new Crawler($html); $crawler = new Crawler($html);
return $crawler->filter($sourceConfig['image_selector']) return $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector'])
->each(function ($node) { ->each(function ($node) {
return $this->cleanImageUrl( return $this->cleanImageUrl(
$node->attr('src') ?: $node->attr('data-src') $node->attr('src') ?: $node->attr('data-src')
@@ -87,13 +96,20 @@ class HtmlScraper extends AbstractScraper
$html = $this->fetchHtml($currentUrl); $html = $this->fetchHtml($currentUrl);
$crawler = new Crawler($html); $crawler = new Crawler($html);
$imageUrl = $crawler->filter($sourceConfig['image_selector']) $imageUrl = $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector'])
->attr('src') ?: $crawler->filter($sourceConfig['image_selector']) ->attr('src') ?: $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector'])
->attr('data-src'); ->attr('data-src');
if (!preg_match('/^https?:\/\//', $imageUrl)) {
$urlComponents = parse_url($sourceConfig->getScrappingParameters()['chapterUrlFormat']);
$scheme = $urlComponents['scheme'];
$host = $urlComponents['host'];
$imageUrl = $scheme.'://'.$host.'/'.ltrim($imageUrl, '/');
}
$pages[] = $this->cleanImageUrl($imageUrl); $pages[] = $this->cleanImageUrl($imageUrl);
$nextLink = $crawler->filter($sourceConfig['next_page_selector']); $nextLink = $crawler->filter($sourceConfig->getScrappingParameters()['nextPageSelector']);
$currentUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null; $currentUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;
} }
@@ -120,10 +136,9 @@ class HtmlScraper extends AbstractScraper
private function buildChapterUrl(ScrapingJob $job, Source $sourceConfig): string private function buildChapterUrl(ScrapingJob $job, Source $sourceConfig): string
{ {
return sprintf( $manga = $this->mangaRepository->getById($job->getMangaId());
$sourceConfig->getBaseUrl(), $chapterUrl = new ChapterUrl($sourceConfig->getScrappingParameters()['chapterUrlFormat'], $manga->getSlug(), $job->getChapterNumber());
$job->getChapterId() return $chapterUrl->getUrl();
);
} }
public function supports(string $sourceType): bool public function supports(string $sourceType): bool

View File

@@ -22,21 +22,21 @@ use Symfony\Component\Validator\Constraints as Assert;
#[ORM\Entity(repositoryClass: UserRepository::class)] #[ORM\Entity(repositoryClass: UserRepository::class)]
#[ORM\Table(name: '`user`')] #[ORM\Table(name: '`user`')]
#[ApiResource( // #[ApiResource(
operations: [ // operations: [
new Get(), // new Get(),
new GetCollection(), // new GetCollection(),
new Post( // new Post(
validationContext: ['groups' => ['Default', 'postValidation']] // validationContext: ['groups' => ['Default', 'postValidation']]
), // ),
new Put(), // new Put(),
new Patch(), // new Patch(),
new Delete() // new Delete()
], // ],
normalizationContext: ['groups' => ['user:read']], // normalizationContext: ['groups' => ['user:read']],
denormalizationContext: ['groups' => ['user:write']], // denormalizationContext: ['groups' => ['user:write']],
security: "is_granted('ROLE_USER')" // security: "is_granted('ROLE_USER')"
)] // )]
#[UniqueEntity(fields: ['email'], message: 'There is already an account with this email')] #[UniqueEntity(fields: ['email'], message: 'There is already an account with this email')]
class User implements UserInterface, PasswordAuthenticatedUserInterface class User implements UserInterface, PasswordAuthenticatedUserInterface
{ {

View File

@@ -8,21 +8,8 @@ use Ramsey\Uuid\Uuid;
class InMemoryScraperAdapter implements ScraperInterface class InMemoryScraperAdapter implements ScraperInterface
{ {
private array $jobs = [];
private ?\Exception $shouldThrowException = null; private ?\Exception $shouldThrowException = null;
public function createScrapingJob(string $mangaId, string $chapterId, string $sourceId): ScrapingJob
{
if ($this->shouldThrowException) {
throw $this->shouldThrowException;
}
$job = new ScrapingJob(Uuid::uuid4(), $mangaId, $chapterId, $sourceId);
$this->jobs[] = $job;
return $job;
}
public function scrape(ScrapingJob $job): void public function scrape(ScrapingJob $job): void
{ {
if ($this->shouldThrowException) { if ($this->shouldThrowException) {
@@ -35,11 +22,6 @@ class InMemoryScraperAdapter implements ScraperInterface
$this->shouldThrowException = $exception; $this->shouldThrowException = $exception;
} }
public function getJobs(): array
{
return $this->jobs;
}
public function supports(string $sourceType): bool public function supports(string $sourceType): bool
{ {
return true; return true;

View File

@@ -33,14 +33,14 @@ class ScrapeChapterHandlerTest extends TestCase
public function testHandleSuccessfully(): void public function testHandleSuccessfully(): void
{ {
$command = new ScrapeChapter( $command = new ScrapeChapter(
chapterId: 2, mangaId: 1,
chapterNumber: 2,
sourceId: 3, sourceId: 3,
mangaId: 1
); );
$this->handler->handle($command); $this->handler->handle($command);
$scrapingJobs = $this->scraper->getJobs(); $scrapingJobs = $this->repository->getJobs();
$this->assertCount(1, $scrapingJobs); $this->assertCount(1, $scrapingJobs);
$job = $scrapingJobs[0]; $job = $scrapingJobs[0];
@@ -57,9 +57,9 @@ class ScrapeChapterHandlerTest extends TestCase
public function testHandleThrowsException(): void public function testHandleThrowsException(): void
{ {
$command = new ScrapeChapter( $command = new ScrapeChapter(
chapterId: 2, mangaId: 1,
chapterNumber: 2,
sourceId: 3, sourceId: 3,
mangaId: 1
); );
$exception = new \Exception('Scraping failed'); $exception = new \Exception('Scraping failed');
@@ -72,10 +72,11 @@ class ScrapeChapterHandlerTest extends TestCase
$this->handler->handle($command); $this->handler->handle($command);
} finally { } finally {
$dispatchedMessages = $this->eventBus->getDispatchedMessages(); $dispatchedMessages = $this->eventBus->getDispatchedMessages();
$this->assertCount(1, $dispatchedMessages); $this->assertCount(2, $dispatchedMessages);
$this->assertInstanceOf(ChapterScrapingFailed::class, $dispatchedMessages[0]); $this->assertInstanceOf(ChapterScrapingStarted::class, $dispatchedMessages[0]);
$this->assertEquals(2, $dispatchedMessages[0]->getChapterId()); $this->assertInstanceOf(ChapterScrapingFailed::class, $dispatchedMessages[1]);
$this->assertEquals('Scraping failed', $dispatchedMessages[0]->getReason()); $this->assertEquals(2, $dispatchedMessages[1]->getChapterNumber());
$this->assertEquals('Scraping failed', $dispatchedMessages[1]->getReason());
} }
} }
} }

View File

@@ -30,7 +30,7 @@ class ScrapingStatusTest extends ApiTestCase
{ {
// Given // Given
$jobId = Uuid::uuid4()->toString(); $jobId = Uuid::uuid4()->toString();
$job = new ScrapingJob($jobId, 'manga-123', 'chapter-456', 'source-789'); $job = new ScrapingJob($jobId, 'manga-123', 1, 'source-789');
$job->addPage(new PageNumber(1), new ImageUrl('http://example.com/page1.jpg')); $job->addPage(new PageNumber(1), new ImageUrl('http://example.com/page1.jpg'));
$job->addPage(new PageNumber(2), new ImageUrl('http://example.com/page2.jpg')); $job->addPage(new PageNumber(2), new ImageUrl('http://example.com/page2.jpg'));