From 0374ab0e46651c9a7335b12042f8dc55231b08af Mon Sep 17 00:00:00 2001 From: "ext.jeremy.guillot@maxicoffee.domains" Date: Fri, 7 Feb 2025 11:56:51 +0100 Subject: [PATCH] feat: scraping endpoints, job persistence, firsts unit tests, legacy entities usage --- .gitignore | 1 + Makefile | 4 +- config/packages/messenger.yaml | 44 +++++++---- config/services.yaml | 4 + migrations/Version20250205231923.php | 34 +++++++++ .../Application/Command/ScrapeChapter.php | 6 +- .../CommandHandler/ScrapeChapterHandler.php | 11 ++- .../Contract/Service/ImageDownloader.php | 8 ++ .../Contract/Service/ScraperInterface.php | 1 - .../Domain/Event/ChapterScrapingFailed.php | 16 ++-- .../Domain/Event/PageScrapingProgressed.php | 6 +- .../Exception/SourceNotFoundException.php | 9 +++ .../Scraping/Domain/Model/ScrapingJob.php | 6 +- .../Domain/Model/ScrapingProgress.php | 6 +- src/Domain/Scraping/Domain/Model/Source.php | 21 +++--- .../Domain/Model/ValueObject/ChapterUrl.php | 42 +++++++++++ .../Domain/Model/ValueObject/ImageUrl.php | 4 +- .../Domain/Model/ValueObject/SourceId.php | 4 +- .../Model/ValueObject/TempDirectory.php | 4 +- .../Handler/SymfonyScrapeChapterHandler.php | 21 ++++++ .../Persistence/DoctrineMangaRepository.php | 23 ------ .../Persistence/DoctrineSourceRepository.php | 27 ------- .../Persistence/Entity/MangaEntity.php | 75 ------------------- .../Persistence/Entity/ScrapingJobEntity.php | 12 +-- .../Persistence/Entity/SourceEntity.php | 65 ---------------- .../Persistence/LegacyMangaRepository.php | 30 ++++++++ .../Persistence/LegacySourceRepository.php | 49 ++++++++++++ .../Service/ImageDownloader.php | 5 +- .../Service/Scraper/AbstractScraper.php | 14 +--- .../Service/Scraper/HtmlScraper.php | 43 +++++++---- src/Entity/{User.php.old => User.php} | 30 ++++---- .../Adapter/InMemoryScraperAdapter.php | 18 ----- .../ScrapeChapterHandlerTest.php | 19 ++--- tests/Feature/Scraping/ScrapingStatusTest.php | 12 +-- 34 files changed, 348 insertions(+), 326 deletions(-) create mode 100644 migrations/Version20250205231923.php create mode 100644 src/Domain/Scraping/Domain/Contract/Service/ImageDownloader.php create mode 100644 src/Domain/Scraping/Domain/Exception/SourceNotFoundException.php create mode 100644 src/Domain/Scraping/Domain/Model/ValueObject/ChapterUrl.php create mode 100644 src/Domain/Scraping/Infrastructure/Handler/SymfonyScrapeChapterHandler.php delete mode 100644 src/Domain/Scraping/Infrastructure/Persistence/DoctrineMangaRepository.php delete mode 100644 src/Domain/Scraping/Infrastructure/Persistence/DoctrineSourceRepository.php delete mode 100644 src/Domain/Scraping/Infrastructure/Persistence/Entity/MangaEntity.php delete mode 100644 src/Domain/Scraping/Infrastructure/Persistence/Entity/SourceEntity.php create mode 100644 src/Domain/Scraping/Infrastructure/Persistence/LegacyMangaRepository.php create mode 100644 src/Domain/Scraping/Infrastructure/Persistence/LegacySourceRepository.php rename src/Entity/{User.php.old => User.php} (92%) diff --git a/.gitignore b/.gitignore index a2c57c7..d3814ec 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,7 @@ ###> phpunit/phpunit ### /phpunit.xml .phpunit.result.cache +.phpunit.cache/* ###< phpunit/phpunit ### ###> symfony/webpack-encore-bundle ### diff --git a/Makefile b/Makefile index 9abd6c1..5dca719 100644 --- a/Makefile +++ b/Makefile @@ -138,8 +138,8 @@ twig-extension: ## Create a new twig extension stimulus: ## Create a new stimulus controller @$(SYMFONY) make:stimulus-controller -consume: ## Consume messages - @$(SYMFONY) messenger:consume async -vv +consume: + @$(SYMFONY) messenger:consume commands events -vv consume-schedule: ## Consume schedule messages @$(SYMFONY) messenger:consume async -vv scheduler_default diff --git a/config/packages/messenger.yaml b/config/packages/messenger.yaml index 9e6fddf..d8acfe8 100644 --- a/config/packages/messenger.yaml +++ b/config/packages/messenger.yaml @@ -1,22 +1,40 @@ framework: messenger: - # Uncomment this (and the failed transport below) to send failed messages to this transport for later handling. - # failure_transport: failed - + default_bus: command.bus + # Transports transports: - # https://symfony.com/doc/current/messenger.html#transport-configuration - async: + commands: dsn: '%env(MESSENGER_TRANSPORT_DSN)%' - retry_strategy: - max_retries: 0 - # failed: 'doctrine://default?queue_name=failed' - # sync: 'sync://' + options: + queue_name: commands + events: + dsn: '%env(MESSENGER_TRANSPORT_DSN)%' + options: + queue_name: events + # Buses configuration + buses: + command.bus: + middleware: + - validation + - doctrine_transaction + event.bus: + default_middleware: allow_no_handlers + + # Message routing routing: - # Route your messages to the transports - 'App\Message\DownloadChapter': async - 'App\Message\RefreshMetadata': async - App\Message\RefreshAndDownloadChapters: async + # Commands + 'App\Domain\Scraping\Application\Command\ScrapeChapter': commands + + # Events + 'App\Domain\Scraping\Domain\Event\ChapterScrapingStarted': events + 'App\Domain\Scraping\Domain\Event\ChapterScrapingCompleted': events + 'App\Domain\Scraping\Domain\Event\ChapterScrapingFailed': events + + # Legacy messages (à garder si nécessaire) + 'App\Message\DownloadChapter': commands + 'App\Message\RefreshMetadata': commands + 'App\Message\RefreshAndDownloadChapters': commands # when@test: # framework: diff --git a/config/services.yaml b/config/services.yaml index 5741937..860d147 100644 --- a/config/services.yaml +++ b/config/services.yaml @@ -92,3 +92,7 @@ services: App\Service\Scraper\MangaScraperService: arguments: $scraperFactory: '@App\Service\Scraper\ScraperFactory' + + App\Domain\Scraping\Infrastructure\Handler\SymfonyScrapeChapterHandler: + tags: + - { name: messenger.message_handler, bus: command.bus } diff --git a/migrations/Version20250205231923.php b/migrations/Version20250205231923.php new file mode 100644 index 0000000..8c996a5 --- /dev/null +++ b/migrations/Version20250205231923.php @@ -0,0 +1,34 @@ +addSql('CREATE TABLE scraping_jobs (id VARCHAR(36) NOT NULL, chapter_number VARCHAR(255) NOT NULL, manga_id VARCHAR(255) NOT NULL, source_id VARCHAR(255) NOT NULL, pages JSON NOT NULL, status VARCHAR(255) NOT NULL, created_at TIMESTAMP(0) WITHOUT TIME ZONE NOT NULL, completed_at TIMESTAMP(0) WITHOUT TIME ZONE DEFAULT NULL, PRIMARY KEY(id))'); + $this->addSql('COMMENT ON COLUMN scraping_jobs.created_at IS \'(DC2Type:datetime_immutable)\''); + $this->addSql('COMMENT ON COLUMN scraping_jobs.completed_at IS \'(DC2Type:datetime_immutable)\''); + } + + public function down(Schema $schema): void + { + // this down() migration is auto-generated, please modify it to your needs + $this->addSql('CREATE SCHEMA public'); + $this->addSql('DROP TABLE scraping_jobs'); + } +} diff --git a/src/Domain/Scraping/Application/Command/ScrapeChapter.php b/src/Domain/Scraping/Application/Command/ScrapeChapter.php index cc91465..ff4ed18 100644 --- a/src/Domain/Scraping/Application/Command/ScrapeChapter.php +++ b/src/Domain/Scraping/Application/Command/ScrapeChapter.php @@ -5,9 +5,9 @@ namespace App\Domain\Scraping\Application\Command; readonly class ScrapeChapter { public function __construct( - public string $chapterId, - public string $sourceId, - public string $mangaId + public string $mangaId, + public string $chapterNumber, + public string $sourceId ) { } } diff --git a/src/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandler.php b/src/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandler.php index b715ffa..44b30e2 100644 --- a/src/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandler.php +++ b/src/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandler.php @@ -7,6 +7,8 @@ use App\Domain\Scraping\Domain\Contract\Repository\ScrapingJobRepositoryInterfac use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface; use App\Domain\Scraping\Domain\Event\ChapterScrapingFailed; use App\Domain\Scraping\Domain\Event\ChapterScrapingStarted; +use App\Domain\Scraping\Domain\Model\ScrapingJob; +use Ramsey\Uuid\Uuid; use Symfony\Component\Messenger\MessageBusInterface; readonly class ScrapeChapterHandler @@ -21,10 +23,11 @@ readonly class ScrapeChapterHandler public function handle(ScrapeChapter $command): void { try { - $job = $this->scraper->createScrapingJob( + $job = new ScrapingJob( + Uuid::uuid4(), $command->mangaId, - $command->chapterId, - $command->sourceId, + $command->chapterNumber, + $command->sourceId ); $this->scrapingJobRepository->save($job); @@ -33,7 +36,7 @@ readonly class ScrapeChapterHandler $this->scraper->scrape($job); } catch (\Exception $e) { - $this->eventBus->dispatch(new ChapterScrapingFailed($command->chapterId, $e->getMessage())); + $this->eventBus->dispatch(new ChapterScrapingFailed($command->mangaId, $command->chapterNumber, $e->getMessage())); throw $e; } } diff --git a/src/Domain/Scraping/Domain/Contract/Service/ImageDownloader.php b/src/Domain/Scraping/Domain/Contract/Service/ImageDownloader.php new file mode 100644 index 0000000..db9b5df --- /dev/null +++ b/src/Domain/Scraping/Domain/Contract/Service/ImageDownloader.php @@ -0,0 +1,8 @@ +chapterId; + return $this->mangaId; + } + + public function getChapterNumber(): string + { + return $this->chapterNumber; } public function getReason(): string diff --git a/src/Domain/Scraping/Domain/Event/PageScrapingProgressed.php b/src/Domain/Scraping/Domain/Event/PageScrapingProgressed.php index fb3cab8..3fe846d 100644 --- a/src/Domain/Scraping/Domain/Event/PageScrapingProgressed.php +++ b/src/Domain/Scraping/Domain/Event/PageScrapingProgressed.php @@ -4,11 +4,11 @@ namespace App\Domain\Scraping\Domain\Event; use App\Domain\Scraping\Domain\Model\ScrapingProgress; -class PageScrapingProgressed +readonly class PageScrapingProgressed { public function __construct( - private readonly string $jobId, - private readonly ScrapingProgress $progress + private string $jobId, + private ScrapingProgress $progress ) { } diff --git a/src/Domain/Scraping/Domain/Exception/SourceNotFoundException.php b/src/Domain/Scraping/Domain/Exception/SourceNotFoundException.php new file mode 100644 index 0000000..56aa369 --- /dev/null +++ b/src/Domain/Scraping/Domain/Exception/SourceNotFoundException.php @@ -0,0 +1,9 @@ +status = ScrapingStatus::PENDING; @@ -48,9 +48,9 @@ class ScrapingJob return $this->id; } - public function getChapterId(): string + public function getChapterNumber(): float { - return $this->chapterId; + return $this->chapterNumber; } public function getMangaId(): string diff --git a/src/Domain/Scraping/Domain/Model/ScrapingProgress.php b/src/Domain/Scraping/Domain/Model/ScrapingProgress.php index fb48ca7..768468c 100644 --- a/src/Domain/Scraping/Domain/Model/ScrapingProgress.php +++ b/src/Domain/Scraping/Domain/Model/ScrapingProgress.php @@ -2,11 +2,11 @@ namespace App\Domain\Scraping\Domain\Model; -class ScrapingProgress +readonly class ScrapingProgress { public function __construct( - private readonly int $pagesScraped, - private readonly int $totalPages + private int $pagesScraped, + private int $totalPages ) { } diff --git a/src/Domain/Scraping/Domain/Model/Source.php b/src/Domain/Scraping/Domain/Model/Source.php index ec123ac..9d88f17 100644 --- a/src/Domain/Scraping/Domain/Model/Source.php +++ b/src/Domain/Scraping/Domain/Model/Source.php @@ -2,23 +2,24 @@ namespace App\Domain\Scraping\Domain\Model; +use App\Domain\Scraping\Domain\Model\ValueObject\SourceId; use DateTimeImmutable; -class Source +readonly class Source { public function __construct( - private readonly string $id, - private readonly string $name, - private readonly string $description, - private readonly string $baseUrl, - private readonly array $scrappingParameters, - private readonly bool $isActive, - private readonly DateTimeImmutable $createdAt, - private readonly DateTimeImmutable $updatedAt + private SourceId $id, + private string $name, + private string $description, + private string $baseUrl, + private array $scrappingParameters, + private bool $isActive, + private DateTimeImmutable $createdAt, + private DateTimeImmutable $updatedAt ) { } - public function getId(): string + public function getId(): SourceId { return $this->id; } diff --git a/src/Domain/Scraping/Domain/Model/ValueObject/ChapterUrl.php b/src/Domain/Scraping/Domain/Model/ValueObject/ChapterUrl.php new file mode 100644 index 0000000..32f78f8 --- /dev/null +++ b/src/Domain/Scraping/Domain/Model/ValueObject/ChapterUrl.php @@ -0,0 +1,42 @@ +chapterUrlFormat = $this->validateUrlFormat($chapterUrlFormat); + $this->mangaSlug = $mangaSlug; + $this->chapterNumber = $chapterNumber; + } + + public function getUrl(): string + { + $placeholders = [ + '{chapterNumber}' => $this->chapterNumber, + '{slug}' => $this->mangaSlug, + ]; + + return str_replace(array_keys($placeholders), array_values($placeholders), $this->chapterUrlFormat); + } + + private function validateUrlFormat(string $format): string + { + if (!str_contains($format, '{slug}') || !str_contains($format, '{chapterNumber}')) { + throw new InvalidArgumentException("The URL format must contain both {slug} and {chapterNumber} placeholders."); + } + + return $format; + } + +} diff --git a/src/Domain/Scraping/Domain/Model/ValueObject/ImageUrl.php b/src/Domain/Scraping/Domain/Model/ValueObject/ImageUrl.php index 13116b1..95df920 100644 --- a/src/Domain/Scraping/Domain/Model/ValueObject/ImageUrl.php +++ b/src/Domain/Scraping/Domain/Model/ValueObject/ImageUrl.php @@ -2,10 +2,10 @@ namespace App\Domain\Scraping\Domain\Model\ValueObject; -class ImageUrl +readonly class ImageUrl { public function __construct( - private readonly string $url + private string $url ) { if (!filter_var($url, FILTER_VALIDATE_URL)) { throw new \InvalidArgumentException('Invalid image URL provided'); diff --git a/src/Domain/Scraping/Domain/Model/ValueObject/SourceId.php b/src/Domain/Scraping/Domain/Model/ValueObject/SourceId.php index b64b92d..2933eb2 100644 --- a/src/Domain/Scraping/Domain/Model/ValueObject/SourceId.php +++ b/src/Domain/Scraping/Domain/Model/ValueObject/SourceId.php @@ -2,9 +2,9 @@ namespace App\Domain\Scraping\Domain\Model\ValueObject; -class SourceId +readonly class SourceId { - public function __construct(private readonly string $value) + public function __construct(private string $value) { if (empty($value)) { throw new \InvalidArgumentException('Source ID cannot be empty'); diff --git a/src/Domain/Scraping/Domain/Model/ValueObject/TempDirectory.php b/src/Domain/Scraping/Domain/Model/ValueObject/TempDirectory.php index 656409b..d3528ab 100644 --- a/src/Domain/Scraping/Domain/Model/ValueObject/TempDirectory.php +++ b/src/Domain/Scraping/Domain/Model/ValueObject/TempDirectory.php @@ -2,9 +2,9 @@ namespace App\Domain\Scraping\Domain\Model\ValueObject; -class TempDirectory +readonly class TempDirectory { - public function __construct(private readonly string $path) + public function __construct(private string $path) { if (!is_dir($path) && !mkdir($path)) { throw new \RuntimeException("Failed to create directory: $path"); diff --git a/src/Domain/Scraping/Infrastructure/Handler/SymfonyScrapeChapterHandler.php b/src/Domain/Scraping/Infrastructure/Handler/SymfonyScrapeChapterHandler.php new file mode 100644 index 0000000..be2a803 --- /dev/null +++ b/src/Domain/Scraping/Infrastructure/Handler/SymfonyScrapeChapterHandler.php @@ -0,0 +1,21 @@ +handler->handle($command); + } +} diff --git a/src/Domain/Scraping/Infrastructure/Persistence/DoctrineMangaRepository.php b/src/Domain/Scraping/Infrastructure/Persistence/DoctrineMangaRepository.php deleted file mode 100644 index 48fe41d..0000000 --- a/src/Domain/Scraping/Infrastructure/Persistence/DoctrineMangaRepository.php +++ /dev/null @@ -1,23 +0,0 @@ -entityManager->getRepository(MangaEntity::class)->find($id); - - return $manga ? $manga->toDomain() : null; - } -} diff --git a/src/Domain/Scraping/Infrastructure/Persistence/DoctrineSourceRepository.php b/src/Domain/Scraping/Infrastructure/Persistence/DoctrineSourceRepository.php deleted file mode 100644 index cb5f421..0000000 --- a/src/Domain/Scraping/Infrastructure/Persistence/DoctrineSourceRepository.php +++ /dev/null @@ -1,27 +0,0 @@ -entityManager->getRepository(SourceEntityEntity::class)->find($id); - - if (!$sourceEntity) { - return null; - } - - return $sourceEntity->toDomain(); - } -} diff --git a/src/Domain/Scraping/Infrastructure/Persistence/Entity/MangaEntity.php b/src/Domain/Scraping/Infrastructure/Persistence/Entity/MangaEntity.php deleted file mode 100644 index 61e1de3..0000000 --- a/src/Domain/Scraping/Infrastructure/Persistence/Entity/MangaEntity.php +++ /dev/null @@ -1,75 +0,0 @@ -id = $manga->getId(); - $entity->title = $manga->getTitle(); - $entity->slug = $manga->getSlug(); - $entity->description = $manga->getDescription(); - $entity->author = $manga->getAuthor(); - - - return $entity; - } - - public function toDomain(): Manga - { - $manga = new Manga( - $this->id, - $this->title, - $this->slug, - $this->description, - $this->author - ); - - return $manga; - } -} diff --git a/src/Domain/Scraping/Infrastructure/Persistence/Entity/ScrapingJobEntity.php b/src/Domain/Scraping/Infrastructure/Persistence/Entity/ScrapingJobEntity.php index e5b2672..c2f21c0 100644 --- a/src/Domain/Scraping/Infrastructure/Persistence/Entity/ScrapingJobEntity.php +++ b/src/Domain/Scraping/Infrastructure/Persistence/Entity/ScrapingJobEntity.php @@ -14,7 +14,7 @@ class ScrapingJobEntity private string $id; #[ORM\Column(type: 'string')] - private string $chapterId; + private string $chapterNumber; #[ORM\Column(type: 'string')] private string $mangaId; @@ -38,7 +38,7 @@ class ScrapingJobEntity { $entity = new self(); $entity->id = $job->getId(); - $entity->chapterId = $job->getChapterId(); + $entity->chapterNumber = $job->getChapterNumber(); $entity->mangaId = $job->getMangaId(); $entity->sourceId = $job->getSourceId(); $entity->pages = $job->getPages(); @@ -52,10 +52,10 @@ class ScrapingJobEntity public function toDomain(): ScrapingJob { $job = new ScrapingJob( - $this->id, - $this->chapterId, - $this->mangaId, - $this->sourceId + id: $this->id, + mangaId: $this->mangaId, + chapterNumber: $this->chapterNumber, + sourceId: $this->sourceId ); return $job; diff --git a/src/Domain/Scraping/Infrastructure/Persistence/Entity/SourceEntity.php b/src/Domain/Scraping/Infrastructure/Persistence/Entity/SourceEntity.php deleted file mode 100644 index 8649693..0000000 --- a/src/Domain/Scraping/Infrastructure/Persistence/Entity/SourceEntity.php +++ /dev/null @@ -1,65 +0,0 @@ -id = $source->getId(); - $entity->name = $source->getName(); - $entity->description = $source->getDescription(); - $entity->baseUrl = $source->getBaseUrl(); - $entity->scrappingParameters = $source->getScrappingParameters(); - $entity->isActive = $source->isActive(); - $entity->createdAt = $source->getCreatedAt(); - $entity->updatedAt = $source->getUpdatedAt(); - - return $entity; - } - - public function toDomain(): Source - { - return new Source( - $this->id, - $this->name ?? '', - $this->description ?? '', - $this->baseUrl, - $this->scrappingParameters, - $this->isActive, - $this->createdAt, - $this->updatedAt - ); - } -} diff --git a/src/Domain/Scraping/Infrastructure/Persistence/LegacyMangaRepository.php b/src/Domain/Scraping/Infrastructure/Persistence/LegacyMangaRepository.php new file mode 100644 index 0000000..50a7028 --- /dev/null +++ b/src/Domain/Scraping/Infrastructure/Persistence/LegacyMangaRepository.php @@ -0,0 +1,30 @@ +entityManager->getRepository(EntityManga::class)->find($id); + + return $mangaEntity ? new Manga( + $mangaEntity->getId(), + $mangaEntity->getTitle(), + $mangaEntity->getSlug(), + $mangaEntity->getDescription(), + $mangaEntity->getAuthor(), + ) : null; + } +} diff --git a/src/Domain/Scraping/Infrastructure/Persistence/LegacySourceRepository.php b/src/Domain/Scraping/Infrastructure/Persistence/LegacySourceRepository.php new file mode 100644 index 0000000..bf3132a --- /dev/null +++ b/src/Domain/Scraping/Infrastructure/Persistence/LegacySourceRepository.php @@ -0,0 +1,49 @@ +entityManager->getRepository(ContentSource::class)->find($id); + + if (!$source) { + throw new SourceNotFoundException("Source not found"); + } + + return new Source( + id: new SourceId($source->getId()), + name: $source->getCleanBaseUrl(), + description: 'Legacy Source: ' . $source->getBaseUrl(), + baseUrl: $source->getBaseUrl(), + scrappingParameters: [ + 'imageSelector' => $source->getImageSelector(), + 'nextPageSelector' => $source->getNextPageSelector(), + 'chapterUrlFormat' => $source->getChapterUrlFormat(), + 'scrapingType' => $source->getScrapingType(), + 'chapterSelector' => $source->getChapterSelector() + ], + isActive: true, + createdAt: new DateTimeImmutable(), + updatedAt: new DateTimeImmutable() + ); + } +} diff --git a/src/Domain/Scraping/Infrastructure/Service/ImageDownloader.php b/src/Domain/Scraping/Infrastructure/Service/ImageDownloader.php index a728c9a..fffaea7 100644 --- a/src/Domain/Scraping/Infrastructure/Service/ImageDownloader.php +++ b/src/Domain/Scraping/Infrastructure/Service/ImageDownloader.php @@ -3,11 +3,12 @@ namespace App\Domain\Scraping\Infrastructure\Service; use Symfony\Contracts\HttpClient\HttpClientInterface; +use App\Domain\Scraping\Domain\Contract\Service\ImageDownloader as ImageDownloaderInterface; -class ImageDownloader +readonly class ImageDownloader implements ImageDownloaderInterface { public function __construct( - private readonly HttpClientInterface $httpClient + private HttpClientInterface $httpClient ) { } diff --git a/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php b/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php index 6ce59a2..e0262d0 100644 --- a/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php +++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/AbstractScraper.php @@ -2,9 +2,7 @@ namespace App\Domain\Scraping\Infrastructure\Service\Scraper; -use App\Domain\Scraping\Domain\Contract\ScraperInterface; -use App\Domain\Scraping\Domain\Event\ChapterScrapingCompleted; -use App\Domain\Scraping\Domain\Event\ChapterScrapingStarted; +use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface; use App\Domain\Scraping\Domain\Event\PageScrapingProgressed; use App\Domain\Scraping\Domain\Model\ScrapingJob; use App\Domain\Scraping\Domain\Model\ScrapingProgress; @@ -22,16 +20,6 @@ abstract class AbstractScraper implements ScraperInterface ) { } - public function createScrapingJob(string $mangaId, string $chapterId, string $sourceId): ScrapingJob - { - return new ScrapingJob( - Uuid::uuid4()->toString(), - $mangaId, - $chapterId, - $sourceId, - ); - } - abstract public function scrape(ScrapingJob $job): void; abstract protected function scrapePages(ScrapingJob $job, Source $source): array; diff --git a/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php b/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php index 8b8d31d..c423f43 100644 --- a/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php +++ b/src/Domain/Scraping/Infrastructure/Service/Scraper/HtmlScraper.php @@ -2,11 +2,14 @@ namespace App\Domain\Scraping\Infrastructure\Service\Scraper; +use App\Domain\Scraping\Domain\Contract\Repository\MangaRepositoryInterface; +use App\Domain\Scraping\Domain\Contract\Repository\ScrapingJobRepositoryInterface; use App\Domain\Scraping\Domain\Model\ScrapingJob; use App\Domain\Scraping\Domain\Model\Source; use App\Domain\Scraping\Domain\Model\ValueObject\ImageUrl; use App\Domain\Scraping\Domain\Model\ValueObject\PageNumber; -use App\Domain\Scraping\Domain\Repository\SourceRepositoryInterface; +use App\Domain\Scraping\Domain\Contract\Repository\SourceRepositoryInterface; +use App\Domain\Scraping\Domain\Model\ValueObject\ChapterUrl; use Symfony\Component\DomCrawler\Crawler; use Symfony\Contracts\HttpClient\HttpClientInterface; use Symfony\Component\Messenger\MessageBusInterface; @@ -18,7 +21,9 @@ class HtmlScraper extends AbstractScraper ImageDownloader $imageDownloader, MessageBusInterface $eventBus, private readonly HttpClientInterface $httpClient, - private readonly SourceRepositoryInterface $sourceRepository + private readonly SourceRepositoryInterface $sourceRepository, + private readonly MangaRepositoryInterface $mangaRepository, + private readonly ScrapingJobRepositoryInterface $scrapingJobRepository, ) { parent::__construct($imageDownloader, $eventBus); } @@ -48,21 +53,25 @@ class HtmlScraper extends AbstractScraper } $job->complete(); + $this->scrapingJobRepository->save($job); } catch (\Exception $e) { $job->fail(); + $this->scrapingJobRepository->save($job); throw $e; } finally { $this->cleanupTempFiles($tempDir); } } - protected function scrapePages(ScrapingJob $job, Source $sourceConfig): array + protected function scrapePages(ScrapingJob $job, Source $source): array { - if (!$sourceConfig['next_page_selector']) { - return $this->scrapeVerticalReader($job, $sourceConfig); + $scrappingParameters = $source->getScrappingParameters(); + + if (!$scrappingParameters['nextPageSelector']) { + return $this->scrapeVerticalReader($job, $source); } - return $this->scrapeHorizontalReader($job, $sourceConfig); + return $this->scrapeHorizontalReader($job, $source); } private function scrapeVerticalReader(ScrapingJob $job, Source $sourceConfig): array @@ -70,7 +79,7 @@ class HtmlScraper extends AbstractScraper $html = $this->fetchHtml($this->buildChapterUrl($job, $sourceConfig)); $crawler = new Crawler($html); - return $crawler->filter($sourceConfig['image_selector']) + return $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector']) ->each(function ($node) { return $this->cleanImageUrl( $node->attr('src') ?: $node->attr('data-src') @@ -87,13 +96,20 @@ class HtmlScraper extends AbstractScraper $html = $this->fetchHtml($currentUrl); $crawler = new Crawler($html); - $imageUrl = $crawler->filter($sourceConfig['image_selector']) - ->attr('src') ?: $crawler->filter($sourceConfig['image_selector']) + $imageUrl = $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector']) + ->attr('src') ?: $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector']) ->attr('data-src'); + if (!preg_match('/^https?:\/\//', $imageUrl)) { + $urlComponents = parse_url($sourceConfig->getScrappingParameters()['chapterUrlFormat']); + $scheme = $urlComponents['scheme']; + $host = $urlComponents['host']; + $imageUrl = $scheme.'://'.$host.'/'.ltrim($imageUrl, '/'); + } + $pages[] = $this->cleanImageUrl($imageUrl); - $nextLink = $crawler->filter($sourceConfig['next_page_selector']); + $nextLink = $crawler->filter($sourceConfig->getScrappingParameters()['nextPageSelector']); $currentUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null; } @@ -120,10 +136,9 @@ class HtmlScraper extends AbstractScraper private function buildChapterUrl(ScrapingJob $job, Source $sourceConfig): string { - return sprintf( - $sourceConfig->getBaseUrl(), - $job->getChapterId() - ); + $manga = $this->mangaRepository->getById($job->getMangaId()); + $chapterUrl = new ChapterUrl($sourceConfig->getScrappingParameters()['chapterUrlFormat'], $manga->getSlug(), $job->getChapterNumber()); + return $chapterUrl->getUrl(); } public function supports(string $sourceType): bool diff --git a/src/Entity/User.php.old b/src/Entity/User.php similarity index 92% rename from src/Entity/User.php.old rename to src/Entity/User.php index 0cd837e..c4ed74e 100644 --- a/src/Entity/User.php.old +++ b/src/Entity/User.php @@ -22,21 +22,21 @@ use Symfony\Component\Validator\Constraints as Assert; #[ORM\Entity(repositoryClass: UserRepository::class)] #[ORM\Table(name: '`user`')] -#[ApiResource( - operations: [ - new Get(), - new GetCollection(), - new Post( - validationContext: ['groups' => ['Default', 'postValidation']] - ), - new Put(), - new Patch(), - new Delete() - ], - normalizationContext: ['groups' => ['user:read']], - denormalizationContext: ['groups' => ['user:write']], - security: "is_granted('ROLE_USER')" -)] +// #[ApiResource( +// operations: [ +// new Get(), +// new GetCollection(), +// new Post( +// validationContext: ['groups' => ['Default', 'postValidation']] +// ), +// new Put(), +// new Patch(), +// new Delete() +// ], +// normalizationContext: ['groups' => ['user:read']], +// denormalizationContext: ['groups' => ['user:write']], +// security: "is_granted('ROLE_USER')" +// )] #[UniqueEntity(fields: ['email'], message: 'There is already an account with this email')] class User implements UserInterface, PasswordAuthenticatedUserInterface { diff --git a/tests/Domain/Scraping/Adapter/InMemoryScraperAdapter.php b/tests/Domain/Scraping/Adapter/InMemoryScraperAdapter.php index 13a5e4f..cf53903 100644 --- a/tests/Domain/Scraping/Adapter/InMemoryScraperAdapter.php +++ b/tests/Domain/Scraping/Adapter/InMemoryScraperAdapter.php @@ -8,21 +8,8 @@ use Ramsey\Uuid\Uuid; class InMemoryScraperAdapter implements ScraperInterface { - private array $jobs = []; private ?\Exception $shouldThrowException = null; - public function createScrapingJob(string $mangaId, string $chapterId, string $sourceId): ScrapingJob - { - if ($this->shouldThrowException) { - throw $this->shouldThrowException; - } - - $job = new ScrapingJob(Uuid::uuid4(), $mangaId, $chapterId, $sourceId); - $this->jobs[] = $job; - - return $job; - } - public function scrape(ScrapingJob $job): void { if ($this->shouldThrowException) { @@ -35,11 +22,6 @@ class InMemoryScraperAdapter implements ScraperInterface $this->shouldThrowException = $exception; } - public function getJobs(): array - { - return $this->jobs; - } - public function supports(string $sourceType): bool { return true; diff --git a/tests/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandlerTest.php b/tests/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandlerTest.php index 3fc5269..07aafbe 100644 --- a/tests/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandlerTest.php +++ b/tests/Domain/Scraping/Application/CommandHandler/ScrapeChapterHandlerTest.php @@ -33,14 +33,14 @@ class ScrapeChapterHandlerTest extends TestCase public function testHandleSuccessfully(): void { $command = new ScrapeChapter( - chapterId: 2, + mangaId: 1, + chapterNumber: 2, sourceId: 3, - mangaId: 1 ); $this->handler->handle($command); - $scrapingJobs = $this->scraper->getJobs(); + $scrapingJobs = $this->repository->getJobs(); $this->assertCount(1, $scrapingJobs); $job = $scrapingJobs[0]; @@ -57,9 +57,9 @@ class ScrapeChapterHandlerTest extends TestCase public function testHandleThrowsException(): void { $command = new ScrapeChapter( - chapterId: 2, + mangaId: 1, + chapterNumber: 2, sourceId: 3, - mangaId: 1 ); $exception = new \Exception('Scraping failed'); @@ -72,10 +72,11 @@ class ScrapeChapterHandlerTest extends TestCase $this->handler->handle($command); } finally { $dispatchedMessages = $this->eventBus->getDispatchedMessages(); - $this->assertCount(1, $dispatchedMessages); - $this->assertInstanceOf(ChapterScrapingFailed::class, $dispatchedMessages[0]); - $this->assertEquals(2, $dispatchedMessages[0]->getChapterId()); - $this->assertEquals('Scraping failed', $dispatchedMessages[0]->getReason()); + $this->assertCount(2, $dispatchedMessages); + $this->assertInstanceOf(ChapterScrapingStarted::class, $dispatchedMessages[0]); + $this->assertInstanceOf(ChapterScrapingFailed::class, $dispatchedMessages[1]); + $this->assertEquals(2, $dispatchedMessages[1]->getChapterNumber()); + $this->assertEquals('Scraping failed', $dispatchedMessages[1]->getReason()); } } } diff --git a/tests/Feature/Scraping/ScrapingStatusTest.php b/tests/Feature/Scraping/ScrapingStatusTest.php index b277347..90be206 100644 --- a/tests/Feature/Scraping/ScrapingStatusTest.php +++ b/tests/Feature/Scraping/ScrapingStatusTest.php @@ -21,7 +21,7 @@ class ScrapingStatusTest extends ApiTestCase parent::setUp(); self::bootKernel(); - + $this->messageBus = self::getContainer()->get(MessageBusInterface::class); $this->repository = self::getContainer()->get(ScrapingJobRepositoryInterface::class); } @@ -30,11 +30,11 @@ class ScrapingStatusTest extends ApiTestCase { // Given $jobId = Uuid::uuid4()->toString(); - $job = new ScrapingJob($jobId, 'manga-123', 'chapter-456', 'source-789'); - + $job = new ScrapingJob($jobId, 'manga-123', 1, 'source-789'); + $job->addPage(new PageNumber(1), new ImageUrl('http://example.com/page1.jpg')); $job->addPage(new PageNumber(2), new ImageUrl('http://example.com/page2.jpg')); - + $this->repository->save($job); // When @@ -63,7 +63,7 @@ class ScrapingStatusTest extends ApiTestCase protected function tearDown(): void { parent::tearDown(); - + self::ensureKernelShutdown(); } -} \ No newline at end of file +}