feat: scraping endpoints, job persistence, first unit tests, legacy entities usage
This commit is contained in:
parent
c55cd62ec7
commit
0374ab0e46
@@ -0,0 +1,21 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Handler;
|
||||
|
||||
use App\Domain\Scraping\Application\Command\ScrapeChapter;
|
||||
use App\Domain\Scraping\Application\CommandHandler\ScrapeChapterHandler;
|
||||
use Symfony\Component\Messenger\Attribute\AsMessageHandler;
|
||||
|
||||
/**
 * Symfony Messenger adapter for the ScrapeChapter command.
 *
 * The class is registered as a message handler through the AsMessageHandler
 * attribute. It holds no logic of its own: every received command is forwarded
 * to the application-layer ScrapeChapterHandler.
 */
#[AsMessageHandler]
class SymfonyScrapeChapterHandler
{
    /**
     * @param ScrapeChapterHandler $handler Handler that actually processes the command.
     */
    public function __construct(
        // readonly: the dependency is assigned once and never replaced.
        private readonly ScrapeChapterHandler $handler,
    ) {
    }

    /**
     * Forwards the received command to the wrapped handler.
     */
    public function __invoke(ScrapeChapter $command): void
    {
        $this->handler->handle($command);
    }
}
|
||||
@@ -1,23 +0,0 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Persistence;
|
||||
|
||||
use App\Domain\Scraping\Domain\Contract\Repository\MangaRepositoryInterface;
|
||||
use App\Domain\Scraping\Domain\Model\Manga;
|
||||
use App\Domain\Scraping\Infrastructure\Persistence\Entity\MangaEntity;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
|
||||
/**
 * MangaRepositoryInterface implementation that reads MangaEntity rows through
 * the Doctrine entity manager.
 */
class DoctrineMangaRepository implements MangaRepositoryInterface
{
    public function __construct(
        private readonly EntityManagerInterface $entityManager
    ) {
    }

    /**
     * Finds a manga by its identifier and converts it to the domain model.
     *
     * @return Manga|null The domain manga, or null when no entity matches $id.
     */
    public function getById(string $id): ?Manga
    {
        $repository = $this->entityManager->getRepository(MangaEntity::class);
        $entity = $repository->find($id);

        if (!$entity) {
            return null;
        }

        return $entity->toDomain();
    }
}
|
||||
@@ -1,27 +0,0 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Persistence;
|
||||
|
||||
use App\Domain\Scraping\Domain\Model\Source;
|
||||
use App\Domain\Scraping\Domain\Repository\SourceRepositoryInterface;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
use App\Domain\Scraping\Infrastructure\Persistence\Entity\SourceEntity as SourceEntityEntity;
|
||||
|
||||
/**
 * SourceRepositoryInterface implementation that reads SourceEntity rows
 * through the Doctrine entity manager.
 */
class DoctrineSourceRepository implements SourceRepositoryInterface
{
    public function __construct(
        private readonly EntityManagerInterface $entityManager
    ) {
    }

    /**
     * Finds a source by its identifier and converts it to the domain model.
     *
     * @return Source|null The domain source, or null when no entity matches $id.
     */
    public function getById(string $id): ?Source
    {
        $record = $this->entityManager
            ->getRepository(SourceEntityEntity::class)
            ->find($id);

        return $record ? $record->toDomain() : null;
    }
}
|
||||
@@ -1,75 +0,0 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Persistence\Entity;
|
||||
|
||||
use App\Domain\Scraping\Domain\Model\Manga;
|
||||
use Doctrine\DBAL\Types\Types;
|
||||
use Doctrine\ORM\Mapping as ORM;
|
||||
|
||||
/**
 * Doctrine mapping of the `mangas` table, with converters to and from the
 * Manga domain model.
 *
 * Only id, title, slug, description and author take part in the conversion;
 * the remaining columns are persistence-only from the point of view of this
 * class.
 */
#[ORM\Entity]
#[ORM\Table(name: 'mangas')]
class MangaEntity
{
    #[ORM\Id]
    #[ORM\Column(type: 'string', length: 36)]
    private string $id;

    #[ORM\Column(length: 255)]
    private string $title;

    #[ORM\Column(length: 255, unique: true)]
    private string $slug;

    #[ORM\Column(length: 255, nullable: true)]
    private ?string $imageUrl = null;

    #[ORM\Column(nullable: true)]
    private ?int $publicationYear = null;

    #[ORM\Column(type: Types::TEXT, nullable: true)]
    private ?string $description = null;

    // NOTE(review): Types::ARRAY is deprecated in recent doctrine/dbal releases;
    // migrating to Types::JSON needs a data migration, so it is left untouched here.
    #[ORM\Column(type: Types::ARRAY, nullable: true)]
    private ?array $genres = null;

    #[ORM\Column(type: 'datetime_immutable')]
    private \DateTimeImmutable $createdAt;

    #[ORM\Column(nullable: true)]
    private ?float $rating = null;

    #[ORM\Column(length: 255, nullable: true)]
    private ?string $author = null;

    #[ORM\Column(length: 255, nullable: true)]
    private ?string $status = null;

    #[ORM\Column]
    private bool $monitored;

    /**
     * Builds an entity from a domain manga.
     *
     * The domain model carries neither a creation date nor a monitoring flag,
     * yet both columns are non-nullable typed properties. They are initialised
     * here so the returned entity never exposes uninitialized properties.
     */
    public static function fromDomain(Manga $manga): self
    {
        $entity = new self();
        $entity->id = $manga->getId();
        $entity->title = $manga->getTitle();
        $entity->slug = $manga->getSlug();
        $entity->description = $manga->getDescription();
        $entity->author = $manga->getAuthor();

        // Creation time defaults to "now" for an entity built from the domain.
        $entity->createdAt = new \DateTimeImmutable();
        // NOTE(review): false is assumed to be the safe default for a new manga — confirm.
        $entity->monitored = false;

        return $entity;
    }

    /**
     * Converts this entity to the Manga domain model.
     */
    public function toDomain(): Manga
    {
        return new Manga(
            $this->id,
            $this->title,
            $this->slug,
            $this->description,
            $this->author
        );
    }
}
|
||||
@@ -14,7 +14,7 @@ class ScrapingJobEntity
|
||||
private string $id;
|
||||
|
||||
#[ORM\Column(type: 'string')]
|
||||
private string $chapterId;
|
||||
private string $chapterNumber;
|
||||
|
||||
#[ORM\Column(type: 'string')]
|
||||
private string $mangaId;
|
||||
@@ -38,7 +38,7 @@ class ScrapingJobEntity
|
||||
{
|
||||
$entity = new self();
|
||||
$entity->id = $job->getId();
|
||||
$entity->chapterId = $job->getChapterId();
|
||||
$entity->chapterNumber = $job->getChapterNumber();
|
||||
$entity->mangaId = $job->getMangaId();
|
||||
$entity->sourceId = $job->getSourceId();
|
||||
$entity->pages = $job->getPages();
|
||||
@@ -52,10 +52,10 @@ class ScrapingJobEntity
|
||||
public function toDomain(): ScrapingJob
|
||||
{
|
||||
$job = new ScrapingJob(
|
||||
$this->id,
|
||||
$this->chapterId,
|
||||
$this->mangaId,
|
||||
$this->sourceId
|
||||
id: $this->id,
|
||||
mangaId: $this->mangaId,
|
||||
chapterNumber: $this->chapterNumber,
|
||||
sourceId: $this->sourceId
|
||||
);
|
||||
|
||||
return $job;
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Persistence\Entity;
|
||||
|
||||
use App\Domain\Scraping\Domain\Model\Source;
|
||||
use Doctrine\ORM\Mapping as ORM;
|
||||
|
||||
/**
 * Doctrine mapping of the `sources` table, with converters to and from the
 * Source domain model.
 */
#[ORM\Entity]
#[ORM\Table(name: 'sources')]
class SourceEntity
{
    #[ORM\Id]
    #[ORM\Column(type: 'string', length: 36)]
    private string $id;

    #[ORM\Column(type: 'string', nullable: true)]
    private ?string $name = null;

    #[ORM\Column(type: 'text', nullable: true)]
    private ?string $description = null;

    #[ORM\Column(type: 'string')]
    private string $baseUrl;

    #[ORM\Column(type: 'json')]
    private array $scrappingParameters = [];

    #[ORM\Column(type: 'boolean')]
    private bool $isActive;

    #[ORM\Column(type: 'datetime_immutable')]
    private \DateTimeImmutable $createdAt;

    #[ORM\Column(type: 'datetime_immutable')]
    private \DateTimeImmutable $updatedAt;

    /**
     * Copies every field of a domain source into a new entity.
     */
    public static function fromDomain(Source $source): self
    {
        $instance = new self();

        $instance->id = $source->getId();
        $instance->name = $source->getName();
        $instance->description = $source->getDescription();
        $instance->baseUrl = $source->getBaseUrl();
        $instance->scrappingParameters = $source->getScrappingParameters();
        $instance->isActive = $source->isActive();
        $instance->createdAt = $source->getCreatedAt();
        $instance->updatedAt = $source->getUpdatedAt();

        return $instance;
    }

    /**
     * Converts this entity to the Source domain model.
     *
     * A null name or description is passed on as an empty string.
     */
    public function toDomain(): Source
    {
        $name = $this->name ?? '';
        $description = $this->description ?? '';

        return new Source(
            $this->id,
            $name,
            $description,
            $this->baseUrl,
            $this->scrappingParameters,
            $this->isActive,
            $this->createdAt,
            $this->updatedAt
        );
    }
}
|
||||
@@ -0,0 +1,30 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Persistence;
|
||||
|
||||
use App\Domain\Scraping\Domain\Contract\Repository\MangaRepositoryInterface;
|
||||
use App\Domain\Scraping\Domain\Model\Manga;
|
||||
use App\Entity\Manga as EntityManga;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
|
||||
/**
 * MangaRepositoryInterface implementation backed by the legacy
 * App\Entity\Manga Doctrine entity.
 */
readonly class LegacyMangaRepository implements MangaRepositoryInterface
{
    public function __construct(
        private EntityManagerInterface $entityManager
    ) {
    }

    /**
     * Loads a legacy manga entity and maps it to the scraping domain model.
     *
     * @return Manga|null The mapped manga, or null when no legacy entity matches $id.
     */
    public function getById(string $id): ?Manga
    {
        /** @var EntityManga|null $legacyManga */
        $legacyManga = $this->entityManager->getRepository(EntityManga::class)->find($id);

        if (!$legacyManga) {
            return null;
        }

        return new Manga(
            $legacyManga->getId(),
            $legacyManga->getTitle(),
            $legacyManga->getSlug(),
            $legacyManga->getDescription(),
            $legacyManga->getAuthor(),
        );
    }
}
|
||||
@@ -0,0 +1,49 @@
|
||||
<?php
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Persistence;
|
||||
|
||||
use App\Domain\Scraping\Domain\Contract\Repository\SourceRepositoryInterface;
|
||||
use App\Domain\Scraping\Domain\Exception\SourceNotFoundException;
|
||||
use App\Domain\Scraping\Domain\Model\Source;
|
||||
use App\Domain\Scraping\Domain\Model\ValueObject\SourceId;
|
||||
use App\Entity\ContentSource;
|
||||
use DateTimeImmutable;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
|
||||
/**
 * SourceRepositoryInterface implementation backed by the legacy
 * App\Entity\ContentSource Doctrine entity.
 */
readonly class LegacySourceRepository implements SourceRepositoryInterface
{
    public function __construct(
        private EntityManagerInterface $entityManager
    ) {
    }

    /**
     * Loads a legacy content source and maps it to the scraping domain model.
     *
     * The legacy selectors and URL format are gathered into the
     * scrappingParameters array. The resulting source is always flagged
     * active, and both timestamps are set to the moment of the call.
     *
     * @throws SourceNotFoundException When no legacy source matches $id.
     */
    public function getById(string $id): Source
    {
        /** @var ContentSource|null $legacySource */
        $legacySource = $this->entityManager->getRepository(ContentSource::class)->find($id);

        if (!$legacySource) {
            throw new SourceNotFoundException("Source not found");
        }

        $sourceId = new SourceId($legacySource->getId());
        $name = $legacySource->getCleanBaseUrl();
        $description = 'Legacy Source: ' . $legacySource->getBaseUrl();
        $baseUrl = $legacySource->getBaseUrl();
        $parameters = [
            'imageSelector' => $legacySource->getImageSelector(),
            'nextPageSelector' => $legacySource->getNextPageSelector(),
            'chapterUrlFormat' => $legacySource->getChapterUrlFormat(),
            'scrapingType' => $legacySource->getScrapingType(),
            'chapterSelector' => $legacySource->getChapterSelector()
        ];

        return new Source(
            id: $sourceId,
            name: $name,
            description: $description,
            baseUrl: $baseUrl,
            scrappingParameters: $parameters,
            isActive: true,
            createdAt: new DateTimeImmutable(),
            updatedAt: new DateTimeImmutable()
        );
    }
}
|
||||
@@ -3,11 +3,12 @@
|
||||
namespace App\Domain\Scraping\Infrastructure\Service;
|
||||
|
||||
use Symfony\Contracts\HttpClient\HttpClientInterface;
|
||||
use App\Domain\Scraping\Domain\Contract\Service\ImageDownloader as ImageDownloaderInterface;
|
||||
|
||||
class ImageDownloader
|
||||
readonly class ImageDownloader implements ImageDownloaderInterface
|
||||
{
|
||||
public function __construct(
|
||||
private readonly HttpClientInterface $httpClient
|
||||
private HttpClientInterface $httpClient
|
||||
) {
|
||||
}
|
||||
|
||||
|
||||
@@ -2,9 +2,7 @@
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
|
||||
|
||||
use App\Domain\Scraping\Domain\Contract\ScraperInterface;
|
||||
use App\Domain\Scraping\Domain\Event\ChapterScrapingCompleted;
|
||||
use App\Domain\Scraping\Domain\Event\ChapterScrapingStarted;
|
||||
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
|
||||
use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
|
||||
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
||||
use App\Domain\Scraping\Domain\Model\ScrapingProgress;
|
||||
@@ -22,16 +20,6 @@ abstract class AbstractScraper implements ScraperInterface
|
||||
) {
|
||||
}
|
||||
|
||||
/**
 * Creates a ScrapingJob whose id is a string produced by Uuid::uuid4().
 *
 * The remaining constructor arguments are passed in the order
 * manga id, chapter id, source id.
 */
public function createScrapingJob(string $mangaId, string $chapterId, string $sourceId): ScrapingJob
{
    $jobId = Uuid::uuid4()->toString();

    return new ScrapingJob($jobId, $mangaId, $chapterId, $sourceId);
}
|
||||
|
||||
abstract public function scrape(ScrapingJob $job): void;
|
||||
|
||||
abstract protected function scrapePages(ScrapingJob $job, Source $source): array;
|
||||
|
||||
@@ -2,11 +2,14 @@
|
||||
|
||||
namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
|
||||
|
||||
use App\Domain\Scraping\Domain\Contract\Repository\MangaRepositoryInterface;
|
||||
use App\Domain\Scraping\Domain\Contract\Repository\ScrapingJobRepositoryInterface;
|
||||
use App\Domain\Scraping\Domain\Model\ScrapingJob;
|
||||
use App\Domain\Scraping\Domain\Model\Source;
|
||||
use App\Domain\Scraping\Domain\Model\ValueObject\ImageUrl;
|
||||
use App\Domain\Scraping\Domain\Model\ValueObject\PageNumber;
|
||||
use App\Domain\Scraping\Domain\Repository\SourceRepositoryInterface;
|
||||
use App\Domain\Scraping\Domain\Contract\Repository\SourceRepositoryInterface;
|
||||
use App\Domain\Scraping\Domain\Model\ValueObject\ChapterUrl;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
use Symfony\Contracts\HttpClient\HttpClientInterface;
|
||||
use Symfony\Component\Messenger\MessageBusInterface;
|
||||
@@ -18,7 +21,9 @@ class HtmlScraper extends AbstractScraper
|
||||
ImageDownloader $imageDownloader,
|
||||
MessageBusInterface $eventBus,
|
||||
private readonly HttpClientInterface $httpClient,
|
||||
private readonly SourceRepositoryInterface $sourceRepository
|
||||
private readonly SourceRepositoryInterface $sourceRepository,
|
||||
private readonly MangaRepositoryInterface $mangaRepository,
|
||||
private readonly ScrapingJobRepositoryInterface $scrapingJobRepository,
|
||||
) {
|
||||
parent::__construct($imageDownloader, $eventBus);
|
||||
}
|
||||
@@ -48,21 +53,25 @@ class HtmlScraper extends AbstractScraper
|
||||
}
|
||||
|
||||
$job->complete();
|
||||
$this->scrapingJobRepository->save($job);
|
||||
} catch (\Exception $e) {
|
||||
$job->fail();
|
||||
$this->scrapingJobRepository->save($job);
|
||||
throw $e;
|
||||
} finally {
|
||||
$this->cleanupTempFiles($tempDir);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Selects the page-collection strategy for a job from the source's scraping parameters.
 *
 * When the source has no usable 'nextPageSelector', the chapter is read with
 * scrapeVerticalReader(); otherwise scrapeHorizontalReader() is used.
 *
 * @return array Whatever the selected reader returns.
 */
protected function scrapePages(ScrapingJob $job, Source $source): array
{
    $scrappingParameters = $source->getScrappingParameters();

    // empty() treats a missing key like an empty selector instead of raising
    // an "undefined array key" warning.
    if (empty($scrappingParameters['nextPageSelector'])) {
        return $this->scrapeVerticalReader($job, $source);
    }

    return $this->scrapeHorizontalReader($job, $source);
}
|
||||
|
||||
private function scrapeVerticalReader(ScrapingJob $job, Source $sourceConfig): array
|
||||
@@ -70,7 +79,7 @@ class HtmlScraper extends AbstractScraper
|
||||
$html = $this->fetchHtml($this->buildChapterUrl($job, $sourceConfig));
|
||||
$crawler = new Crawler($html);
|
||||
|
||||
return $crawler->filter($sourceConfig['image_selector'])
|
||||
return $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector'])
|
||||
->each(function ($node) {
|
||||
return $this->cleanImageUrl(
|
||||
$node->attr('src') ?: $node->attr('data-src')
|
||||
@@ -87,13 +96,20 @@ class HtmlScraper extends AbstractScraper
|
||||
$html = $this->fetchHtml($currentUrl);
|
||||
$crawler = new Crawler($html);
|
||||
|
||||
$imageUrl = $crawler->filter($sourceConfig['image_selector'])
|
||||
->attr('src') ?: $crawler->filter($sourceConfig['image_selector'])
|
||||
$imageUrl = $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector'])
|
||||
->attr('src') ?: $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector'])
|
||||
->attr('data-src');
|
||||
|
||||
if (!preg_match('/^https?:\/\//', $imageUrl)) {
|
||||
$urlComponents = parse_url($sourceConfig->getScrappingParameters()['chapterUrlFormat']);
|
||||
$scheme = $urlComponents['scheme'];
|
||||
$host = $urlComponents['host'];
|
||||
$imageUrl = $scheme.'://'.$host.'/'.ltrim($imageUrl, '/');
|
||||
}
|
||||
|
||||
$pages[] = $this->cleanImageUrl($imageUrl);
|
||||
|
||||
$nextLink = $crawler->filter($sourceConfig['next_page_selector']);
|
||||
$nextLink = $crawler->filter($sourceConfig->getScrappingParameters()['nextPageSelector']);
|
||||
$currentUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;
|
||||
}
|
||||
|
||||
@@ -120,10 +136,9 @@ class HtmlScraper extends AbstractScraper
|
||||
|
||||
/**
 * Builds the URL of the chapter targeted by a scraping job.
 *
 * The URL comes from a ChapterUrl value object fed with the source's
 * 'chapterUrlFormat' parameter, the manga slug and the job's chapter number.
 *
 * @throws \RuntimeException When the manga referenced by the job cannot be found.
 */
private function buildChapterUrl(ScrapingJob $job, Source $sourceConfig): string
{
    $manga = $this->mangaRepository->getById($job->getMangaId());

    // The repository may return null; fail with an exception rather than a
    // fatal "call to a member function on null" error.
    if (null === $manga) {
        throw new \RuntimeException(sprintf('Manga "%s" not found', $job->getMangaId()));
    }

    $chapterUrl = new ChapterUrl(
        $sourceConfig->getScrappingParameters()['chapterUrlFormat'],
        $manga->getSlug(),
        $job->getChapterNumber(),
    );

    return $chapterUrl->getUrl();
}
|
||||
|
||||
public function supports(string $sourceType): bool
|
||||
|
||||
Reference in New Issue
Block a user