feat: finalizing Scraping endpoint

This commit is contained in:
ext.jeremy.guillot@maxicoffee.domains
2025-02-10 17:28:49 +01:00
parent 0374ab0e46
commit 073439163b
28 changed files with 447 additions and 86 deletions

View File

@@ -24,13 +24,13 @@ final readonly class ScrapingStatusStateProvider implements ProviderInterface
}
$progress = 0;
if ($job->getTotalPages() > 0) {
$progress = (count($job->getPages()) / $job->getTotalPages()) * 100;
if ($job->totalPages > 0) {
$progress = (count($job->pages) / $job->totalPages) * 100;
}
return new ScrapingStatusResponse(
jobId: $job->getId(),
status: $job->getStatus()->value,
status: $job->status->value,
progress: $progress
);
}

View File

@@ -17,8 +17,19 @@ readonly class DoctrineScrapingJobRepository implements ScrapingJobRepositoryInt
public function save(ScrapingJob $job): void
{
$entity = ScrapingJobEntity::fromDomain($job);
$this->entityManager->persist($entity);
$existingEntity = $this->entityManager->getRepository(ScrapingJobEntity::class)->find($job->getId());
if ($existingEntity) {
$existingEntity->setStatus($job->status->value);
$existingEntity->setPages($job->pages);
$existingEntity->setCompletedAt($job->completedAt);
$existingEntity->setCbzPath($job->cbzPath?->getPath());
$existingEntity->setFailureReason($job->failureReason);
} else {
$entity = ScrapingJobEntity::fromDomain($job);
$this->entityManager->persist($entity);
}
$this->entityManager->flush();
}

View File

@@ -3,6 +3,7 @@
namespace App\Domain\Scraping\Infrastructure\Persistence\Entity;
use App\Domain\Scraping\Domain\Model\ScrapingJob;
use App\Domain\Scraping\Domain\Model\ScrapingStatus;
use Doctrine\ORM\Mapping as ORM;
#[ORM\Entity]
@@ -28,6 +29,12 @@ class ScrapingJobEntity
#[ORM\Column(type: 'string')]
private string $status;
#[ORM\Column(type: 'string', nullable: true)]
private ?string $cbzPath = null;
#[ORM\Column(type: 'string', nullable: true)]
private ?string $failureReason = '';
#[ORM\Column(type: 'datetime_immutable')]
private \DateTimeImmutable $createdAt;
@@ -41,11 +48,12 @@ class ScrapingJobEntity
$entity->chapterNumber = $job->getChapterNumber();
$entity->mangaId = $job->getMangaId();
$entity->sourceId = $job->getSourceId();
$entity->pages = $job->getPages();
$entity->status = $job->getStatus()->value;
$entity->createdAt = $job->getCreatedAt();
$entity->completedAt = $job->getCompletedAt();
$entity->pages = $job->pages;
$entity->status = $job->status->value;
$entity->createdAt = $job->createdAt;
$entity->completedAt = $job->completedAt;
$entity->cbzPath = $job->cbzPath?->getPath();
$entity->failureReason = $job->failureReason;
return $entity;
}
@@ -58,6 +66,38 @@ class ScrapingJobEntity
sourceId: $this->sourceId
);
$job->status = ScrapingStatus::from($this->status);
$job->pages = $this->pages;
$job->createdAt = $this->createdAt;
$job->completedAt = $this->completedAt;
$job->cbzPath = $this->cbzPath;
$job->failureReason = $this->failureReason;
return $job;
}
/**
 * Overwrites the stored status string.
 *
 * @param string $status Raw status value; the repository passes the backing
 *                       value of the domain status enum ($job->status->value).
 */
public function setStatus(string $status): void
{
$this->status = $status;
}
/**
 * Replaces the stored page list with the given array.
 *
 * @param array $pages Pages of the job, stored as-is (element shape is not visible here).
 */
public function setPages(array $pages): void
{
$this->pages = $pages;
}
/**
 * Overwrites the completion timestamp.
 *
 * NOTE(review): the parameter is non-nullable, yet the repository's save()
 * forwards $job->completedAt for every existing job — confirm that value can
 * never be null for a job that has not completed, otherwise this call throws
 * a TypeError.
 *
 * @param \DateTimeImmutable $completedAt Moment the job finished.
 */
public function setCompletedAt(\DateTimeImmutable $completedAt): void
{
$this->completedAt = $completedAt;
}
/**
 * Overwrites the stored CBZ archive path.
 *
 * @param string|null $cbzPath Path of the generated archive, or null (the default)
 *                             to clear it; the backing column is nullable.
 */
public function setCbzPath(?string $cbzPath = null): void
{
$this->cbzPath = $cbzPath;
}
/**
 * Overwrites the stored failure reason.
 *
 * The parameter is nullable to match the backing property (?string, nullable
 * column): the repository forwards the domain job's failureReason without a
 * null guard, so a non-nullable parameter would raise a TypeError for a job
 * that carries no failure reason.
 *
 * @param string|null $failureReason Human-readable reason, or null when the job has none.
 */
public function setFailureReason(?string $failureReason): void
{
    $this->failureReason = $failureReason;
}
}

View File

@@ -0,0 +1,34 @@
<?php
namespace App\Domain\Scraping\Infrastructure\Persistence;
use App\Domain\Scraping\Domain\Contract\Repository\ChapterRepositoryInterface;
use App\Domain\Scraping\Domain\Exception\ChapterNotFoundException;
use App\Domain\Scraping\Domain\Model\Chapter;
use App\Entity\Chapter as EntityChapter;
use Doctrine\ORM\EntityManagerInterface;
/**
 * Chapter repository that reads chapters through the legacy
 * App\Entity\Chapter Doctrine entity and maps them to the scraping domain model.
 */
class LegacyChapterRepository implements ChapterRepositoryInterface
{
    public function __construct(
        private readonly EntityManagerInterface $entityManager,
    ) {}

    /**
     * Looks up a single chapter by its manga and chapter number.
     *
     * @param string $mangaId       Value matched against the entity's "manga" field.
     * @param int    $chapterNumber Value matched against the entity's "number" field.
     *
     * @return Chapter Domain chapter built from the entity's id, manga id, number and volume.
     *
     * @throws ChapterNotFoundException When no entity matches both criteria.
     */
    public function getByMangaIdAndChapterNumber(string $mangaId, int $chapterNumber): Chapter
    {
        $criteria = [
            'manga' => $mangaId,
            'number' => $chapterNumber,
        ];

        $legacyChapter = $this->entityManager
            ->getRepository(EntityChapter::class)
            ->findOneBy($criteria);

        // findOneBy() yields either an entity object or null, so a strict null test is sufficient.
        if (null === $legacyChapter) {
            throw new ChapterNotFoundException();
        }

        return new Chapter(
            id: $legacyChapter->getId(),
            mangaId: $legacyChapter->getManga()->getId(),
            chapterNumber: $legacyChapter->getNumber(),
            volumeNumber: $legacyChapter->getVolume(),
        );
    }
}

View File

@@ -25,6 +25,7 @@ readonly class LegacyMangaRepository implements MangaRepositoryInterface
$mangaEntity->getSlug(),
$mangaEntity->getDescription(),
$mangaEntity->getAuthor(),
$mangaEntity->getPublicationYear(),
) : null;
}
}

View File

@@ -0,0 +1,104 @@
<?php
namespace App\Domain\Scraping\Infrastructure\Service;
use App\Domain\Scraping\Domain\Contract\Repository\ChapterRepositoryInterface;
use App\Domain\Scraping\Domain\Contract\Repository\MangaRepositoryInterface;
use App\Domain\Scraping\Domain\Model\Manga;
use App\Domain\Scraping\Domain\Model\ScrapingJob;
use App\Domain\Scraping\Domain\Model\ValueObject\CbzPath;
use App\Domain\Scraping\Domain\Contract\Service\CbzGeneratorInterface;
use App\Domain\Scraping\Domain\Model\Chapter;
use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory;
use App\Domain\Scraping\Domain\Exception\CbzGenerationException;
use Exception;
/**
 * Builds the CBZ (zip) archive of a scraped chapter under
 * {projectDir}/public/cbz/{title} ({year})/volume_XX/.
 */
readonly class CbzGenerator implements CbzGeneratorInterface
{
    public function __construct(
        private string $projectDir,
        private MangaRepositoryInterface $mangaRepository,
        private ChapterRepositoryInterface $chapterRepository,
    ) {}

    /**
     * Packs every file found under the temporary directory into the chapter's CBZ archive.
     *
     * @param ScrapingJob   $job           Job whose manga id and chapter number determine the archive path.
     * @param TempDirectory $tempDirectory Directory holding the downloaded files.
     *
     * @return CbzPath Path of the archive that was written.
     *
     * @throws Exception CbzGenerationException on directory/archive failures, plus whatever
     *                   the repositories throw.
     */
    public function generate(ScrapingJob $job, TempDirectory $tempDirectory): CbzPath
    {
        $cbzPath = $this->generateCbzPath($job);
        $this->createCbzArchive($tempDirectory->getPath(), $cbzPath);

        return new CbzPath($cbzPath);
    }

    /**
     * Computes the archive path for the job and makes sure its parent directory exists.
     *
     * @throws CbzGenerationException When the manga is missing or the directory cannot be created.
     */
    private function generateCbzPath(ScrapingJob $job): string
    {
        $manga = $this->mangaRepository->getById($job->getMangaId());
        // NOTE(review): getById() appears able to return null — confirm. Without this guard
        // the method call below raises an \Error, which `catch (\Exception)` blocks never see.
        if ($manga === null) {
            throw new CbzGenerationException();
        }

        $chapter = $this->chapterRepository->getByMangaIdAndChapterNumber(
            $job->getMangaId(),
            $job->getChapterNumber(),
        );

        // The title becomes both a directory name and a file name prefix; a path separator
        // inside it would silently nest directories or point at a non-existent one.
        $title = $this->sanitizePathSegment((string) $manga->getTitle());
        $volume = sprintf('%02d', $chapter->volumeNumber);

        $baseDir = sprintf(
            '%s/public/cbz/%s/%s',
            $this->projectDir,
            $title . ' (' . $manga->getPublicationYear() . ')',
            'volume_' . $volume,
        );

        try {
            // The trailing is_dir() tolerates another process creating the directory
            // between the first check and mkdir().
            $directoryReady = is_dir($baseDir) || mkdir($baseDir, 0755, true) || is_dir($baseDir);
        } catch (Exception) {
            // Covers setups where the mkdir() warning is converted into an exception.
            $directoryReady = false;
        }

        if (!$directoryReady) {
            throw CbzGenerationException::unableToCreateDirectory($baseDir);
        }

        $chapterNumber = $job->getChapterNumber();
        // Whole chapters render as "07"; fractional ones keep one decimal, e.g. "07.5".
        $formattedNumber = $chapterNumber == floor($chapterNumber)
            ? sprintf('%02d', (int) $chapterNumber)
            : sprintf('%04.1f', $chapterNumber);

        return sprintf(
            '%s/%s_vol%s_ch%s.cbz',
            $baseDir,
            strtolower($title),
            $volume,
            $formattedNumber,
        );
    }

    /**
     * Replaces characters that cannot appear inside a single path segment with "_".
     */
    private function sanitizePathSegment(string $segment): string
    {
        return str_replace(['/', '\\', "\0"], '_', $segment);
    }

    /**
     * Writes every regular file below $sourceDirectory into a fresh archive at $destinationPath.
     *
     * @throws Exception CbzGenerationException when the archive cannot be opened, a file cannot
     *                   be added, or the archive cannot be written on close.
     */
    private function createCbzArchive(string $sourceDirectory, string $destinationPath): void
    {
        // Resolve the root once so entry names are sliced from a prefix that really matches
        // the iterated paths (symlinked temp dirs, trailing slashes).
        $root = realpath($sourceDirectory);
        if ($root === false) {
            throw CbzGenerationException::unableToCreateCbz($destinationPath);
        }

        $zip = new \ZipArchive();
        // OVERWRITE: regenerating a chapter must not keep pages from a previous archive.
        if ($zip->open($destinationPath, \ZipArchive::CREATE | \ZipArchive::OVERWRITE) !== true) {
            throw CbzGenerationException::unableToCreateCbz($destinationPath);
        }

        try {
            $files = new \RecursiveIteratorIterator(
                new \RecursiveDirectoryIterator($root),
                \RecursiveIteratorIterator::LEAVES_ONLY
            );

            foreach ($files as $file) {
                // Also skips the "." and ".." entries the iterator yields.
                if ($file->isDir()) {
                    continue;
                }

                $filePath = $file->getPathname();
                $relativePath = substr($filePath, strlen($root) + 1);

                if (!$zip->addFile($filePath, $relativePath)) {
                    throw CbzGenerationException::unableToAddFileToArchive($filePath);
                }
            }
        } catch (Exception $e) {
            $zip->close();

            throw $e;
        }

        // The archive is only written to disk on close(), so its result must be checked.
        if (!$zip->close()) {
            throw CbzGenerationException::unableToCreateCbz($destinationPath);
        }
    }
}

View File

@@ -3,7 +3,7 @@
namespace App\Domain\Scraping\Infrastructure\Service;
use Symfony\Contracts\HttpClient\HttpClientInterface;
use App\Domain\Scraping\Domain\Contract\Service\ImageDownloader as ImageDownloaderInterface;
use App\Domain\Scraping\Domain\Contract\Service\ImageDownloaderInterface;
readonly class ImageDownloader implements ImageDownloaderInterface
{

View File

@@ -2,25 +2,25 @@
namespace App\Domain\Scraping\Infrastructure\Service\Scraper;
use App\Domain\Scraping\Domain\Contract\Service\ImageDownloaderInterface;
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
use App\Domain\Scraping\Domain\Model\ScrapingJob;
use App\Domain\Scraping\Domain\Model\ScrapingProgress;
use App\Domain\Scraping\Domain\Model\Source;
use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory;
use App\Domain\Scraping\Infrastructure\Service\ImageDownloader;
use Symfony\Component\Messenger\MessageBusInterface;
use Ramsey\Uuid\Uuid;
abstract class AbstractScraper implements ScraperInterface
{
public function __construct(
protected readonly ImageDownloader $imageDownloader,
protected readonly MessageBusInterface $eventBus
protected ImageDownloaderInterface $imageDownloader,
protected MessageBusInterface $eventBus
) {
}
abstract public function scrape(ScrapingJob $job): void;
abstract public function scrape(ScrapingJob $job): ScrapingJob;
abstract protected function scrapePages(ScrapingJob $job, Source $source): array;

View File

@@ -9,26 +9,27 @@ use App\Domain\Scraping\Domain\Model\Source;
use App\Domain\Scraping\Domain\Model\ValueObject\ImageUrl;
use App\Domain\Scraping\Domain\Model\ValueObject\PageNumber;
use App\Domain\Scraping\Domain\Contract\Repository\SourceRepositoryInterface;
use App\Domain\Scraping\Domain\Contract\Service\CbzGeneratorInterface;
use App\Domain\Scraping\Domain\Contract\Service\ImageDownloaderInterface;
use App\Domain\Scraping\Domain\Model\ValueObject\ChapterUrl;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Contracts\HttpClient\HttpClientInterface;
use Symfony\Component\Messenger\MessageBusInterface;
use App\Domain\Scraping\Infrastructure\Service\ImageDownloader;
class HtmlScraper extends AbstractScraper
{
public function __construct(
ImageDownloader $imageDownloader,
MessageBusInterface $eventBus,
private readonly HttpClientInterface $httpClient,
ImageDownloaderInterface $imageDownloader,
MessageBusInterface $eventBus,
private readonly CbzGeneratorInterface $cbzGenerator,
private readonly HttpClientInterface $httpClient,
private readonly SourceRepositoryInterface $sourceRepository,
private readonly MangaRepositoryInterface $mangaRepository,
private readonly ScrapingJobRepositoryInterface $scrapingJobRepository,
private readonly MangaRepositoryInterface $mangaRepository,
) {
parent::__construct($imageDownloader, $eventBus);
}
public function scrape(ScrapingJob $job): void
public function scrape(ScrapingJob $job): ScrapingJob
{
$sourceConfig = $this->sourceRepository->getById($job->getSourceId());
$tempDir = $this->createTempDirectory();
@@ -52,12 +53,14 @@ class HtmlScraper extends AbstractScraper
$this->dispatchProgressEvent($job, $index + 1, count($pages));
}
$cbzPath = $this->cbzGenerator->generate($job, $tempDir);
$job->cbzPath = $cbzPath;
$job->complete();
$this->scrapingJobRepository->save($job);
return $job;
} catch (\Exception $e) {
$job->fail();
$this->scrapingJobRepository->save($job);
throw $e;
$job->fail($e->getMessage());
return $job;
} finally {
$this->cleanupTempFiles($tempDir);
}