feat: refonte du gestionnaire de chapitres pour intégrer la génération de fichiers CBZ, le téléchargement d'images en lot et la gestion des requêtes de scraping, avec mise à jour des interfaces et des modèles associés

This commit is contained in:
ext.jeremy.guillot@maxicoffee.domains
2025-03-28 20:42:21 +01:00
parent cdee6f77fc
commit d7088b14c2
22 changed files with 620 additions and 195 deletions

View File

@@ -17,6 +17,7 @@ readonly class DoctrineScrapingJobRepository implements ScrapingJobRepositoryInt
public function save(ScrapingJob $job): void
{
/** @var ScrapingJobEntity $existingEntity */
$existingEntity = $this->entityManager->getRepository(ScrapingJobEntity::class)->find($job->getId());
if ($existingEntity) {

View File

@@ -86,7 +86,7 @@ class ScrapingJobEntity
$this->pages = $pages;
}
public function setCompletedAt(\DateTimeImmutable $completedAt): void
public function setCompletedAt(?\DateTimeImmutable $completedAt): void
{
$this->completedAt = $completedAt;
}

View File

@@ -2,103 +2,87 @@
namespace App\Domain\Scraping\Infrastructure\Service;
use App\Domain\Scraping\Domain\Contract\Repository\ChapterRepositoryInterface;
use App\Domain\Scraping\Domain\Contract\Repository\MangaRepositoryInterface;
use App\Domain\Scraping\Domain\Model\Manga;
use App\Domain\Scraping\Domain\Model\ScrapingJob;
use App\Domain\Scraping\Domain\Model\ValueObject\CbzPath;
use App\Domain\Scraping\Domain\Contract\Service\CbzGeneratorInterface;
use App\Domain\Scraping\Domain\Model\Chapter;
use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory;
use App\Domain\Scraping\Domain\Exception\CbzGenerationException;
use Exception;
use App\Domain\Scraping\Domain\Model\ValueObject\CbzGenerationRequest;
use App\Domain\Scraping\Domain\Model\ValueObject\CbzPath;
readonly class CbzGenerator implements CbzGeneratorInterface
{
public function __construct(
private string $projectDir,
private MangaRepositoryInterface $mangaRepository,
private ChapterRepositoryInterface $chapterRepository,
) {}
private string $projectDir
) {
}
/**
* @throws Exception
*/
public function generate(ScrapingJob $job, TempDirectory $tempDirectory): CbzPath
public function generate(CbzGenerationRequest $request): CbzPath
{
$cbzPath = $this->generateCbzPath($job);
$this->createCbzArchive($tempDirectory->getPath(), $cbzPath);
$cbzPath = $this->generateCbzPath($request);
$this->createCbzArchive($request->getFiles(), $cbzPath);
return new CbzPath($cbzPath);
}
private function generateCbzPath(ScrapingJob $job): string
private function generateCbzPath(CbzGenerationRequest $request): string
{
$manga = $this->mangaRepository->getById($job->getMangaId());
$chapter = $this->chapterRepository->getByMangaIdAndChapterNumber($job->getMangaId(), $job->getChapterNumber());
$baseDir = sprintf(
'%s/public/cbz/%s/%s',
$this->projectDir,
$manga->getTitle() . ' (' . $manga->getPublicationYear() . ')',
sprintf('volume_%02d', $chapter->volumeNumber)
$mangaDir = $this->createMangaDirectory(
$this->slugify($request->getMangaTitle()),
$request->getPublicationYear()
);
try {
if (!is_dir($baseDir)) {
if (!mkdir($baseDir, 0755, true)) {
throw new CbzGenerationException();
}
}
} catch (Exception $e) {
throw CbzGenerationException::unableToCreateDirectory($baseDir);
}
$chapterNumber = $job->getChapterNumber();
$formattedNumber = $chapterNumber == floor($chapterNumber)
? sprintf('%02d', (int)$chapterNumber)
: sprintf('%04.1f', $chapterNumber);
$volumeDir = $this->createVolumeDirectory($mangaDir, $request->getVolumeNumber());
return sprintf(
'%s/%s_vol%s_ch%s.cbz',
$baseDir,
strtolower($manga->getTitle()),
sprintf('%02d', $chapter->volumeNumber),
$formattedNumber
'%s/%s_vol%d_ch%s.cbz',
$volumeDir,
$this->slugify($request->getMangaTitle()),
$request->getVolumeNumber(),
$request->getChapterNumber()
);
}
/**
* @throws Exception
*/
private function createCbzArchive(string $sourceDirectory, string $destinationPath): void
private function createCbzArchive(array $files, string $cbzPath): void
{
$zip = new \ZipArchive();
if ($zip->open($destinationPath, \ZipArchive::CREATE) !== true) {
throw CbzGenerationException::unableToCreateCbz($destinationPath);
if ($zip->open($cbzPath, \ZipArchive::CREATE | \ZipArchive::OVERWRITE) !== true) {
throw new \RuntimeException('Failed to create CBZ archive');
}
try {
$files = new \RecursiveIteratorIterator(
new \RecursiveDirectoryIterator($sourceDirectory),
\RecursiveIteratorIterator::LEAVES_ONLY
);
foreach ($files as $file) {
if (!$file->isDir()) {
$filePath = $file->getRealPath();
$relativePath = substr($filePath, strlen($sourceDirectory) + 1);
if (!$zip->addFile($filePath, $relativePath)) {
throw CbzGenerationException::unableToAddFileToArchive($filePath);
}
}
foreach ($files as $file) {
if (!file_exists($file)) {
throw new \RuntimeException("File not found: $file");
}
} catch (Exception $e) {
$zip->close();
throw $e;
$zip->addFile($file, basename($file));
}
$zip->close();
if (!$zip->close()) {
throw new \RuntimeException('Failed to close CBZ archive');
}
}
/**
 * Ensures the per-manga output directory exists and returns its path.
 *
 * The directory lives under "<projectDir>/public/cbz" and is named after the
 * slug (first character upper-cased) followed by the publication year in
 * parentheses.
 *
 * @param string $mangaSlug       Slug used as the base of the directory name.
 * @param string $publicationYear Year appended in parentheses.
 *
 * @return string Path of the existing or newly created directory.
 *
 * @throws \RuntimeException When the directory is missing and cannot be created.
 */
private function createMangaDirectory(string $mangaSlug, string $publicationYear): string
{
    $dir = sprintf('%s/public/cbz/%s', $this->projectDir, ucfirst($mangaSlug) . ' (' . $publicationYear . ')');

    // The trailing is_dir() covers the race where another process creates the
    // directory between our first check and mkdir(): mkdir() then fails even
    // though the directory we wanted is there.
    if (!is_dir($dir) && !mkdir($dir, 0755, true) && !is_dir($dir)) {
        throw new \RuntimeException("Failed to create directory: $dir");
    }

    return $dir;
}
/**
 * Ensures the "volume_NN" sub-directory of a manga directory exists and returns its path.
 *
 * The volume number is zero-padded to at least two digits.
 *
 * @param string $mangaDir     Parent directory for the volume folder.
 * @param int    $volumeNumber Volume number used in the folder name.
 *
 * @return string Path of the existing or newly created directory.
 *
 * @throws \RuntimeException When the directory is missing and cannot be created.
 */
private function createVolumeDirectory(string $mangaDir, int $volumeNumber): string
{
    $dir = sprintf('%s/volume_%02d', $mangaDir, $volumeNumber);

    // Re-check after a failed mkdir(): a concurrent process may have created
    // the directory in the meantime, which is not an error for us.
    if (!is_dir($dir) && !mkdir($dir, 0755, true) && !is_dir($dir)) {
        throw new \RuntimeException("Failed to create directory: $dir");
    }

    return $dir;
}
/**
 * Converts a title into a lowercase, hyphen-separated ASCII slug.
 *
 * Runs of characters that are neither letters nor digits become a hyphen, the
 * text is transliterated to ASCII, any remaining character that is not a word
 * character or a hyphen is removed, and hyphens are trimmed and collapsed.
 *
 * @param string $text Title to convert.
 *
 * @return string The slug, or "n-a" when nothing usable is left.
 */
private function slugify(string $text): string
{
    $fallback = 'n-a';

    // With the /u modifier preg_replace() returns null on invalid UTF-8 input.
    $text = preg_replace('~[^\pL\d]+~u', '-', $text);
    if ($text === null) {
        return $fallback;
    }

    // iconv() returns false when the conversion fails.
    $text = iconv('utf-8', 'us-ascii//TRANSLIT', $text);
    if ($text === false) {
        return $fallback;
    }

    $text = (string) preg_replace('~[^-\w]+~', '', $text);
    $text = trim($text, '-');
    $text = (string) preg_replace('~-+~', '-', $text);
    $text = strtolower($text);

    // Compare with '' explicitly: "0" is a legitimate slug but is falsy in PHP,
    // so the short ternary would wrongly replace it with the fallback.
    return $text !== '' ? $text : $fallback;
}
}

View File

@@ -2,13 +2,19 @@
namespace App\Domain\Scraping\Infrastructure\Service;
use Symfony\Contracts\HttpClient\HttpClientInterface;
use App\Domain\Scraping\Domain\Contract\Service\ImageDownloaderInterface;
use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
use App\Domain\Scraping\Domain\Model\ScrapingProgress;
use App\Domain\Scraping\Domain\Model\ValueObject\DownloadResult;
use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory;
use Symfony\Component\Messenger\MessageBusInterface;
use Symfony\Contracts\HttpClient\HttpClientInterface;
readonly class ImageDownloader implements ImageDownloaderInterface
{
public function __construct(
private HttpClientInterface $httpClient
private HttpClientInterface $httpClient,
private MessageBusInterface $eventBus
) {
}
@@ -22,4 +28,44 @@ readonly class ImageDownloader implements ImageDownloaderInterface
file_put_contents($destination, $response->getContent());
}
/**
 * Downloads a list of image URLs into a temporary directory.
 *
 * Each file is named from its key in $urls (key + 1, zero-padded to three
 * digits) plus the extension found in the URL path, defaulting to "jpg".
 * A progress event is dispatched after every successful download. A failing
 * image is logged and skipped so the rest of the batch can proceed.
 *
 * @param array         $urls    Image URLs; keys are used as zero-based page indexes.
 * @param TempDirectory $tempDir Directory receiving the downloaded files.
 * @param string        $jobId   Identifier forwarded in progress events.
 *
 * @return DownloadResult[] One entry per image that was downloaded.
 *
 * @throws \RuntimeException When not a single image could be downloaded.
 */
public function downloadBatch(array $urls, TempDirectory $tempDir, string $jobId): array
{
    $results = [];
    $totalUrls = count($urls);

    foreach ($urls as $index => $url) {
        try {
            // parse_url() yields null when the URL has no path and false when it
            // is malformed; neither may be handed to pathinfo().
            $path = parse_url($url, PHP_URL_PATH);
            $extension = is_string($path) ? pathinfo($path, PATHINFO_EXTENSION) : '';

            $destination = sprintf(
                '%s/%03d.%s',
                $tempDir->getPath(),
                $index + 1,
                $extension ?: 'jpg'
            );

            $this->download($url, $destination);
            $results[] = new DownloadResult($destination, $url);

            $this->dispatchProgressEvent($jobId, $index + 1, $totalUrls);
        } catch (\Exception $e) {
            // Log the error but carry on with the remaining images
            error_log("Failed to download image {$url}: " . $e->getMessage());
        }
    }

    if (empty($results)) {
        throw new \RuntimeException('Failed to download any images');
    }

    return $results;
}
/**
 * Publishes a PageScrapingProgressed event for the given job on the event bus.
 *
 * @param string $jobId       Identifier of the job the progress belongs to.
 * @param int    $currentPage Page count passed to ScrapingProgress as current value.
 * @param int    $totalPages  Page count passed to ScrapingProgress as total.
 */
private function dispatchProgressEvent(string $jobId, int $currentPage, int $totalPages): void
{
    $progress = new ScrapingProgress($currentPage, $totalPages);
    $event = new PageScrapingProgressed($jobId, $progress);

    $this->eventBus->dispatch($event);
}
}

View File

@@ -8,6 +8,8 @@ use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
use App\Domain\Scraping\Domain\Model\ScrapingJob;
use App\Domain\Scraping\Domain\Model\ScrapingProgress;
use App\Domain\Scraping\Domain\Model\Source;
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest;
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingResult;
use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory;
use Symfony\Component\Messenger\MessageBusInterface;
use Ramsey\Uuid\Uuid;
@@ -20,7 +22,7 @@ abstract class AbstractScraper implements ScraperInterface
) {
}
abstract public function scrape(ScrapingJob $job): ScrapingJob;
abstract public function scrape(ScrapingRequest $request): ScrapingResult;
abstract protected function scrapePages(ScrapingJob $job, Source $source): array;

View File

@@ -15,74 +15,49 @@ use App\Domain\Scraping\Domain\Model\ValueObject\ChapterUrl;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Contracts\HttpClient\HttpClientInterface;
use Symfony\Component\Messenger\MessageBusInterface;
use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
use App\Domain\Scraping\Domain\Event\PageScrapingProgressed;
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest;
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingResult;
use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory;
use App\Domain\Scraping\Domain\Model\ScrapingProgress;
class HtmlScraper extends AbstractScraper
class HtmlScraper implements ScraperInterface
{
public function __construct(
ImageDownloaderInterface $imageDownloader,
MessageBusInterface $eventBus,
private readonly CbzGeneratorInterface $cbzGenerator,
private readonly HttpClientInterface $httpClient,
private readonly SourceRepositoryInterface $sourceRepository,
private readonly MangaRepositoryInterface $mangaRepository,
private readonly ImageDownloaderInterface $imageDownloader,
private readonly MessageBusInterface $eventBus,
private readonly HttpClientInterface $httpClient
) {
parent::__construct($imageDownloader, $eventBus);
}
public function scrape(ScrapingJob $job): ScrapingJob
public function scrape(ScrapingRequest $request): ScrapingResult
{
$sourceConfig = $this->sourceRepository->getById($job->getSourceId());
$tempDir = $this->createTempDirectory();
$scrappingParameters = $request->getScrapingParameters();
try {
$pages = $this->scrapePages($job, $sourceConfig);
$pages = !$scrappingParameters['nextPageSelector']
? $this->scrapeVerticalReader($request)
: $this->scrapeHorizontalReader($request);
foreach ($pages as $index => $imageUrl) {
$pageNumber = new PageNumber($index + 1);
$extension = pathinfo(parse_url($imageUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
$destination = sprintf(
'%s/%s.%s',
$tempDir->getPath(),
$pageNumber->getFormattedNumber(),
$extension
);
$this->downloadImage($imageUrl, $destination);
$job->addPage($pageNumber, new ImageUrl($imageUrl));
$this->dispatchProgressEvent($job, $index + 1, count($pages));
}
$cbzPath = $this->cbzGenerator->generate($job, $tempDir);
$job->cbzPath = $cbzPath;
$job->complete();
return $job;
return new ScrapingResult($pages, count($pages));
} catch (\Exception $e) {
$job->fail($e->getMessage());
return $job;
} finally {
$this->cleanupTempFiles($tempDir);
throw new \RuntimeException('Scraping failed: ' . $e->getMessage(), 0, $e);
}
}
protected function scrapePages(ScrapingJob $job, Source $source): array
public function supports(string $sourceType): bool
{
$scrappingParameters = $source->getScrappingParameters();
if (!$scrappingParameters['nextPageSelector']) {
return $this->scrapeVerticalReader($job, $source);
}
return $this->scrapeHorizontalReader($job, $source);
return 'html' === $sourceType;
}
private function scrapeVerticalReader(ScrapingJob $job, Source $sourceConfig): array
private function scrapeVerticalReader(ScrapingRequest $request): array
{
$html = $this->fetchHtml($this->buildChapterUrl($job, $sourceConfig));
$html = $this->fetchHtml($request->getChapterUrl());
$crawler = new Crawler($html);
$params = $request->getScrapingParameters();
return $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector'])
return $crawler->filter($params['imageSelector'])
->each(function ($node) {
return $this->cleanImageUrl(
$node->attr('src') ?: $node->attr('data-src')
@@ -90,21 +65,22 @@ class HtmlScraper extends AbstractScraper
});
}
private function scrapeHorizontalReader(ScrapingJob $job, Source $sourceConfig): array
private function scrapeHorizontalReader(ScrapingRequest $request): array
{
$pages = [];
$currentUrl = $this->buildChapterUrl($job, $sourceConfig);
$currentUrl = $request->getChapterUrl();
$params = $request->getScrapingParameters();
while ($currentUrl) {
$html = $this->fetchHtml($currentUrl);
$crawler = new Crawler($html);
$imageUrl = $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector'])
->attr('src') ?: $crawler->filter($sourceConfig->getScrappingParameters()['imageSelector'])
$imageUrl = $crawler->filter($params['imageSelector'])
->attr('src') ?: $crawler->filter($params['imageSelector'])
->attr('data-src');
if (!preg_match('/^https?:\/\//', $imageUrl)) {
$urlComponents = parse_url($sourceConfig->getScrappingParameters()['chapterUrlFormat']);
$urlComponents = parse_url($params['chapterUrlFormat']);
$scheme = $urlComponents['scheme'];
$host = $urlComponents['host'];
$imageUrl = $scheme.'://'.$host.'/'.ltrim($imageUrl, '/');
@@ -112,8 +88,10 @@ class HtmlScraper extends AbstractScraper
$pages[] = $this->cleanImageUrl($imageUrl);
$nextLink = $crawler->filter($sourceConfig->getScrappingParameters()['nextPageSelector']);
$nextLink = $crawler->filter($params['nextPageSelector']);
$currentUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;
$this->dispatchProgressEvent($request->getJobId(), count($pages), count($pages));
}
return $pages;
@@ -121,31 +99,30 @@ class HtmlScraper extends AbstractScraper
private function fetchHtml(string $url): string
{
$response = $this->httpClient->request('GET', $url);
try {
$response = $this->httpClient->request('GET', $url);
$statusCode = $response->getStatusCode();
if ($response->getStatusCode() >= 400) {
throw new \RuntimeException('Failed to fetch page: ' . $url);
if ($statusCode >= 300 && $statusCode < 400 || $statusCode === 404) {
throw new \RuntimeException('Chapter Not Found at ' . $url);
}
return $response->getContent();
} catch (\Exception $e) {
throw new \RuntimeException('Failed to fetch HTML: ' . $e->getMessage(), 0, $e);
}
return $response->getContent();
}
private function cleanImageUrl(string $url): string
{
// Logique de nettoyage d'URL d'image
return $url;
return preg_replace('/[\x00-\x1F\x7F]/', '', trim($url));
}
private function buildChapterUrl(ScrapingJob $job, Source $sourceConfig): string
private function dispatchProgressEvent(string $jobId, int $currentPage, int $totalPages): void
{
$manga = $this->mangaRepository->getById($job->getMangaId());
$chapterUrl = new ChapterUrl($sourceConfig->getScrappingParameters()['chapterUrlFormat'], $manga->getSlug(), $job->getChapterNumber());
return $chapterUrl->getUrl();
}
public function supports(string $sourceType): bool
{
return 'html' === $sourceType;
$this->eventBus->dispatch(new PageScrapingProgressed(
$jobId,
new ScrapingProgress($currentPage, $totalPages)
));
}
}