feat: refactorisation de la gestion du scraping des chapitres en remplaçant les identifiants de manga et de chapitre par un identifiant de chapitre unique, amélioration de la récupération des sources préférées et ajout de la gestion des erreurs pour les échecs de scraping.

This commit is contained in:
ext.jeremy.guillot@maxicoffee.domains
2025-04-03 16:34:30 +02:00
parent e29433bb0c
commit c9f1771522
15 changed files with 270 additions and 104 deletions

View File

@@ -5,9 +5,7 @@ namespace App\Domain\Scraping\Application\Command;
readonly class ScrapeChapter
{
public function __construct(
public string $mangaId,
public string $chapterNumber,
public string $sourceId
public string $chapterId
) {
}
}

View File

@@ -12,6 +12,7 @@ use App\Domain\Scraping\Domain\Contract\Service\ScraperInterface;
use App\Domain\Scraping\Domain\Event\ChapterScraped;
use App\Domain\Scraping\Domain\Event\ChapterScrapingFailed;
use App\Domain\Scraping\Domain\Model\ScrapingJob;
use App\Domain\Scraping\Domain\Model\Source;
use App\Domain\Scraping\Domain\Model\ValueObject\CbzGenerationRequest;
use App\Domain\Scraping\Domain\Model\ValueObject\ScrapingRequest;
use App\Domain\Scraping\Domain\Model\ValueObject\TempDirectory;
@@ -39,76 +40,141 @@ readonly class ScrapeChapterHandler
{
$job = null;
try {
// 1. Création du job dans sa propre transaction
$job = new ScrapingJob(
Uuid::uuid4()->toString(),
$command->mangaId,
$command->chapterNumber,
$command->sourceId
);
$job->start();
$this->jobRepository->save($job);
// 1. Récupération du chapitre
$chapter = $this->chapterRepository->getById($command->chapterId);
if (!$chapter) {
throw new \InvalidArgumentException("Chapter not found with ID: {$command->chapterId}");
}
// 2. Nouvelle transaction pour le reste des opérations
$this->entityManager->beginTransaction();
try {
// Préparation des données
$manga = $this->mangaRepository->getById($command->mangaId);
$chapter = $this->chapterRepository->getByMangaIdAndChapterNumber($command->mangaId, $command->chapterNumber);
$source = $this->sourceRepository->getById($command->sourceId);
// 2. Récupération du manga
$manga = $this->mangaRepository->getById($chapter->mangaId);
if (!$manga) {
throw new \InvalidArgumentException("Manga not found with ID: {$chapter->mangaId}");
}
// 3. Scraping des URLs
$scrapingRequest = new ScrapingRequest(
'html',
$source->buildChapterUrl($manga->getSlug(), $command->chapterNumber),
$source->getScrappingParameters()
// 3. Détermination des sources à utiliser
$sources = $this->getSourcesToTry($manga);
if (empty($sources)) {
throw new \InvalidArgumentException("No sources available for scraping");
}
// 4. Essai de scraping sur chaque source jusqu'à succès
$success = false;
$lastException = null;
foreach ($sources as $source) {
$job = new ScrapingJob(
Uuid::uuid4()->toString(),
$chapter->mangaId,
$chapter->chapterNumber,
$source->getId()->getValue()
);
$scrapingResult = $this->scraper->scrape($scrapingRequest);
// 4. Téléchargement des images
$tempDir = new TempDirectory();
$downloadResults = $this->imageDownloader->downloadBatch(
$scrapingResult->getImageUrls(),
$tempDir,
$job->id
);
// 5. Génération du CBZ
$cbzRequest = new CbzGenerationRequest(
$manga->getTitle(),
$manga->getPublicationYear(),
$chapter->volumeNumber,
$command->chapterNumber,
$tempDir,
array_map(fn($r) => $r->getLocalPath(), $downloadResults)
);
$cbzPath = $this->cbzGenerator->generate($cbzRequest);
// 6. Mise à jour et sauvegarde
$chapter->cbzPath = $cbzPath->getPath();
$this->chapterRepository->save($chapter);
$job->complete();
$job->start();
$this->jobRepository->save($job);
$this->entityManager->commit();
try {
$this->entityManager->beginTransaction();
$this->eventBus->dispatch(new ChapterScraped($job->id));
// 5. Scraping des URLs
$scrapingRequest = new ScrapingRequest(
'html',
$source->buildChapterUrl($manga->getSlug(), $chapter->chapterNumber),
$source->getScrappingParameters()
);
// 7. Nettoyage
$tempDir->cleanup();
} catch (\Exception $e) {
$this->entityManager->rollback();
throw $e;
$scrapingResult = $this->scraper->scrape($scrapingRequest);
// 6. Téléchargement des images
$tempDir = new TempDirectory();
$downloadResults = $this->imageDownloader->downloadBatch(
$scrapingResult->getImageUrls(),
$tempDir,
$job->id
);
// 7. Génération du CBZ
$cbzRequest = new CbzGenerationRequest(
$manga->getTitle(),
$manga->getPublicationYear(),
$chapter->volumeNumber,
$chapter->chapterNumber,
$tempDir,
array_map(fn($r) => $r->getLocalPath(), $downloadResults)
);
$cbzPath = $this->cbzGenerator->generate($cbzRequest);
// 8. Mise à jour et sauvegarde
$chapter->cbzPath = $cbzPath->getPath();
$this->chapterRepository->save($chapter);
$job->complete();
$this->jobRepository->save($job);
$this->entityManager->commit();
$this->eventBus->dispatch(new ChapterScraped($job->id));
// 9. Nettoyage
$tempDir->cleanup();
// Scraping réussi, pas besoin d'essayer d'autres sources
$success = true;
break;
} catch (\Exception $e) {
$this->entityManager->rollback();
if (isset($job)) {
$job->fail($e->getMessage());
$this->jobRepository->save($job);
}
$lastException = $e;
// Continuer avec la source suivante
}
}
// Si toutes les sources ont échoué
if (!$success) {
$errorMessage = $lastException ? $lastException->getMessage() : "Failed to scrape chapter from all available sources";
$this->eventBus->dispatch(new ChapterScrapingFailed($chapter->mangaId, $chapter->chapterNumber, $errorMessage));
}
} catch (\Exception $e) {
if (isset($job)) {
$job->fail($e->getMessage());
$this->jobRepository->save($job);
}
$this->eventBus->dispatch(new ChapterScrapingFailed($command->mangaId, $command->chapterNumber, $e->getMessage()));
$this->eventBus->dispatch(new ChapterScrapingFailed($chapter->mangaId ?? 'unknown', $chapter->chapterNumber ?? 'unknown', $e->getMessage()));
}
}
/**
 * Builds the list of sources the handler should attempt for the given manga.
 *
 * When the manga declares preferred sources, each preferred ID is looked up
 * in the source repository and the ones that resolve are returned. When the
 * manga has no preferences, or none of its preferred IDs resolves, every
 * source returned by the repository's getAll() is used instead.
 *
 * @param \App\Domain\Scraping\Domain\Model\Manga $manga manga whose preferences are consulted
 * @return Source[]
 */
private function getSourcesToTry(\App\Domain\Scraping\Domain\Model\Manga $manga): array
{
    if (!$manga->hasPreferredSources()) {
        return $this->sourceRepository->getAll();
    }

    // Lookups that find nothing yield null; array_filter drops those and
    // array_values re-indexes the remaining sources into a plain list.
    $resolvedSources = array_values(array_filter(array_map(
        fn ($preferredId) => $this->sourceRepository->getById($preferredId),
        $manga->getPreferredSources(),
    )));

    if ($resolvedSources === []) {
        // None of the preferred IDs matched a known source: fall back to all of them.
        return $this->sourceRepository->getAll();
    }

    return $resolvedSources;
}
}

View File

@@ -6,6 +6,7 @@ use App\Domain\Scraping\Domain\Model\Chapter;
/**
 * Storage contract for Chapter domain models.
 */
interface ChapterRepositoryInterface
{
/**
 * Looks up a chapter by its identifier.
 *
 * The nullable return type lets an implementation answer null when no chapter matches.
 */
public function getById(string $id): ?Chapter;
/**
 * Looks up a chapter by the manga it belongs to and its chapter number.
 *
 * NOTE(review): the return type is non-nullable, so implementations presumably
 * throw when nothing matches — confirm against the concrete repositories.
 */
public function getByMangaIdAndChapterNumber(string $mangaId, int $chapterNumber): Chapter;
/**
 * Stores the given chapter.
 */
public function save(Chapter $chapter): void;
}

View File

@@ -7,4 +7,9 @@ use App\Domain\Scraping\Domain\Model\Source;
/**
 * Read contract for Source domain models.
 */
interface SourceRepositoryInterface
{
/**
 * Looks up a source by its identifier.
 *
 * The nullable return type lets an implementation answer null when no source matches.
 */
public function getById(string $id): ?Source;
/**
 * Returns every source the repository knows about.
 *
 * @return Source[]
 */
public function getAll(): array;
}

View File

@@ -4,6 +4,9 @@ namespace App\Domain\Scraping\Domain\Model;
class Manga
{
/**
* @param string[] $preferredSources
*/
public function __construct(
private readonly string $id,
private readonly string $title,
@@ -11,6 +14,7 @@ class Manga
private readonly string $description,
private readonly string $author,
private readonly string $publicationYear,
private readonly array $preferredSources = [],
) {
}
@@ -43,4 +47,20 @@ class Manga
{
return $this->publicationYear;
}
/**
 * Returns the preferred sources stored on this manga, exactly as they were
 * passed to the constructor (an empty array when none were given).
 *
 * @return string[]
 */
public function getPreferredSources(): array
{
    return $this->preferredSources;
}
/**
 * Tells whether this manga has at least one preferred source.
 */
public function hasPreferredSources(): bool
{
    return $this->preferredSources !== [];
}
}

View File

@@ -21,15 +21,9 @@ use Symfony\Component\Validator\Constraints as Assert;
readonly class ScrapeChapterRequest
{
public function __construct(
#[ApiProperty(description: 'ID du manga')]
#[ApiProperty(description: 'ID du chapitre à scraper')]
#[Assert\NotBlank]
public string $mangaId,
#[ApiProperty(description: 'Numéro du chapitre')]
#[Assert\NotBlank]
public string $chapterNumber,
#[ApiProperty(description: 'ID de la source')]
#[Assert\NotBlank]
public string $sourceId,
public string $chapterId,
) {
}
}

View File

@@ -22,9 +22,7 @@ final class ScrapeChapterStateProcessor implements ProcessorInterface
{
$this->commandBus->dispatch(
new ScrapeChapter(
$data->mangaId,
$data->chapterNumber,
$data->sourceId
$data->chapterId
)
);
}

View File

@@ -13,6 +13,26 @@ readonly class LegacyChapterRepository implements ChapterRepositoryInterface
private EntityManagerInterface $entityManager,
) {}
/**
 * Looks up a chapter by its identifier.
 *
 * Returns null when the entity repository finds nothing for $id; otherwise
 * the entity's fields are copied onto a new domain Chapter.
 */
public function getById(string $id): ?Chapter
{
    $entityRepository = $this->entityManager->getRepository(EntityChapter::class);
    $entity = $entityRepository->find($id);

    if ($entity) {
        return new Chapter(
            id: $entity->getId(),
            mangaId: $entity->getManga()->getId(),
            chapterNumber: $entity->getNumber(),
            volumeNumber: $entity->getVolume(),
            cbzPath: $entity->getCbzPath(),
        );
    }

    return null;
}
/**
* @throws ChapterNotFoundException
*/
@@ -32,6 +52,7 @@ readonly class LegacyChapterRepository implements ChapterRepositoryInterface
mangaId: $chapterEntity->getManga()->getId(),
chapterNumber: $chapterEntity->getNumber(),
volumeNumber: $chapterEntity->getVolume(),
cbzPath: $chapterEntity->getCbzPath(),
);
}

View File

@@ -19,13 +19,24 @@ readonly class LegacyMangaRepository implements MangaRepositoryInterface
/** @var EntityManga|null $mangaEntity */
$mangaEntity = $this->entityManager->getRepository(EntityManga::class)->find($id);
return $mangaEntity ? new Manga(
if (!$mangaEntity) {
return null;
}
// Récupération des sources préférées
$preferredSourceIds = [];
foreach ($mangaEntity->getPreferredSources() as $source) {
$preferredSourceIds[] = $source->getId();
}
return new Manga(
$mangaEntity->getId(),
$mangaEntity->getTitle(),
$mangaEntity->getSlug(),
$mangaEntity->getDescription(),
$mangaEntity->getAuthor(),
$mangaEntity->getPublicationYear(),
) : null;
$mangaEntity->getDescription() ?? '',
$mangaEntity->getAuthor() ?? '',
$mangaEntity->getPublicationYear() ?? '',
$preferredSourceIds,
);
}
}

View File

@@ -46,4 +46,43 @@ readonly class LegacySourceRepository implements SourceRepositoryInterface
updatedAt: new DateTimeImmutable()
);
}
/**
 * Loads every ContentSource entity and converts each one to a Source model.
 *
 * @return Source[]
 */
public function getAll(): array
{
    /** @var ContentSource[] $entities */
    $entities = $this->entityManager->getRepository(ContentSource::class)->findAll();

    // array_values keeps the result a zero-indexed list whatever keys findAll() used.
    return array_values(array_map(
        fn (ContentSource $entity): Source => $this->convertEntityToModel($entity),
        $entities,
    ));
}
/**
 * Maps a ContentSource entity onto a Source domain model.
 *
 * The entity's selectors, URL format and scraping type are gathered into the
 * model's scrapping parameters. The model is always built as active, and its
 * creation and update timestamps are set to the time of the conversion.
 */
private function convertEntityToModel(ContentSource $source): Source
{
    $parameters = [
        'imageSelector' => $source->getImageSelector(),
        'nextPageSelector' => $source->getNextPageSelector(),
        'chapterUrlFormat' => $source->getChapterUrlFormat(),
        'scrapingType' => $source->getScrapingType(),
        'chapterSelector' => $source->getChapterSelector(),
    ];

    return new Source(
        id: new SourceId($source->getId()),
        name: $source->getCleanBaseUrl(),
        description: 'Legacy Source: ' . $source->getBaseUrl(),
        baseUrl: $source->getBaseUrl(),
        scrappingParameters: $parameters,
        isActive: true,
        createdAt: new DateTimeImmutable(),
        updatedAt: new DateTimeImmutable(),
    );
}
}

View File

@@ -8,6 +8,12 @@ use App\Domain\Scraping\Domain\Model\Chapter;
class InMemoryChapterRepository implements ChapterRepositoryInterface
{
private array $chapters = [];
/**
 * Returns the chapter stored under the given ID, or null when there is none.
 */
public function getById(string $id): ?Chapter
{
    if (isset($this->chapters[$id])) {
        return $this->chapters[$id];
    }

    return null;
}
public function getByMangaIdAndChapterNumber(string $mangaId, int $chapterNumber): Chapter
{
foreach ($this->chapters as $chapter) {

View File

@@ -16,19 +16,27 @@ class InMemoryMangaRepository implements MangaRepositoryInterface
'test-manga',
'Test Manga',
'test-manga',
'2024',
'A test manga description',
'Test Author',
'A test manga description'
'2024',
[] // Pas de sources préférées par défaut
);
// Ajoute un manga avec des sources préférées pour les tests
$this->mangas['test-manga-with-sources'] = new Manga(
'test-manga-with-sources',
'Test Manga With Sources',
'test-manga-with-sources',
'A test manga with preferred sources',
'Test Author',
'2024',
['test-source'] // Une source préférée
);
}
public function getById(string $id): Manga
public function getById(string $id): ?Manga
{
if (!isset($this->mangas[$id])) {
throw new \RuntimeException('Manga not found');
}
return $this->mangas[$id];
return $this->mangas[$id] ?? null;
}
public function save(Manga $manga): void

View File

@@ -22,7 +22,9 @@ class InMemorySourceRepository implements SourceRepositoryInterface
[
'imageSelector' => 'img.manga-image',
'nextPageSelector' => null,
'chapterUrlFormat' => 'https://example.com/manga/{slug}/chapter-{chapterNumber}'
'chapterUrlFormat' => 'https://example.com/manga/{slug}/chapter-{chapterNumber}',
'scrapingType' => 'html',
'chapterSelector' => '.chapter-item'
],
true,
new DateTimeImmutable(),
@@ -30,13 +32,17 @@ class InMemorySourceRepository implements SourceRepositoryInterface
);
}
public function getById(string $id): Source
public function getById(string $id): ?Source
{
if (!isset($this->sources[$id])) {
throw new \RuntimeException('Source not found');
}
return $this->sources[$id] ?? null;
}
return $this->sources[$id];
/**
 * Returns every stored source as a zero-indexed list, discarding the keys
 * the sources are stored under.
 *
 * @return Source[]
 */
public function getAll(): array
{
    $sourceList = array_values($this->sources);

    return $sourceList;
}
public function save(Source $source): void

View File

@@ -74,9 +74,7 @@ class ScrapeChapterHandlerTest extends TestCase
public function testHandleSuccessfully(): void
{
$command = new ScrapeChapter(
mangaId: 'test-manga',
chapterNumber: '2',
sourceId: 'test-source'
chapterId: '1'
);
$this->handler->handle($command);
@@ -90,16 +88,14 @@ class ScrapeChapterHandlerTest extends TestCase
$this->assertInstanceOf(ChapterScraped::class, $dispatchedMessages[0]);
$this->assertEquals($job->id, $dispatchedMessages[0]->getJobId());
$chapter = $this->chapterRepository->getByMangaIdAndChapterNumber('test-manga', 2);
$chapter = $this->chapterRepository->getById('1');
$this->assertNotNull($chapter->cbzPath);
}
public function testHandleThrowsException(): void
{
$command = new ScrapeChapter(
mangaId: 'test-manga',
chapterNumber: '2',
sourceId: 'test-source'
chapterId: '1'
);
$exception = new \Exception('Scraping failed');

View File

@@ -23,9 +23,7 @@ class ScrapeChapterTest extends AbstractApiTestCase
{
// Given
$payload = [
'chapterNumber' => 'chapter-123',
'sourceId' => 'source-456',
'mangaId' => 'manga-789',
'chapterId' => 'chapter-123',
];
// When
@@ -43,15 +41,14 @@ class ScrapeChapterTest extends AbstractApiTestCase
/** @var ScrapeChapter $message */
$message = $messages[0];
$this->assertInstanceOf(ScrapeChapter::class, $message);
$this->assertEquals('chapter-123', $message->chapterId);
}
public function testInitiateChapterScrapingWithInvalidPayload(): void
{
// Given
$payload = [
'chapterNumber' => '',
'sourceId' => 'source-456',
'mangaId' => 'manga-789',
'chapterId' => '',
];
// When
@@ -65,7 +62,7 @@ class ScrapeChapterTest extends AbstractApiTestCase
$this->assertJsonContains([
'violations' => [
[
'propertyPath' => 'chapterNumber',
'propertyPath' => 'chapterId',
'message' => 'This value should not be blank.',
],
],