From fafff5014cdb5e791ee6f29c7b1673cbaae00273 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Guillot?= Date: Sun, 21 Jul 2024 19:08:46 +0200 Subject: [PATCH] Added: - Refactor MangaScraperService (not used everywhere now) - Added JavascriptScraper.php - Added alternatives slugs in Manga.php - Improvement in manga edit form --- assets/controllers/collection_controller.js | 26 +++ config/services.yaml | 26 +++ migrations/Version20240721142304.php | 34 ++++ migrations/Version20240721145225.php | 32 ++++ src/Controller/MangaController.php | 23 +++ src/Controller/SettingsController.php | 5 +- src/Entity/ContentSource.php | 15 ++ src/Entity/Manga.php | 15 ++ src/Form/ContentSourceType.php | 4 + src/Form/MangaEditType.php | 95 ++++++++++ src/Service/ChapterUrlGenerator.php | 2 +- src/Service/MangaScraperService.php | 189 ++++++++++++++++++- src/Service/Scraper/AbstractScraper.php | 110 +++++++++++ src/Service/Scraper/HtmlScraper.php | 197 ++++++++++++++++++++ src/Service/Scraper/JavascriptScraper.php | 188 +++++++++++++++++++ src/Service/Scraper/MangaScraperService.php | 28 +++ src/Service/Scraper/MangadexScraper.php | 89 +++++++++ src/Service/Scraper/ScraperFactory.php | 25 +++ src/Service/Scraper/ScraperInterface.php | 13 ++ templates/components/Modal.html.twig | 2 +- templates/manga/_manga_details.html.twig | 90 +++++++-- 21 files changed, 1180 insertions(+), 28 deletions(-) create mode 100644 assets/controllers/collection_controller.js create mode 100644 migrations/Version20240721142304.php create mode 100644 migrations/Version20240721145225.php create mode 100644 src/Form/MangaEditType.php create mode 100644 src/Service/Scraper/AbstractScraper.php create mode 100644 src/Service/Scraper/HtmlScraper.php create mode 100644 src/Service/Scraper/JavascriptScraper.php create mode 100644 src/Service/Scraper/MangaScraperService.php create mode 100644 src/Service/Scraper/MangadexScraper.php create mode 100644 src/Service/Scraper/ScraperFactory.php create mode 100644 src/Service/Scraper/ScraperInterface.php diff --git a/assets/controllers/collection_controller.js b/assets/controllers/collection_controller.js new file mode 100644 index 0000000..ac5a6a1 --- /dev/null +++ b/assets/controllers/collection_controller.js @@ -0,0 +1,26 @@ +import {Controller} from '@hotwired/stimulus'; + +/* +* The following line makes this controller "lazy": it won't be downloaded until needed +* See https://github.com/symfony/stimulus-bridge#lazy-controllers +*/ +/* stimulusFetch: 'lazy' */ +export default class extends Controller { + static targets = ['container', 'template', 'item']; + + connect() { + this.index = this.itemTargets.length; + } + + add(event) { + event.preventDefault(); + const template = this.templateTarget.innerHTML.replace(/__name__/g, this.index); + this.containerTarget.insertAdjacentHTML('beforeend', template); + this.index++; + } + + remove(event) { + event.preventDefault(); + event.target.closest('.collection-item').remove(); + } +} diff --git a/config/services.yaml b/config/services.yaml index f6bfd50..4b5ed1c 100644 --- a/config/services.yaml +++ b/config/services.yaml @@ -76,3 +76,29 @@ services: App\Service\MangadexProvider: arguments: $client: '@App\Client\MangadexClient' + + # Scrapers + App\Service\Scraper\HtmlScraper: + arguments: + $projectDir: '%kernel.project_dir%' + tags: [ 'app.scraper' ] + + App\Service\Scraper\JavascriptScraper: + arguments: + $projectDir: '%kernel.project_dir%' + tags: [ 'app.scraper' ] + + App\Service\Scraper\MangadexScraper: + arguments: + $projectDir: '%kernel.project_dir%' + tags: [ 'app.scraper' ] + + # Scraper Factory + App\Service\Scraper\ScraperFactory: + arguments: + $scrapers: !tagged_iterator app.scraper + + # Manga Scraper Service + App\Service\Scraper\MangaScraperService: + arguments: + $scraperFactory: '@App\Service\Scraper\ScraperFactory' diff --git a/migrations/Version20240721142304.php b/migrations/Version20240721142304.php new file mode 100644 index 0000000..fa9cff5 --- /dev/null +++ b/migrations/Version20240721142304.php @@ -0,0 +1,34 @@ +addSql('ALTER TABLE content_source ADD chapter_selector VARCHAR(255) DEFAULT NULL'); + $this->addSql('ALTER TABLE manga ALTER monitored DROP DEFAULT'); + } + + public function down(Schema $schema): void + { + // this down() migration is auto-generated, please modify it to your needs + $this->addSql('CREATE SCHEMA public'); + $this->addSql('ALTER TABLE manga ALTER monitored SET DEFAULT false'); + $this->addSql('ALTER TABLE content_source DROP chapter_selector'); + } +} diff --git a/migrations/Version20240721145225.php b/migrations/Version20240721145225.php new file mode 100644 index 0000000..1eb198a --- /dev/null +++ b/migrations/Version20240721145225.php @@ -0,0 +1,32 @@ +addSql('ALTER TABLE manga ADD alternative_slugs JSON DEFAULT NULL'); + } + + public function down(Schema $schema): void + { + // this down() migration is auto-generated, please modify it to your needs + $this->addSql('CREATE SCHEMA public'); + $this->addSql('ALTER TABLE manga DROP alternative_slugs'); + } +} diff --git a/src/Controller/MangaController.php b/src/Controller/MangaController.php index aa0fc3c..3bfbbcd 100644 --- a/src/Controller/MangaController.php +++ b/src/Controller/MangaController.php @@ -4,6 +4,7 @@ namespace App\Controller; use App\Entity\Chapter; use App\Entity\Manga; +use App\Form\MangaEditType; use App\Manager\Toolbar\Factory\ToolbarFactory; use App\Message\DownloadChapter; use App\Message\RefreshMetadata; @@ -79,9 +80,12 @@ class MangaController extends AbstractController throw new NotFoundHttpException("Le manga demandé n'existe pas."); } + $form = $this->createForm(MangaEditType::class, $manga); + return $this->render('manga/show_chapters.html.twig', [ 'manga' => $manga, 'toolbar' => $this->toolbarFactory->createToolbar('chapter_list', ['mangaId' => $manga->getId(), 'isMonitored' => (int) $manga->isMonitored()])->getGroups(), + 'form' => $form->createView(), ]); } @@ -101,6 +105,25 @@ class MangaController extends AbstractController } } + #[Route('/manga/{id}/edit', name: 'app_manga_edit', methods: ['POST'])] + public function edit(Request $request, Manga $manga, EntityManagerInterface $entityManager): JsonResponse|Response + { + $form = $this->createForm(MangaEditType::class, $manga); + $form->handleRequest($request); + + if ($form->isSubmitted() && $form->isValid()) { + $entityManager->flush(); + + return $this->redirectToRoute('app_manga_show', ['mangaSlug' => $manga->getSlug()]); + } + + $errors = []; + foreach ($form->getErrors(true) as $error) { + $errors[] = $error->getMessage(); + } + + return new JsonResponse(['errors' => $errors], 400); + } public function _chaptersByManga(int $id): Response { diff --git a/src/Controller/SettingsController.php b/src/Controller/SettingsController.php index e692f91..91337f2 100644 --- a/src/Controller/SettingsController.php +++ b/src/Controller/SettingsController.php @@ -5,8 +5,9 @@ namespace App\Controller; use App\Entity\ContentSource; use App\Form\ContentSourceType; use App\Repository\ContentSourceRepository; -use App\Service\MangaScraperService; + use App\Service\NotificationService; +use App\Service\Scraper\MangaScraperService; use Doctrine\ORM\EntityManagerInterface; use GuzzleHttp\Exception\GuzzleException; use Symfony\Bundle\FrameworkBundle\Controller\AbstractController; @@ -97,7 +98,7 @@ class SettingsController extends AbstractController $chapterNumber = $request->request->get('chapterNumber'); try { - $scrapedData = $this->mangaScraperService->testScrapingHtml($mangaSlug, $chapterNumber, $contentSource); + $scrapedData = $this->mangaScraperService->testScraping($mangaSlug, $chapterNumber, $contentSource); }catch (\Exception $e){ $this->notificationService->sendUpdate(['status' => 'error', 'message' => $e->getMessage()]); return new JsonResponse([ diff --git a/src/Entity/ContentSource.php b/src/Entity/ContentSource.php index cc1ee8c..b98d8ca 100644 --- a/src/Entity/ContentSource.php +++ b/src/Entity/ContentSource.php @@ -33,6 +33,9 @@ class ContentSource #[ORM\Column(length: 255)] private ?string $scrapingType = null; + #[ORM\Column(length: 255, nullable: true)] + private ?string $ChapterSelector = null; + public function getId(): ?int { return $this->id; @@ -103,4 +106,16 @@ class ContentSource return $this; } + + public function getChapterSelector(): ?string + { + return $this->ChapterSelector; + } + + public function setChapterSelector(?string $ChapterSelector): static + { + $this->ChapterSelector = $ChapterSelector; + + return $this; + } } diff --git a/src/Entity/Manga.php b/src/Entity/Manga.php index 41cf847..7dcd28c 100644 --- a/src/Entity/Manga.php +++ b/src/Entity/Manga.php @@ -59,6 +59,9 @@ class Manga #[ORM\Column] private ?bool $monitored = null; + #[ORM\Column(type: Types::JSON, nullable: true)] + private ?array $AlternativeSlugs = null; + public function __construct() { $this->chapters = new ArrayCollection(); @@ -265,4 +268,16 @@ class Manga return $this; } + + public function getAlternativeSlugs(): ?array + { + return $this->AlternativeSlugs; + } + + public function setAlternativeSlugs(?array $AlternativeSlugs): static + { + $this->AlternativeSlugs = $AlternativeSlugs; + + return $this; + } } diff --git a/src/Form/ContentSourceType.php b/src/Form/ContentSourceType.php index 1341c1c..a693d1d 100644 --- a/src/Form/ContentSourceType.php +++ b/src/Form/ContentSourceType.php @@ -28,6 +28,10 @@ class ContentSourceType extends AbstractType 'label' => 'Next Page Selector (let empty if vertical reader)', 'required' => false, ]) + ->add('ChapterSelector', TextType::class, [ + 'label' => 'Chapter Selector (required for Javascript scraping)', + 'required' => false, + ]) ->add('scrapingType', ChoiceType::class, [ 'label' => 'Scraping Type', 'choices' => [ diff --git a/src/Form/MangaEditType.php b/src/Form/MangaEditType.php new file mode 100644 index 0000000..5a48968 --- /dev/null +++ b/src/Form/MangaEditType.php @@ -0,0 +1,95 @@ +add('title', TextType::class, [ + 'label' => 'Titre', + 'attr' => ['class' => 'w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-indigo-500 focus:border-indigo-500'] + ]) + ->add('slug', TextType::class, [ + 'label' => 'Slug', + 'attr' => [ + 'readonly' => true, + 'class' => 'bg-gray-100 w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-indigo-500 focus:border-indigo-500' + ], + ]) + ->add('alternativeSlugs', CollectionType::class, [ + 'entry_type' => TextType::class, + 'allow_add' => true, + 'allow_delete' => true, + 'by_reference' => false, + 'label' => false, + 'prototype' => true, + 'entry_options' => ['attr' => ['class' => 'w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-indigo-500 focus:border-indigo-500'], 'label' => false], + 'required' => false, + ]) + ->add('publicationYear', NumberType::class, [ + 'label' => 'Année de publication', + 'attr' => ['class' => 'w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-indigo-500 focus:border-indigo-500'] + ]) + ->add('description', TextareaType::class, [ + 'label' => 'Description', + 'attr' => ['class' => 'w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-indigo-500 focus:border-indigo-500', 'rows' => 8] + ]) + ->add('genres', CollectionType::class, [ + 'entry_type' => TextType::class, + 'allow_add' => true, + 'allow_delete' => true, + 'by_reference' => false, + 'label' => 'Genres', + 'entry_options' => ['attr' => ['class' => 'w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-indigo-500 focus:border-indigo-500']], + 'required' => false, + ]) + ->add('rating', NumberType::class, [ + 'label' => 'Note', + 'attr' => ['class' => 'w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-indigo-500 focus:border-indigo-500'], + 'required' => false, + ]) + ->add('author', TextType::class, [ + 'label' => 'Auteur', + 'attr' => ['class' => 'w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-indigo-500 focus:border-indigo-500'], + 'required' => false, + ]) + ->add('status', TextType::class, [ + 'label' => 'Statut', + 'attr' => ['class' => 'w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-indigo-500 focus:border-indigo-500'], + 'required' => false, + ]) + ; + + $builder->addEventListener(FormEvents::PRE_SUBMIT, function (FormEvent $event) { + $data = $event->getData(); + $manga = $event->getForm()->getData(); + + if ($manga && $manga->getSlug()) { + $data['slug'] = $manga->getSlug(); + } + + $event->setData($data); + }); + } + + public function configureOptions(OptionsResolver $resolver): void + { + $resolver->setDefaults([ + 'data_class' => Manga::class, + ]); + } +} diff --git a/src/Service/ChapterUrlGenerator.php b/src/Service/ChapterUrlGenerator.php index 0529d14..5ec6829 100644 --- a/src/Service/ChapterUrlGenerator.php +++ b/src/Service/ChapterUrlGenerator.php @@ -26,7 +26,7 @@ class ChapterUrlGenerator private function validateUrlFormat(string $format): void { - if (!str_contains($format, '{slug}') || !str_contains($format, '{chapterNumber}')) { + if (!str_contains($format, '{slug}')) { throw new InvalidArgumentException("The URL format must contain both {slug} and {chapterNumber} placeholders."); } } diff --git a/src/Service/MangaScraperService.php b/src/Service/MangaScraperService.php index ba4db85..3038ad5 100644 --- a/src/Service/MangaScraperService.php +++ b/src/Service/MangaScraperService.php @@ -6,8 +6,12 @@ use App\Entity\Chapter; use App\Entity\Manga; use App\Entity\ContentSource; use App\Event\PageScrappingProgressEvent; +use App\Repository\ChapterRepository; +use App\Repository\MangaRepository; use Doctrine\ORM\EntityManagerInterface; use Exception; +use Facebook\WebDriver\Remote\RemoteWebElement; +use Facebook\WebDriver\WebDriverExpectedCondition; use GuzzleHttp\Client; use GuzzleHttp\Exception\GuzzleException; use GuzzleHttp\Exception\RequestException; @@ -18,6 +22,8 @@ use Symfony\Component\Routing\Route; use Symfony\Component\Routing\RouteCollection; use Symfony\Contracts\EventDispatcher\EventDispatcherInterface; +use Symfony\Component\Panther\Client as PantherClient; + class MangaScraperService { const string PUBLIC_CBZ = '/public/cbz'; @@ -25,7 +31,8 @@ class MangaScraperService public function __construct( private readonly string $projectDir, private readonly EventDispatcherInterface $eventDispatcher, - private readonly EntityManagerInterface $entityManager + private readonly EntityManagerInterface $entityManager, + private readonly MangaRepository $mangaRepository, ) { } @@ -140,12 +147,160 @@ class MangaScraperService return true; } - private function scrapeChapterJavaScript(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool + private function scrapeChapterJavascript(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool { - $chapterUrl = $mangaSource->getChapterUrl($manga->getTitle(), $chapter->getNumber()); - $imgUrls = $this->fetchImagesUsingPuppeteer($chapterUrl, $mangaSource->getImageSelector(), $mangaSource->getNextPageSelector()); + $pantherClient = PantherClient::createChromeClient(); + $chapterUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapter->getNumber()); - return false; + $pantherClient->request('GET', $chapterUrl); + + // Sélection du chapitre dans le menu déroulant + try { + $crawler = $pantherClient->waitFor('body'); + $select = $crawler->filter('#selectChapitres'); + + if ($select->count() > 0) { + $chapterNumber = $chapter->getNumber(); + $options = $select->filter('option'); + $targetindex = null; + + /** @var RemoteWebElement $option */ + foreach ($options->getIterator() as $index => $option) { + $optionText = $option->getText(); + // Recherche plus flexible du numéro de chapitre + if (preg_match("/\b{$chapterNumber}\b/", $optionText)) { + $targetIndex = $index; + break; + } + } + + + if ($targetIndex !== null) { + $pantherClient->executeScript(" + var select = document.querySelector('#selectChapitres'); + select.selectedIndex = $targetIndex; + select.dispatchEvent(new Event('change')); + "); + + // Attendre que la page se mette à jour après la sélection + $pantherClient->wait(60000)->until( // 60 secondes de timeout + function ($driver) { + return $driver->executeScript(" + var scansPlacement = document.querySelector('#scansPlacement'); + if (!scansPlacement) return false; + + var lazyImages = scansPlacement.querySelectorAll('img.lazy'); + var loadingGif = scansPlacement.querySelector('img[src*=\"loading_scans.gif\"]'); + + // Vérifier que toutes les images lazy sont chargées et que le GIF de chargement n'est plus présent + var allImagesLoaded = Array.from(lazyImages).every(img => img.complete && img.naturalWidth > 0); + + return lazyImages.length > 0 && allImagesLoaded && !loadingGif; + "); + } + ); + } else { + throw new \Exception("Chapitre $chapterNumber non trouvé dans le menu déroulant"); + } + } + } catch (\Exception $e) { +// $this->logger->warning('Erreur lors de la sélection du chapitre : ' . $e->getMessage()); + $pantherClient->close(); + return false; + } + + $pageData = []; + + try { + if ($mangaSource->getNextPageSelector() === null) { + // Lecteur vertical + $pageData = $this->scrapeVerticalReaderJavascript($pantherClient, $mangaSource, $chapter); + } else { + // Lecteur horizontal + $pageData = $this->scrapeHorizontalReaderJavascript($pantherClient, $mangaSource, $chapter); + } + } catch (\Exception $e) { + throw $e; +// $this->logger->warning('Erreur lors du scraping du chapitre ' . $chapter->getNumber() . ' du manga ' . $manga->getTitle() . ': ' . $e->getMessage()); + } finally { + $pantherClient->close(); + } + + return $pageData; + } + + private function scrapeVerticalReaderJavascript(PantherClient $pantherClient, ContentSource $mangaSource, Chapter $chapter): array + { + $pageData = []; + $pageNumber = 1; + + $crawler = $pantherClient->waitFor($mangaSource->getImageSelector()); + $images = $crawler->filter($mangaSource->getImageSelector()); + + foreach ($images->getIterator() as $image) { + $imageUrl = $image->getAttribute('src') ?: $image->getAttribute('data-src'); + + $pageData[] = [ + 'image_url' => $this->cleanImageUrl($imageUrl), + 'page_number' => $pageNumber, + ]; + + $event = new PageScrappingProgressEvent($chapter->getId(), $pageNumber, $images->count()); + $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME); + + $pageNumber++; + } + + return $pageData; + } + + private function scrapeHorizontalReaderJavascript(PantherClient $pantherClient, ContentSource $mangaSource, Chapter $chapter): array + { + $pageData = []; + $pageNumber = 1; + + while (true) { + try { + $crawler = $pantherClient->waitFor($mangaSource->getImageSelector()); + + $imageElement = $crawler->filter($mangaSource->getImageSelector())->first(); + if ($imageElement->count() === 0) { + break; // Fin du chapitre + } + + $imageUrl = $imageElement->attr('src') ?: $imageElement->attr('data-src'); + + $pageData[] = [ + 'image_url' => $this->cleanImageUrl($imageUrl), + 'page_number' => $pageNumber, + ]; + + $event = new PageScrappingProgressEvent($chapter->getId(), $pageNumber, 0); + $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME); + + // Passer à la page suivante + $nextButton = $pantherCrawler->filter($mangaSource->getNextPageSelector()); + if ($nextButton->count() === 0) { + break; // Pas de bouton suivant, fin du chapitre + } + + $nextButton->click(); + + // Attendre que la page change + $pantherClient->waitFor($mangaSource->getImageSelector(), 10); + + // Mettre à jour le crawler avec le nouveau contenu de la page + $pantherCrawler = $pantherClient->refreshCrawler(); + + $pageNumber++; + } catch (\Exception $e) { + throw $e; +// $this->logger->warning('Erreur lors du scraping de la page ' . $pageNumber . ' du chapitre ' . $chapter->getNumber() . ': ' . $e->getMessage()); + break; + } + } + + return $pageData; } private function fetchImagesUsingPuppeteer(string $url, string $imageSelector, string $nextButtonSelector): array @@ -162,6 +317,26 @@ class MangaScraperService return json_decode(implode("", $output), true); } + public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array + { + return match ($contentSource->getScrapingType()) { + 'html' => $this->testScrapingHtml($mangaSlug, $chapterNumber, $contentSource), + 'javascript' => $this->testScrapingJavascript($mangaSlug, $chapterNumber, $contentSource), + default => throw new Exception('Unsupported scraping type: ' . $contentSource->getScrapingType()), + }; + } + + /** + * @throws Exception + */ + public function testScrapingJavascript(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array + { + $manga = $this->mangaRepository->findOneBy(['slug' => $mangaSlug]); + $chapter = $manga->getChapterByNumber($chapterNumber); + + return $this->scrapeChapterJavascript($manga, $chapter, $contentSource); + } + /** * @throws GuzzleException */ @@ -231,9 +406,9 @@ class MangaScraperService $pageData = []; foreach ($images as $index => $image) { - if($image->getAttribute('src') === ''){ + if ($image->getAttribute('src') === '') { $imgUrl = $image->getAttribute('data-src'); - }else{ + } else { $imgUrl = $image->getAttribute('src'); } $pageData[] = [ diff --git a/src/Service/Scraper/AbstractScraper.php b/src/Service/Scraper/AbstractScraper.php new file mode 100644 index 0000000..108eaff --- /dev/null +++ b/src/Service/Scraper/AbstractScraper.php @@ -0,0 +1,110 @@ +httpClient = new Client(); + } + + protected function getValidChapterUrl(ContentSource $contentSource, Manga $manga, float $chapterNumber): ?string + { + $slugs = array_merge([$manga->getSlug()], $manga->getAlternativeSlugs() ?? []); + + foreach ($slugs as $slug) { + $url = $contentSource->getChapterUrl($slug, $chapterNumber); + if ($this->isChapterUrlValid($url)) { + return $url; + } + } + + return null; + } + + protected function isChapterUrlValid(string $url): bool + { + try { + $response = $this->httpClient->head($url); + return $response->getStatusCode() === 200; + } catch (RequestException $e) { + return false; + } + } + + protected function generateCbzPath(Manga $manga, Chapter $chapter): string + { + $volumeDir = $this->createDirectories($manga, $chapter->getVolume()); + $fileName = sprintf('%s_vol%d_ch%s.cbz', + $manga->getSlug(), + $chapter->getVolume(), + $chapter->getNumber() + ); + return $volumeDir . '/' . $fileName; + } + + protected function createCbzFile(string $tempDir, array $pageData, string $cbzFilePath): void + { + $zip = new \ZipArchive(); + + if ($zip->open($cbzFilePath, \ZipArchive::CREATE) === TRUE) { + foreach ($pageData as $page) { + $zip->addFile($page['local_image_url'], basename($page['local_image_url'])); + } + $zip->close(); + } + } + + protected function cleanupTempFiles(string $directory): void + { + $files = glob($directory . '/*'); + foreach ($files as $file) { + if (is_file($file)) { + unlink($file); + } + } + rmdir($directory); + } + + protected function createDirectories(Manga $manga, int $volume): string + { + $mangaYear = $manga->getPublicationYear() ?? 'unknown'; + $mangaDir = sprintf('%s/%s (%s)', $this->projectDir . self::PUBLIC_CBZ, ucfirst($manga->getSlug()), $mangaYear); + $volumeDir = sprintf('%s/volume_%d', $mangaDir, sprintf('%02d', $volume)); + + if (!is_dir($volumeDir)) { + mkdir($volumeDir, 0755, true); + } + + return $volumeDir; + } + + protected function cleanImageUrl(string $url): string + { + return preg_replace('/[\x00-\x1F\x7F]/', '', trim($url)); + } + + protected function dispatchProgressEvent(Chapter $chapter, int $currentPage, int $totalPages): void + { + $event = new PageScrappingProgressEvent($chapter->getId(), $currentPage, $totalPages); + $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME); + } +} diff --git a/src/Service/Scraper/HtmlScraper.php b/src/Service/Scraper/HtmlScraper.php new file mode 100644 index 0000000..2cd2f81 --- /dev/null +++ b/src/Service/Scraper/HtmlScraper.php @@ -0,0 +1,197 @@ +client = new Client(); + } + + /** + * @throws Exception + */ + public function scrapeChapter(Chapter $chapter, ContentSource $contentSource): array|bool + { + $manga = $chapter->getManga(); + $chapterUrl = $this->getValidChapterUrl($contentSource, $manga, $chapter->getNumber()); + + if (!$chapterUrl) { + throw new Exception("Aucune URL valide trouvée pour le chapitre {$chapter->getNumber()} du manga {$manga->getTitle()}"); + } + + $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_'); + mkdir($tempDir); + + $pageData = []; + + if ($contentSource->getNextPageSelector() === null) { + // Lecteur vertical + $html = $this->fetchHtml($chapterUrl); + $pageData = $this->scrapeVerticalReader($html, $contentSource); + } else { + // Lecteur horizontal (paginé) + $pageData = $this->scrapeHorizontalReader($chapterUrl, $contentSource); + } + + // Télécharger et sauvegarder les images + foreach ($pageData as $index => &$page) { + $imageName = sprintf('%03d.%s', $index + 1, pathinfo(parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION)); + $imagePath = $tempDir . '/' . $imageName; + + $this->downloadAndSaveImage($page['image_url'], $imagePath); + + $this->dispatchProgressEvent($chapter, $index + 1, count($pageData)); + + $page['local_image_url'] = $imagePath; + } + + $cbzFilePath = $this->generateCbzPath($manga, $chapter); + $this->createCbzFile($tempDir, $pageData, $cbzFilePath); + + $chapter->setCbzPath($cbzFilePath); + $this->entityManager->persist($chapter); + $this->entityManager->flush(); + + // Nettoyage du répertoire temporaire + $this->cleanupTempFiles($tempDir); + + return $pageData; + } + + /** + * @throws Exception + */ + public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array + { + $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber); + + if (!$this->isChapterUrlValid($chapterUrl)) { + throw new \Exception("Invalid URL, check format and slug"); + } + + $html = $this->fetchHtml($chapterUrl); + + if ($contentSource->getNextPageSelector() === null) { + return $this->scrapeVerticalReader($html, $contentSource); + } else { + return $this->scrapeHorizontalReader($chapterUrl, $contentSource); + } + } + + public function supports(string $scrapingType): bool + { + return $scrapingType === 'html'; + } + + private function scrapeVerticalReader(string $html, ContentSource $contentSource): array + { + $crawler = new Crawler($html); + $images = $crawler->filter($contentSource->getImageSelector()); + + $pageData = []; + foreach ($images as $index => $image) { + $imgUrl = $image->getAttribute('src') ?: $image->getAttribute('data-src'); + $pageData[] = [ + 'image_url' => $this->cleanImageUrl($imgUrl), + 'page_number' => $index + 1, + ]; + } + + return $pageData; + } + + private function scrapeHorizontalReader(string $chapterUrl, ContentSource $contentSource): array + { + $pageData = []; + $currentPageUrl = $chapterUrl; + + do { + $html = $this->fetchHtml($currentPageUrl); + $page = $this->extractMangaPageData($html, $contentSource); + + $pageData[] = [ + 'image_url' => $this->cleanImageUrl($page['image_url']), + 'page_number' => count($pageData) + 1, + ]; + + $currentPageUrl = $page['next_page_url']; + } while ($currentPageUrl); + + return $pageData; + } + + private function fetchHtml(string $url): string + { + try { + $response = $this->client->get($url, [ + 'http_errors' => true, + 'allow_redirects' => false + ]); + + $statusCode = $response->getStatusCode(); + + if ($statusCode >= 300 && $statusCode < 400 || $statusCode == 404) { + throw new Exception('Chapter Not Found at ' . $url); + } + + return (string)$response->getBody(); + } catch (Exception $e) { + throw new Exception('Bad Request: ' . $e->getMessage()); + } + } + + private function downloadAndSaveImage(string $imageUrl, string $destinationPath): void + { + try { + $response = $this->client->get($imageUrl); + $contentType = $response->getHeaderLine('Content-Type'); + + if (str_starts_with($contentType, 'image/')) { + file_put_contents($destinationPath, $response->getBody()->getContents()); + } else { + throw new Exception('Le contenu récupéré n\'est pas une image. Type de contenu : ' . $contentType); + } + } catch (Exception $e) { + throw new Exception('Erreur lors de la récupération de l\'image : ' . $e->getMessage()); + } + } + + private function extractMangaPageData(string $html, ContentSource $mangaSource): array + { + $crawler = new Crawler($html); + $imgUrl = $crawler->filter($mangaSource->getImageSelector())->attr('src') + ?? $crawler->filter($mangaSource->getImageSelector())->attr('data-src'); + + $nextLink = $crawler->filter($mangaSource->getNextPageSelector()); + $nextUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null; + + // Convert relative URLs to absolute URLs + if (!preg_match('/^https?:\/\//', $imgUrl)) { + $urlComponents = parse_url($mangaSource->getBaseUrl()); + $scheme = $urlComponents['scheme']; + $host = $urlComponents['host']; + $imgUrl = $scheme . '://' . $host . '/' . ltrim($imgUrl, '/'); + } + + return [ + 'image_url' => $imgUrl, + 'next_page_url' => $nextUrl, + ]; + } +} diff --git a/src/Service/Scraper/JavascriptScraper.php b/src/Service/Scraper/JavascriptScraper.php new file mode 100644 index 0000000..fc11115 --- /dev/null +++ b/src/Service/Scraper/JavascriptScraper.php @@ -0,0 +1,188 @@ +getManga(); + $pantherClient = PantherClient::createChromeClient(); + $chapterUrl = $this->getValidChapterUrl($contentSource, $manga, $chapter->getNumber()); + + if (!$chapterUrl) { + throw new Exception("Aucune URL valide trouvée pour le chapitre {$chapter->getNumber()} du manga {$manga->getTitle()}"); + } + + $pantherClient->request('GET', $chapterUrl); + + try { + $this->selectChapter($pantherClient, $chapter, $contentSource); + + $pageData = $contentSource->getNextPageSelector() === null + ? $this->scrapeVerticalReaderJavascript($pantherClient, $contentSource, $chapter) + : $this->scrapeHorizontalReaderJavascript($pantherClient, $contentSource, $chapter); + + $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_'); + mkdir($tempDir); + + // Télécharger et sauvegarder les images + foreach ($pageData as $index => &$page) { + $imageName = sprintf('%03d.%s', $index + 1, pathinfo(parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION)); + $imagePath = $tempDir . '/' . $imageName; + + file_put_contents($imagePath, file_get_contents($page['image_url'])); + + $page['local_image_url'] = $imagePath; + } + + $cbzFilePath = $this->generateCbzPath($manga, $chapter); + $this->createCbzFile($tempDir, $pageData, $cbzFilePath); + + $chapter->setCbzPath($cbzFilePath); + $this->entityManager->persist($chapter); + $this->entityManager->flush(); + + $this->cleanupTempFiles($tempDir); + + return $pageData; + } catch (Exception $e) { + // Log the error + return false; + } finally { + $pantherClient->close(); + } + } + + public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array + { + $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber); + + if (!$this->isChapterUrlValid($chapterUrl)) { + throw new \Exception("Invalid URL, check format and slug"); + } + + $pantherClient = PantherClient::createChromeClient(); + $pantherClient->request('GET', $chapterUrl); + + try { + $chapter = new Chapter(); + $chapter->setNumber((float)$chapterNumber); + + $this->selectChapter($pantherClient, $chapter, $contentSource); + + return $contentSource->getNextPageSelector() === null + ? $this->scrapeVerticalReaderJavascript($pantherClient, $contentSource, $chapter) + : $this->scrapeHorizontalReaderJavascript($pantherClient, $contentSource, $chapter); + } catch (Exception $e) { + throw $e; + } finally { + $pantherClient->close(); + } + } + + public function supports(string $scrapingType): bool + { + return $scrapingType === 'javascript'; + } + + private function selectChapter(PantherClient $pantherClient, Chapter $chapter, ContentSource $contentSource): void + { + $chapterSelector = $contentSource->getChapterSelector(); + if (!$chapterSelector) { + return; // Si aucun sélecteur n'est défini, on ne fait rien + } + + $crawler = $pantherClient->waitFor($chapterSelector); + $select = $crawler->filter($chapterSelector); + + if ($select->count() > 0) { + $chapterNumber = $chapter->getNumber(); + $options = $select->filter('option'); + $targetIndex = null; + + foreach ($options as $index => $option) { + if (preg_match("/\b{$chapterNumber}\b/", $option->getText())) { + $targetIndex = $index; + break; + } + } + + if ($targetIndex !== null) { + $pantherClient->executeScript(" + var select = document.querySelector('$chapterSelector'); + select.selectedIndex = $targetIndex; + select.dispatchEvent(new Event('change')); + "); + + $this->waitForImagesLoaded($pantherClient, $contentSource); + } else { + throw new Exception("Chapitre $chapterNumber non trouvé dans le menu déroulant"); + } + } + } + + private function waitForImagesLoaded(PantherClient $pantherClient, ContentSource $contentSource): void + { + $imageSelector = $contentSource->getImageSelector(); + $pantherClient->wait(30)->until( + function ($driver) use ($imageSelector) { + return $driver->executeScript(" + return new Promise((resolve) => { + let lastImageCount = 0; + let stableCount = 0; + const stableThreshold = 10; + + function checkImages() { + const images = document.querySelectorAll('$imageSelector'); + const loadedImages = Array.from(images).filter(img => img.complete && img.naturalWidth > 0); + + if (loadedImages.length === lastImageCount) { + stableCount++; + } else { + stableCount = 0; + lastImageCount = loadedImages.length; + } + + if (stableCount >= stableThreshold) { + resolve(true); + } else { + setTimeout(checkImages, 200); + } + } + + checkImages(); + }); + "); + } + ); + } + + private function scrapeVerticalReaderJavascript(PantherClient $pantherClient, ContentSource $contentSource, Chapter $chapter): array + { + $pageData = []; + $crawler = $pantherClient->waitFor($contentSource->getImageSelector()); + $images = $crawler->filter($contentSource->getImageSelector()); + + foreach ($images as $index => $image) { + $imageUrl = $image->getAttribute('src') ?: $image->getAttribute('data-src'); + $pageData[] = [ + 'image_url' => $this->cleanImageUrl($imageUrl), + 'page_number' => $index + 1, + ]; + } + + return $pageData; + } + + private function scrapeHorizontalReaderJavascript(PantherClient $pantherClient, ContentSource $contentSource, Chapter $chapter): array + { + $pageData = []; + return $pageData; + } +} diff --git a/src/Service/Scraper/MangaScraperService.php b/src/Service/Scraper/MangaScraperService.php new file mode 100644 index 0000000..a99c465 --- /dev/null +++ b/src/Service/Scraper/MangaScraperService.php @@ -0,0 +1,28 @@ +scraperFactory = $scraperFactory; + } + + public function scrapeChapter(Chapter $chapter, ContentSource $contentSource): array|bool + { + $scraper = $this->scraperFactory->createScraper($contentSource); + return $scraper->scrapeChapter($chapter, $contentSource); + } + + public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array + { + $scraper = $this->scraperFactory->createScraper($contentSource); + return $scraper->testScraping($mangaSlug, $chapterNumber, $contentSource); + } +} diff --git a/src/Service/Scraper/MangadexScraper.php b/src/Service/Scraper/MangadexScraper.php new file mode 100644 index 0000000..d60d75a --- /dev/null +++ b/src/Service/Scraper/MangadexScraper.php @@ -0,0 +1,89 @@ +client = new Client(); + } + + public function scrapeChapter(Chapter $chapter, ContentSource $contentSource): array|bool + { + $chapterUrl = $contentSource->getBaseUrl() . sprintf($contentSource->getChapterUrlFormat(), $chapter->getExternalId()); + $manga = $chapter->getManga(); + $pageData = []; + + try { + $response = $this->client->get($chapterUrl); + $results = json_decode($response->getBody()->getContents(), true); + + if ($results['result'] !== 'ok' || count($results['chapter']['dataSaver']) === 0) { + throw new \Exception('Error while fetching chapter data from Mangadex ' . $manga->getTitle() . ' ' . $chapter->getNumber()); + } + + $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_'); + mkdir($tempDir); + + foreach ($results['chapter']['dataSaver'] as $index => $page) { + $pageUrl = $results['baseUrl'] . '/data-saver/' . $results['chapter']['hash'] . '/' . $page; + $imagePath = $tempDir . '/' . sprintf('%03d.%s', $index + 1, pathinfo($page, PATHINFO_EXTENSION)); + + $this->downloadAndSaveImage($pageUrl, $imagePath); + + $this->dispatchProgressEvent($chapter, $index + 1, count($results['chapter']['dataSaver'])); + + $pageData[] = [ + 'image_url' => $pageUrl, + 'local_image_url' => $imagePath, + 'page_number' => $index + 1, + ]; + } + + $cbzFilePath = $this->generateCbzPath($manga, $chapter); + $this->createCbzFile($tempDir, $pageData, $cbzFilePath); + + $chapter->setCbzPath($cbzFilePath); + $this->entityManager->persist($chapter); + $this->entityManager->flush(); + + $this->cleanupTempFiles($tempDir); + + return $pageData; + } catch (\Exception $e) { + // Log the error + return false; + } + } + + public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array + { + // For Mangadex, we need the chapter's external ID, which we don't have in this context. + // We could potentially fetch it first, but for simplicity, let's return an empty array. + return []; + } + + public function supports(string $scrapingType): bool + { + return $scrapingType === 'mangadex'; + } + + private function downloadAndSaveImage(string $imageUrl, string $destinationPath): void + { + $response = $this->client->get($imageUrl); + file_put_contents($destinationPath, $response->getBody()->getContents()); + } +} diff --git a/src/Service/Scraper/ScraperFactory.php b/src/Service/Scraper/ScraperFactory.php new file mode 100644 index 0000000..d7741c9 --- /dev/null +++ b/src/Service/Scraper/ScraperFactory.php @@ -0,0 +1,25 @@ +scrapers = iterator_to_array($scrapers); + } + + public function createScraper(ContentSource $contentSource): ScraperInterface + { + foreach ($this->scrapers as $scraper) { + if ($scraper->supports($contentSource->getScrapingType())) { + return $scraper; + } + } + throw new \InvalidArgumentException('Unsupported scraping type: ' . $contentSource->getScrapingType()); + } +} diff --git a/src/Service/Scraper/ScraperInterface.php b/src/Service/Scraper/ScraperInterface.php new file mode 100644 index 0000000..3cf27ed --- /dev/null +++ b/src/Service/Scraper/ScraperInterface.php @@ -0,0 +1,13 @@ +​ {# Modal panel #} -
+