Files
Mangarr/src/Service/MangaScraperService.php
ext.jeremy.guillot@maxicoffee.domains 7506a7a3c1 style: apply php-cs-fixer formatting (PSR-12)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-09 20:46:59 +01:00

626 lines
23 KiB
PHP

<?php
namespace App\Service;
use App\Entity\Chapter;
use App\Entity\Manga;
use App\Entity\ContentSource;
use App\Event\PageScrappingProgressEvent;
use App\Repository\ChapterRepository;
use App\Repository\MangaRepository;
use Doctrine\ORM\EntityManagerInterface;
use Exception;
use Facebook\WebDriver\Remote\RemoteWebElement;
use Facebook\WebDriver\WebDriverExpectedCondition;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use GuzzleHttp\Exception\RequestException;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\Routing\Matcher\UrlMatcher;
use Symfony\Component\Routing\RequestContext;
use Symfony\Component\Routing\Route;
use Symfony\Component\Routing\RouteCollection;
use Symfony\Contracts\EventDispatcher\EventDispatcherInterface;
use Symfony\Component\Panther\Client as PantherClient;
class MangaScraperService
{
/** Path, appended to the project directory, under which the per-manga CBZ folders are created. */
public const string PUBLIC_CBZ = '/public/cbz';
/**
 * @param string                   $projectDir      Base directory; PUBLIC_CBZ is appended to it to build archive paths
 * @param EventDispatcherInterface $eventDispatcher Receives a PageScrappingProgressEvent for each processed page
 * @param EntityManagerInterface   $entityManager   Persists and flushes a chapter once its CBZ path has been set
 * @param MangaRepository          $mangaRepository Used to look a manga up by slug when test-scraping
 */
public function __construct(
private readonly string $projectDir,
private readonly EventDispatcherInterface $eventDispatcher,
private readonly EntityManagerInterface $entityManager,
private readonly MangaRepository $mangaRepository,
) {
}
/**
 * Extracts the page image URL and the "next page" link from one page of a paginated reader.
 *
 * The image URL is read from the "src" attribute of the first node matching the
 * source's image selector, falling back to "data-src" when "src" is missing or empty.
 * Image URLs that are not absolute are resolved against the scheme and host of the
 * source's base URL. The next-page URL is returned exactly as found in the "href"
 * attribute (it is NOT made absolute here).
 *
 * @param string        $html        Raw HTML of the reader page
 * @param ContentSource $mangaSource Supplies the image selector, next-page selector and base URL
 *
 * @return array{image_url: string, next_page_url: ?string}
 *
 * @throws Exception When no image URL can be found, or a relative URL cannot be resolved
 */
private function extractMangaPageData(string $html, ContentSource $mangaSource): array
{
    $crawler = new Crawler($html);

    // Crawler::attr() throws on an empty node list, so check the match first and
    // fail with a message that names the actual problem.
    $imageNode = $crawler->filter($mangaSource->getImageSelector());
    if ($imageNode->count() === 0) {
        throw new Exception('No valid image found on the page.');
    }

    // An empty "src" must fall back to "data-src" too, not only a missing one.
    $imgUrl = trim((string) $imageNode->attr('src'));
    if ($imgUrl === '') {
        $imgUrl = trim((string) $imageNode->attr('data-src'));
    }
    if ($imgUrl === '') {
        throw new Exception('No valid image found on the page.');
    }

    $nextLink = $crawler->filter($mangaSource->getNextPageSelector());
    $nextUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;

    // Convert relative URLs to absolute URLs
    if (!preg_match('/^https?:\/\//i', $imgUrl)) {
        $urlComponents = parse_url($mangaSource->getBaseUrl());
        if (!is_array($urlComponents) || !isset($urlComponents['scheme'], $urlComponents['host'])) {
            throw new Exception('Cannot resolve relative image URL: the source base URL has no scheme or host.');
        }

        if (str_starts_with($imgUrl, '//')) {
            // Protocol-relative URL: it already carries its own host, only the scheme is missing.
            $imgUrl = $urlComponents['scheme'] . ':' . $imgUrl;
        } else {
            $imgUrl = $urlComponents['scheme'] . '://' . $urlComponents['host'] . '/' . ltrim($imgUrl, '/');
        }
    }

    return [
        'image_url' => $imgUrl,
        'next_page_url' => $nextUrl,
    ];
}
/**
 * Scrapes every chapter of a manga, one after the other.
 *
 * Chapters whose scraping result is false are left out of the returned array.
 *
 * @return array Scraping results keyed by chapter number
 *
 * @throws GuzzleException
 */
public function scrapeManga(Manga $manga, ContentSource $mangaSource): array
{
    $resultsByChapter = [];

    foreach ($manga->getChapters() as $currentChapter) {
        $scraped = $this->scrapeChapter($currentChapter, $mangaSource);
        if ($scraped === false) {
            continue;
        }

        $resultsByChapter[$currentChapter->getNumber()] = $scraped;
    }

    return $resultsByChapter;
}
/**
 * Scrapes a single chapter using the strategy that matches the source's scraping type.
 *
 * Supported types are 'html', 'javascript' and 'mangadex'.
 *
 * @return array|bool Whatever the selected strategy returns (page data, true or false)
 *
 * @throws GuzzleException
 * @throws Exception When the scraping type is not supported
 */
public function scrapeChapter(Chapter $chapter, ContentSource $mangaSource): array|bool
{
    return match ($mangaSource->getScrapingType()) {
        'html' => $this->scrapeChapterHtml($chapter->getManga(), $chapter, $mangaSource),
        // Spelled exactly like the method declaration; the previous "JavaScript" casing
        // only worked because PHP resolves method names case-insensitively.
        'javascript' => $this->scrapeChapterJavascript($chapter->getManga(), $chapter, $mangaSource),
        'mangadex' => $this->scrapeChapterMangadex($chapter, $mangaSource),
        default => throw new Exception('Unsupported scraping type: ' . $mangaSource->getScrapingType()),
    };
}
/**
 * Downloads a chapter through the Mangadex JSON endpoint and packs it into a CBZ archive.
 *
 * The endpoint URL is the source's base URL followed by its chapter URL format filled
 * with the chapter's external id. Every entry of the response's "dataSaver" list is
 * downloaded into a temporary directory, a progress event is dispatched per page, the
 * archive is built, and the chapter is persisted with its new CBZ path.
 *
 * @return bool Always true; failures are reported through exceptions
 *
 * @throws GuzzleException
 * @throws Exception When the response is not usable or the temporary directory cannot be created
 */
private function scrapeChapterMangadex(Chapter $chapter, ContentSource $mangaSource): bool
{
    $client = new Client();
    $chapterUrl = $mangaSource->getBaseUrl() . sprintf($mangaSource->getChapterUrlFormat(), $chapter->getExternalId());
    $manga = $chapter->getManga();

    $response = $client->get($chapterUrl);
    $results = json_decode($response->getBody()->getContents(), true);

    // Validate the whole shape up front: a non-JSON body or a missing key must end in
    // the explicit exception below, not in a TypeError from count() on null.
    $pages = is_array($results) ? ($results['chapter']['dataSaver'] ?? null) : null;
    $isUsable = is_array($pages)
        && $pages !== []
        && ($results['result'] ?? null) === 'ok'
        && isset($results['baseUrl'], $results['chapter']['hash']);
    if (!$isUsable) {
        throw new Exception('Error while fetching chapter data from Mangadex ' . $manga->getTitle() . ' ' . $chapter->getNumber());
    }

    $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
    if (!mkdir($tempDir) && !is_dir($tempDir)) {
        throw new Exception('Unable to create temporary directory ' . $tempDir);
    }

    try {
        $pages = array_values($pages);
        $pageCount = count($pages);
        $pageData = [];

        foreach ($pages as $index => $page) {
            $pageNumber = $index + 1;
            $pageUrl = $results['baseUrl'] . '/data-saver/' . $results['chapter']['hash'] . '/' . $page;
            $imagePath = $tempDir . '/' . sprintf('%03d.%s', $pageNumber, pathinfo($page, PATHINFO_EXTENSION));
            $this->downloadAndSaveImage($pageUrl, $imagePath);

            $event = new PageScrappingProgressEvent($chapter->getId(), $pageNumber, $pageCount);
            $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);

            $pageData[] = [
                'image_url' => $pageUrl,
                'local_image_url' => $imagePath,
                'page_number' => $pageNumber,
            ];
        }

        $cbzFilePath = $this->generateCbzPath($manga, $chapter);
        $this->createCbzFile($tempDir, $pageData, $cbzFilePath);

        $chapter->setCbzPath($cbzFilePath);
        $this->entityManager->persist($chapter);
        $this->entityManager->flush();
    } finally {
        // Clean up the temporary directory even when a download or the archive step fails.
        $this->cleanupTempFiles($tempDir);
    }

    return true;
}
/**
 * Scrapes a chapter from a JavaScript-rendered reader using a Chrome browser driven by Panther.
 *
 * When the page contains a "#selectChapitres" dropdown, the option whose text contains the
 * chapter number is selected and the method waits until the images inside "#scansPlacement"
 * are loaded. The pages are then collected with the vertical reader strategy when the source
 * has no next-page selector, and with the horizontal (paginated) strategy otherwise.
 *
 * @return array|bool Page data (image_url / page_number entries), or false when the chapter
 *                    could not be selected in the dropdown
 *
 * @throws \Exception When collecting the pages fails
 */
private function scrapeChapterJavascript(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
{
    $pantherClient = PantherClient::createChromeClient();

    // Single try/finally so the browser is released on every path, including a failing
    // initial request.
    try {
        $chapterUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapter->getNumber());
        $pantherClient->request('GET', $chapterUrl);

        // Select the chapter in the dropdown menu, when the reader has one.
        try {
            $crawler = $pantherClient->waitFor('body');
            $select = $crawler->filter('#selectChapitres');

            if ($select->count() > 0) {
                $chapterNumber = $chapter->getNumber();
                // Quote the number: in "10.5" the dot must match a literal dot only.
                $numberPattern = '/\b' . preg_quote((string) $chapterNumber, '/') . '\b/';

                $targetIndex = null;
                /** @var RemoteWebElement $option */
                foreach ($select->filter('option')->getIterator() as $index => $option) {
                    if (preg_match($numberPattern, $option->getText())) {
                        $targetIndex = $index;
                        break;
                    }
                }

                if ($targetIndex === null) {
                    throw new \Exception("Chapitre $chapterNumber non trouvé dans le menu déroulant");
                }

                $pantherClient->executeScript(<<<JS
                    var select = document.querySelector('#selectChapitres');
                    select.selectedIndex = $targetIndex;
                    select.dispatchEvent(new Event('change'));
                    JS);

                // Wait for the page to update after the selection. The timeout argument of
                // wait() is expressed in seconds: 60 here, matching the intended one minute.
                $pantherClient->wait(60)->until(
                    static function ($driver) {
                        return $driver->executeScript(<<<'JS'
                            var scansPlacement = document.querySelector('#scansPlacement');
                            if (!scansPlacement) return false;
                            var lazyImages = scansPlacement.querySelectorAll('img.lazy');
                            var loadingGif = scansPlacement.querySelector('img[src*="loading_scans.gif"]');
                            // Vérifier que toutes les images lazy sont chargées et que le GIF de chargement n'est plus présent
                            var allImagesLoaded = Array.from(lazyImages).every(img => img.complete && img.naturalWidth > 0);
                            return lazyImages.length > 0 && allImagesLoaded && !loadingGif;
                            JS);
                    }
                );
            }
        } catch (\Exception $e) {
            // Any failure while selecting the chapter is reported to the caller as "false".
            return false;
        }

        if ($mangaSource->getNextPageSelector() === null) {
            // Vertical reader
            return $this->scrapeVerticalReaderJavascript($pantherClient, $mangaSource, $chapter);
        }

        // Horizontal reader
        return $this->scrapeHorizontalReaderJavascript($pantherClient, $mangaSource, $chapter);
    } finally {
        $pantherClient->close();
    }
}
/**
 * Collects the image URLs of a vertical (single page) reader rendered in the browser.
 *
 * Waits for the source's image selector, then reads "src" (or "data-src" when "src" is
 * empty) from every matching element. One progress event is dispatched per image.
 *
 * @return array List of ['image_url' => string, 'page_number' => int] entries, numbered from 1
 */
private function scrapeVerticalReaderJavascript(PantherClient $pantherClient, ContentSource $mangaSource, Chapter $chapter): array
{
    $selector = $mangaSource->getImageSelector();
    $imageNodes = $pantherClient->waitFor($selector)->filter($selector);
    $total = $imageNodes->count();

    $collected = [];
    $position = 0;

    foreach ($imageNodes->getIterator() as $node) {
        $position++;

        $source = $node->getAttribute('src');
        if (!$source) {
            $source = $node->getAttribute('data-src');
        }

        $collected[] = [
            'image_url' => $this->cleanImageUrl($source),
            'page_number' => $position,
        ];

        $this->eventDispatcher->dispatch(
            new PageScrappingProgressEvent($chapter->getId(), $position, $total),
            PageScrappingProgressEvent::NAME
        );
    }

    return $collected;
}
/**
 * Collects the image URLs of a paginated reader rendered in the browser.
 *
 * On each iteration the method waits for the source's image selector, records the image
 * URL ("src", or "data-src" when "src" is empty), dispatches a progress event and clicks
 * the element matching the source's next-page selector. It stops when no image or no
 * next-page element is found.
 *
 * NOTE(review): if a reader keeps its next-page element on the last page, this loop has no
 * other stop condition — confirm against the targeted sites.
 *
 * @return array List of ['image_url' => string, 'page_number' => int] entries, numbered from 1
 *
 * @throws \Exception When the browser interaction fails (for example a wait times out)
 */
private function scrapeHorizontalReaderJavascript(PantherClient $pantherClient, ContentSource $mangaSource, Chapter $chapter): array
{
    $pageData = [];
    $pageNumber = 1;

    while (true) {
        // waitFor() hands back a crawler for the page as it is now, so the same instance
        // serves both the image and the next-page lookups below.
        $crawler = $pantherClient->waitFor($mangaSource->getImageSelector());

        $imageElement = $crawler->filter($mangaSource->getImageSelector())->first();
        if ($imageElement->count() === 0) {
            break; // End of the chapter
        }

        $imageUrl = $imageElement->attr('src') ?: $imageElement->attr('data-src');
        $pageData[] = [
            'image_url' => $this->cleanImageUrl($imageUrl),
            'page_number' => $pageNumber,
        ];

        // The total is passed as 0 — presumably because the page count is not known up front.
        $event = new PageScrappingProgressEvent($chapter->getId(), $pageNumber, 0);
        $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);

        // Move on to the next page. The lookup previously went through a variable that was
        // still undefined on the first iteration, which made the very first page fail.
        $nextButton = $crawler->filter($mangaSource->getNextPageSelector());
        if ($nextButton->count() === 0) {
            break; // No next button: end of the chapter
        }
        $nextButton->click();

        // Wait (up to 10 seconds) for the image selector to be present after the click.
        $pantherClient->waitFor($mangaSource->getImageSelector(), 10);

        $pageNumber++;
    }

    return $pageData;
}
/**
 * Prepares the call to the external Puppeteer script and decodes its JSON output.
 *
 * Execution of the script is currently disabled (the exec() call is commented out), so no
 * output is ever collected and the method returns an empty array.
 *
 * @return array Decoded JSON output of the script, or an empty array when there is none
 */
private function fetchImagesUsingPuppeteer(string $url, string $imageSelector, string $nextButtonSelector): array
{
    $output = [];

    // Every argument is shell-escaped: URLs and selectors come from source configuration
    // and may contain quotes or shell metacharacters. stderr is redirected to stdout.
    $command = sprintf(
        'node puppeteer-script.js %s %s %s 2>&1',
        escapeshellarg($url),
        escapeshellarg($imageSelector),
        escapeshellarg($nextButtonSelector)
    );
    // exec($command, $output, $return_var);

    // Convert the JSON output into a PHP array. json_decode() yields null for empty or
    // invalid input, which must not leak through the "array" return type.
    $decoded = json_decode(implode('', $output), true);

    return is_array($decoded) ? $decoded : [];
}
/**
 * Runs a test scrape of one chapter with the strategy matching the source's scraping type.
 *
 * Only the 'html' and 'javascript' types are handled here.
 *
 * @return array Page data returned by the selected strategy
 *
 * @throws GuzzleException
 * @throws Exception When the scraping type is not supported
 */
public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
{
    $scrapingType = $contentSource->getScrapingType();

    if ($scrapingType === 'html') {
        return $this->testScrapingHtml($mangaSlug, $chapterNumber, $contentSource);
    }

    if ($scrapingType === 'javascript') {
        return $this->testScrapingJavascript($mangaSlug, $chapterNumber, $contentSource);
    }

    throw new Exception('Unsupported scraping type: ' . $contentSource->getScrapingType());
}
/**
 * Runs a test scrape of one chapter through the browser-based (JavaScript) strategy.
 *
 * The manga is looked up by slug and the chapter by its number before scraping.
 *
 * @return array Page data (image_url / page_number entries)
 *
 * @throws Exception When the manga or the chapter does not exist, or when the chapter
 *                   could not be scraped
 */
public function testScrapingJavascript(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
{
    $manga = $this->mangaRepository->findOneBy(['slug' => $mangaSlug]);
    if (!$manga instanceof Manga) {
        throw new Exception(sprintf('Manga "%s" not found', $mangaSlug));
    }

    $chapter = $manga->getChapterByNumber($chapterNumber);
    if (!$chapter instanceof Chapter) {
        throw new Exception(sprintf('Chapter %s of manga "%s" not found', $chapterNumber, $mangaSlug));
    }

    // The scraper signals a failed chapter selection with "false", which cannot be returned
    // through this method's "array" return type; report it as an exception instead.
    $pageData = $this->scrapeChapterJavascript($manga, $chapter, $contentSource);
    if (!is_array($pageData)) {
        throw new Exception(sprintf('Unable to scrape chapter %s of manga "%s"', $chapterNumber, $mangaSlug));
    }

    return $pageData;
}
/**
 * Runs a test scrape of one chapter through the plain HTML strategy, without downloading
 * any image.
 *
 * Sources without a next-page selector are treated as vertical readers (all images on one
 * page); the others are walked page by page.
 *
 * @return array List of ['image_url' => string, 'page_number' => int] entries
 *
 * @throws GuzzleException
 * @throws Exception When a page cannot be fetched
 */
public function testScrapingHtml(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
{
    $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber);

    if ($contentSource->getNextPageSelector() === null) {
        // Only the vertical reader works from pre-fetched HTML. The paginated reader
        // fetches every page itself, so fetching here as well would download the first
        // page twice.
        return $this->scrapeVerticalReader($this->fetchHtml($chapterUrl), $contentSource);
    }

    return $this->scrapeHorizontalReader($chapterUrl, $contentSource);
}
/**
 * Scrapes a chapter from plain HTML pages, downloads its images and packs them into a CBZ.
 *
 * Page URLs are collected with the vertical strategy when the source has no next-page
 * selector and with the paginated strategy otherwise. Each image is downloaded into a
 * temporary directory (a progress event is dispatched per image), the archive is created,
 * and the chapter is persisted with its new CBZ path.
 *
 * @return array|bool Page data; each entry holds image_url, page_number and local_image_url
 *                    (the local path points into a temporary directory that is removed
 *                    before this method returns)
 *
 * @throws GuzzleException
 * @throws Exception When a page or image cannot be fetched, or the temporary directory
 *                   cannot be created
 */
private function scrapeChapterHtml(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
{
    $chapterUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapter->getNumber());

    $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
    if (!mkdir($tempDir) && !is_dir($tempDir)) {
        throw new Exception('Unable to create temporary directory ' . $tempDir);
    }

    try {
        if ($mangaSource->getNextPageSelector() === null) {
            // Vertical reader
            $pageData = $this->scrapeVerticalReader($this->fetchHtml($chapterUrl), $mangaSource);
        } else {
            // Horizontal (paginated) reader
            $pageData = $this->scrapeHorizontalReader($chapterUrl, $mangaSource);
        }

        // Download and save the images. Entries are updated through their key rather than
        // a by-reference loop variable, which would stay bound to the last element.
        $pageCount = count($pageData);
        foreach ($pageData as $index => $page) {
            $extension = pathinfo((string) parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION);
            $imagePath = $tempDir . '/' . sprintf('%03d.%s', $index + 1, $extension);
            $this->downloadAndSaveImage($page['image_url'], $imagePath);

            $event = new PageScrappingProgressEvent($chapter->getId(), $index + 1, $pageCount);
            $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);

            $pageData[$index]['local_image_url'] = $imagePath;
        }

        $cbzFilePath = $this->generateCbzPath($manga, $chapter);
        $this->createCbzFile($tempDir, $pageData, $cbzFilePath);

        $chapter->setCbzPath($cbzFilePath);
        $this->entityManager->persist($chapter);
        $this->entityManager->flush();
    } finally {
        // Clean up the temporary directory even when scraping or a download fails.
        $this->cleanupTempFiles($tempDir);
    }

    return $pageData;
}
/**
 * Lists the images of a vertical reader, where the whole chapter sits on one HTML page.
 *
 * For every node matching the source's image selector the URL is taken from "src", or
 * from "data-src" when "src" is an empty string.
 *
 * @return array List of ['image_url' => string, 'page_number' => int] entries, numbered from 1
 */
private function scrapeVerticalReader(string $html, ContentSource $contentSource): array
{
    $imageNodes = (new Crawler($html))->filter($contentSource->getImageSelector());

    $pages = [];
    foreach ($imageNodes as $position => $node) {
        $source = $node->getAttribute('src');
        $rawUrl = $source === '' ? $node->getAttribute('data-src') : $source;

        $pages[] = [
            'image_url' => $this->cleanImageUrl($rawUrl),
            'page_number' => $position + 1,
        ];
    }

    return $pages;
}
/**
 * Walks a paginated reader page by page and lists the image found on each page.
 *
 * Starting from the chapter URL, each page is fetched, its image URL is recorded and its
 * "next page" link is followed until a page has no such link.
 *
 * @return array List of ['image_url' => string, 'page_number' => int] entries, numbered from 1
 *
 * @throws GuzzleException
 * @throws Exception When a page cannot be fetched or holds no image
 */
private function scrapeHorizontalReader(string $chapterUrl, ContentSource $contentSource): array
{
    $pageData = [];
    $visitedUrls = [];
    $currentPageUrl = $chapterUrl;

    do {
        $visitedUrls[$currentPageUrl] = true;

        $html = $this->fetchHtml($currentPageUrl);
        $page = $this->extractMangaPageData($html, $contentSource);

        $pageData[] = [
            'image_url' => $this->cleanImageUrl($page['image_url']),
            'page_number' => count($pageData) + 1,
        ];

        $currentPageUrl = $page['next_page_url'];
        // Stop on a URL that was already fetched: next links forming a cycle (for example a
        // last page linking back to itself) would otherwise keep this loop running forever.
    } while ($currentPageUrl && !isset($visitedUrls[$currentPageUrl]));

    return $pageData;
}
/**
 * Downloads one image into the temporary directory and appends its entry to the page list.
 *
 * The local file is named after the 1-based page number, zero-padded to three digits, and
 * keeps the extension found in the URL path.
 *
 * @param array $pageData Page list, extended in place with image_url, local_image_url and page_number
 * @param int   $index    Zero-based position of the image within the chapter
 *
 * @throws GuzzleException
 */
private function processImage(string $imgUrl, string $tempDir, array &$pageData, int $index, Chapter $chapter): void
{
    $pageNumber = $index + 1;
    $cleanUrl = $this->cleanImageUrl($imgUrl);

    $extension = pathinfo(parse_url($cleanUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
    $localPath = $tempDir . '/' . sprintf('%03d.%s', $pageNumber, $extension);

    $this->downloadAndSaveImage($cleanUrl, $localPath);

    $pageData[] = [
        'image_url' => $cleanUrl,
        'local_image_url' => $localPath,
        'page_number' => $pageNumber,
    ];
}
/**
 * Normalises an image URL: surrounding whitespace is trimmed, then every ASCII control
 * character (0x00-0x1F and 0x7F) is removed.
 */
private function cleanImageUrl(string $url): string
{
    $trimmed = trim($url);

    return preg_replace('/[\x00-\x1F\x7F]/', '', $trimmed);
}
/**
 * Fetches a page and returns its body as a string.
 *
 * Redirects are not followed: a 3xx answer is treated as a missing chapter. HTTP error
 * statuses (4xx/5xx) and transport failures are raised by the HTTP client and re-thrown
 * as a plain Exception that keeps the original one as its "previous" exception.
 *
 * @throws Exception When the request fails or the server answers with a redirect
 */
private function fetchHtml(string $url): string
{
    $client = new Client();

    // Only the request itself is guarded, so the redirect exception thrown below is no
    // longer caught here and mislabelled as a "Bad Request".
    try {
        $response = $client->get($url, [
            'http_errors' => true,
            'allow_redirects' => false,
        ]);
    } catch (GuzzleException $e) {
        throw new Exception('Bad Request: ' . $e->getMessage(), previous: $e);
    }

    // With http_errors enabled a 404 never reaches this point (the client throws first),
    // so only the redirect range needs an explicit check.
    $statusCode = $response->getStatusCode();
    if ($statusCode >= 300 && $statusCode < 400) {
        throw new Exception('Chapter Not Found at ' . $url);
    }

    return (string) $response->getBody();
}
/**
 * Downloads an image and writes it to the given path.
 *
 * The response is accepted only when its Content-Type header starts with "image/".
 * Reporting of download results through sendReport() is not wired in here.
 *
 * @throws GuzzleException For client failures that are not request exceptions
 * @throws \Exception      When the request fails, the content is not an image, or the file
 *                         cannot be written
 */
private function downloadAndSaveImage(string $imageUrl, string $destinationPath): void
{
    $client = new Client();

    try {
        $response = $client->get($imageUrl);
    } catch (RequestException $e) {
        throw new \Exception('Erreur lors de la récupération de l\'image : ' . $e->getMessage(), previous: $e);
    }

    $contentType = $response->getHeaderLine('Content-Type');
    if (!str_starts_with($contentType, 'image/')) {
        throw new \Exception('Le contenu récupéré n\'est pas une image. Type de contenu : ' . $contentType);
    }

    // file_put_contents() returns false on failure; without this check a missing page
    // would only surface later, as an incomplete archive.
    if (file_put_contents($destinationPath, $response->getBody()->getContents()) === false) {
        throw new \Exception('Unable to write image to ' . $destinationPath);
    }
}
/**
 * Tells whether the "next page" link of a chapter page still points into the given chapter.
 *
 * The path of the link is matched against the route "/scan-{manga}/{chapter}/{page}" and
 * its {chapter} segment is compared, as a float, with the expected chapter number. A page
 * without a next-page link is reported as not available.
 *
 * @throws GuzzleException
 */
private function isChapterAvailable(string $chapterUrl, float $chapterNumber, ContentSource $mangaSource): bool
{
    $page = new Crawler($this->fetchHtml($chapterUrl));

    $link = $page->filter($mangaSource->getNextPageSelector());
    if ($link->count() === 0) {
        return false;
    }
    $linkPath = parse_url($link->attr('href'), PHP_URL_PATH);

    $routes = new RouteCollection();
    $routes->add('manga_chapter', new Route('/scan-{manga}/{chapter}/{page}'));
    $urlMatcher = new UrlMatcher($routes, new RequestContext('/'));

    $matched = $urlMatcher->match($linkPath);

    return (float) $matched['chapter'] === $chapterNumber;
}
/**
 * Posts the outcome of an image download to the report endpoint
 * (https://api.mangadex.network/report) as a JSON document.
 *
 * @param string $imageUrl URL of the image that was requested
 * @param bool   $success  Sent as the "success" field
 * @param bool   $cached   Sent as the "cached" field
 * @param int    $bytes    Sent as the "bytes" field
 * @param float  $duration Sent as the "duration" field
 *
 * @throws \Exception When the report request fails
 */
private function sendReport(string $imageUrl, bool $success, bool $cached, int $bytes, float $duration): void
{
    $requestOptions = [
        'headers' => [
            'Content-Type' => 'application/json',
        ],
        'json' => [
            'url' => $imageUrl,
            'success' => $success,
            'cached' => $cached,
            'bytes' => $bytes,
            'duration' => $duration,
        ],
    ];

    $httpClient = new Client();

    try {
        $httpClient->post('https://api.mangadex.network/report', $requestOptions);
    } catch (RequestException $failure) {
        // Request failures are surfaced to the caller as a plain exception.
        throw new \Exception('Erreur lors de l\'envoi du rapport : ' . $failure->getMessage());
    }
}
/**
 * Packs the downloaded page images into a CBZ (zip) archive.
 *
 * Each file is stored at the root of the archive under its base name. An archive that
 * already exists at the target path is replaced, so pages from an earlier run cannot
 * linger in it.
 *
 * @param string $tempDir     Directory holding the images (not read by this method)
 * @param array  $pageData    Page entries; each must provide a 'local_image_url' path
 * @param string $cbzFilePath Path of the archive to write
 *
 * @throws \Exception When the archive cannot be opened, a page cannot be added, or the
 *                    archive cannot be written
 */
private function createCbzFile(string $tempDir, array $pageData, string $cbzFilePath): void
{
    $zip = new \ZipArchive();

    // open() returns true on success and an integer error code otherwise.
    $status = $zip->open($cbzFilePath, \ZipArchive::CREATE | \ZipArchive::OVERWRITE);
    if ($status !== true) {
        throw new \Exception(sprintf('Unable to create CBZ archive %s (ZipArchive error code %s)', $cbzFilePath, var_export($status, true)));
    }

    foreach ($pageData as $page) {
        if (!$zip->addFile($page['local_image_url'], basename($page['local_image_url']))) {
            $zip->close();
            throw new \Exception('Unable to add ' . $page['local_image_url'] . ' to CBZ archive ' . $cbzFilePath);
        }
    }

    // The archive is only written to disk by close(), so its result matters as well.
    if (!$zip->close()) {
        throw new \Exception('Unable to write CBZ archive ' . $cbzFilePath);
    }
}
/**
 * Builds the full path of the CBZ archive of a chapter, creating its directory if needed.
 *
 * The file is named "<slug>_vol<volume>_ch<number>.cbz" and placed in the directory
 * returned by createDirectories() for the chapter's volume.
 */
private function generateCbzPath(Manga $manga, Chapter $chapter): string
{
    $volume = $chapter->getVolume();
    $directory = $this->createDirectories($manga, $volume);

    $archiveName = sprintf('%s_vol%d_ch%s.cbz', $manga->getSlug(), $volume, $chapter->getNumber());

    return $directory . '/' . $archiveName;
}
/**
 * Returns the directory in which the archives of one volume are stored, creating it
 * (and its parents) when it does not exist yet.
 *
 * Layout: "<projectDir>/public/cbz/<Slug> (<year or 'unknown'>)/volume_<n>".
 *
 * @throws \RuntimeException When the directory cannot be created
 */
private function createDirectories(Manga $manga, int $volume): string
{
    $mangaYear = $manga->getPublicationYear() ?? 'unknown';
    $mangaDir = sprintf('%s/%s (%s)', $this->projectDir . self::PUBLIC_CBZ, ucfirst($manga->getSlug()), $mangaYear);

    // The volume number is written without zero padding. The former nested
    // sprintf('%02d', ...) had no effect because the outer "%d" converted the padded
    // string back to an integer; the resulting "volume_5" style names are kept as they
    // are so that existing libraries keep their layout.
    $volumeDir = sprintf('%s/volume_%d', $mangaDir, $volume);

    // The second is_dir() covers another process creating the directory concurrently.
    if (!is_dir($volumeDir) && !mkdir($volumeDir, 0755, true) && !is_dir($volumeDir)) {
        throw new \RuntimeException(sprintf('Unable to create directory %s', $volumeDir));
    }

    return $volumeDir;
}
/**
 * Deletes the regular files found directly inside a directory, then the directory itself.
 *
 * Nothing is done when the directory does not exist. Sub-directories are not removed.
 */
private function cleanupTempFiles(string $directory): void
{
    if (!is_dir($directory)) {
        return;
    }

    // glob() can return false instead of an empty array, which foreach cannot iterate.
    $files = glob($directory . '/*') ?: [];
    foreach ($files as $file) {
        if (is_file($file)) {
            unlink($file);
        }
    }

    rmdir($directory);
}
}