383 lines
14 KiB
PHP
383 lines
14 KiB
PHP
<?php
|
|
|
|
namespace App\Service;
|
|
|
|
use App\Entity\Chapter;
|
|
use App\Entity\Manga;
|
|
use App\Entity\ContentSource;
|
|
use App\Event\PageScrappingProgressEvent;
|
|
use App\EventSubscriber\MangaScrapedEvent;
|
|
use Exception;
|
|
use GuzzleHttp\Client;
|
|
use GuzzleHttp\Exception\GuzzleException;
|
|
use GuzzleHttp\Exception\RequestException;
|
|
use Symfony\Component\DomCrawler\Crawler;
|
|
use Symfony\Component\HttpKernel\Exception\BadRequestHttpException;
|
|
use Symfony\Component\HttpKernel\Exception\HttpException;
|
|
use Symfony\Component\HttpKernel\Exception\NotFoundHttpException;
|
|
use Symfony\Component\Routing\Matcher\UrlMatcher;
|
|
use Symfony\Component\Routing\RequestContext;
|
|
use Symfony\Component\Routing\Route;
|
|
use Symfony\Component\Routing\RouteCollection;
|
|
use Symfony\Contracts\EventDispatcher\EventDispatcherInterface;
|
|
|
|
class MangaScraperService
|
|
{
|
|
const string IMG_BASE_DIR = '/public/manga-images';
|
|
private string $projectDir;
|
|
private EventDispatcherInterface $eventDispatcher;
|
|
private string $scrapingType = '';
|
|
|
|
/**
 * @param string                   $projectDir      Project root directory; IMG_BASE_DIR is appended to it
 *                                                  to build the on-disk image directories.
 * @param EventDispatcherInterface $eventDispatcher Dispatcher used for the progress and "scraped" events.
 */
public function __construct(string $projectDir, EventDispatcherInterface $eventDispatcher)
{
    $this->projectDir = $projectDir;
    $this->eventDispatcher = $eventDispatcher;
}
|
|
|
|
/**
 * Extracts the page image URL and the "next page" link from one reader page.
 *
 * The image URL is taken from the `src` attribute of the node matched by the
 * source's image selector, falling back to `data-src` when `src` is missing
 * or empty. An image URL that does not start with http:// or https:// is
 * made absolute using the scheme and host of the source's base URL. The
 * next-page URL is returned exactly as found in the `href` attribute.
 *
 * @param string        $html        Raw HTML of the reader page.
 * @param ContentSource $mangaSource Source providing the selectors and the base URL.
 *
 * @return array{image_url: string, next_page_url: ?string}
 *
 * @throws Exception When no image URL is found, or when a relative image URL
 *                   cannot be resolved because the base URL lacks a scheme or host.
 */
private function extractMangaPageData(string $html, ContentSource $mangaSource): array
{
    $crawler = new Crawler($html);

    // Only read attributes when the selector matched something; an empty
    // `src` (common with lazy loading) falls through to `data-src`.
    $imageNode = $crawler->filter($mangaSource->getImageSelector());
    $imgUrl = '';
    if ($imageNode->count() > 0) {
        foreach (['src', 'data-src'] as $attribute) {
            $imgUrl = trim((string) $imageNode->attr($attribute));
            if ($imgUrl !== '') {
                break;
            }
        }
    }

    if ($imgUrl === '') {
        throw new Exception('No valid image found on the page.');
    }

    $nextLink = $crawler->filter($mangaSource->getNextPageSelector());
    $nextUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;

    // Convert relative image URLs to absolute URLs.
    if (!preg_match('/^https?:\/\//', $imgUrl)) {
        $urlComponents = parse_url($mangaSource->getBaseUrl());
        if (!is_array($urlComponents) || !isset($urlComponents['scheme'], $urlComponents['host'])) {
            throw new Exception('Cannot resolve relative image URL: the source base URL has no scheme or host.');
        }

        // A protocol-relative URL ("//cdn.example/img.jpg") already carries
        // its own host; it only needs the scheme.
        $imgUrl = str_starts_with($imgUrl, '//')
            ? $urlComponents['scheme'] . ':' . $imgUrl
            : $urlComponents['scheme'] . '://' . $urlComponents['host'] . '/' . ltrim($imgUrl, '/');
    }

    return [
        'image_url' => $imgUrl,
        'next_page_url' => $nextUrl,
    ];
}
|
|
|
|
/**
 * Scrapes every chapter of a manga and collects the per-chapter page data.
 *
 * Chapters for which scrapeChapter() returns false are left out of the result.
 *
 * @param Manga         $manga       Manga whose chapters are scraped.
 * @param ContentSource $mangaSource Source describing how to scrape.
 *
 * @return array<int|string, array> Page data keyed by chapter number.
 *
 * @throws GuzzleException
 * @throws Exception
 */
public function scrapeManga(Manga $manga, ContentSource $mangaSource): array
{
    $allChaptersData = [];

    foreach ($manga->getChapters() as $chapter) {
        $chapterData = $this->scrapeChapter($chapter, $mangaSource);
        if ($chapterData === false) {
            continue;
        }

        // A float array key is truncated to an int, so 10.5 would overwrite 10.
        // Casting to string keeps fractional numbers distinct, while whole
        // numbers still end up as the same integer key as before.
        // NOTE(review): presumably getNumber() can return a float, as
        // isChapterAvailable() takes a float chapter number; confirm on the entity.
        $allChaptersData[(string) $chapter->getNumber()] = $chapterData;
    }

    return $allChaptersData;
}
|
|
|
|
/**
 * Scrapes one chapter with the strategy named by the source's scraping type.
 *
 * Recognised types are 'html', 'javascript' and 'mangadex'.
 *
 * @param Chapter       $chapter     Chapter to scrape.
 * @param ContentSource $mangaSource Source whose scraping type selects the strategy.
 *
 * @return array|bool Whatever the selected strategy returns.
 *
 * @throws GuzzleException
 * @throws Exception When the scraping type is not one of the recognised values.
 */
public function scrapeChapter(Chapter $chapter, ContentSource $mangaSource): array|bool
{
    $type = $mangaSource->getScrapingType();

    // Reject unknown types up front so the dispatch below only lists real strategies.
    if (!in_array($type, ['html', 'javascript', 'mangadex'], true)) {
        throw new Exception('Unsupported scraping type: ' . $type);
    }

    return match ($type) {
        'html' => $this->scrapeChapterHtml($chapter->getManga(), $chapter, $mangaSource),
        'javascript' => $this->scrapeChapterJavaScript($chapter->getManga(), $chapter, $mangaSource),
        'mangadex' => $this->scrapeChapterMangadex($chapter, $mangaSource),
    };
}
|
|
|
|
// private function scrapeChapterHtml(Manga $manga, Chapter $chapter, MangaSource $mangaSource): array|bool
|
|
// {
|
|
// $chapterUrl = $mangaSource->getChapterUrl($manga->getTitle(), $chapter->getChapterNumber());
|
|
// $html = $this->fetchHtml($chapterUrl);
|
|
// $imgUrls = $this->extractMangaPageData($html);
|
|
//
|
|
// return $this->saveChapterImages($manga, $chapter, $imgUrls);
|
|
// }
|
|
|
|
/**
 * Downloads all pages of a chapter listed by the source's chapter endpoint.
 *
 * The endpoint URL is the source base URL followed by the chapter URL format
 * filled with the chapter's external id. The JSON response must contain
 * `result` = "ok", `baseUrl`, `chapter.hash` and a non-empty
 * `chapter.dataSaver` list; each entry is downloaded from
 * `{baseUrl}/data-saver/{hash}/{entry}` into
 * `<projectDir>/public/manga-images/<title>/<chapter number>/NNN.<ext>`.
 * A progress event is dispatched after every page and a MangaScrapedEvent
 * once the chapter is complete.
 *
 * @param Chapter       $chapter     Chapter to download.
 * @param ContentSource $mangaSource Source providing the base URL and URL format.
 *
 * @return array|bool List of pages, each with `image_url`, `local_image_url` and `page_number`.
 *
 * @throws GuzzleException
 * @throws Exception When the response is unusable, the target directory cannot
 *                   be created, or a page download fails.
 */
private function scrapeChapterMangadex(Chapter $chapter, ContentSource $mangaSource): array|bool
{
    $this->scrapingType = 'mangadex';

    $mangaTitle = $chapter->getManga()->getTitle();
    $chapterNumber = $chapter->getNumber();
    $chapterUrl = $mangaSource->getBaseUrl() . sprintf($mangaSource->getChapterUrlFormat(), $chapter->getExternalId());

    $response = (new Client())->get($chapterUrl);
    $results = json_decode($response->getBody()->getContents(), true);

    // Validate the whole payload before touching the filesystem: an error
    // response has no `chapter` key, and a non-"ok" result must not be
    // reported as a successfully scraped (empty) chapter.
    $pages = is_array($results) ? ($results['chapter']['dataSaver'] ?? null) : null;
    if (
        !is_array($results)
        || ($results['result'] ?? null) !== 'ok'
        || !is_array($pages)
        || $pages === []
        || !isset($results['baseUrl'], $results['chapter']['hash'])
    ) {
        throw new Exception('Error while fetching chapter data from Mangadex ' . $mangaTitle . ' ' . $chapterNumber);
    }

    // The recursive mkdir creates the manga directory and the chapter directory in one call.
    // The trailing is_dir() tolerates another process creating it concurrently.
    $chapterDir = sprintf('%s/%s/%s', $this->projectDir . self::IMG_BASE_DIR, $mangaTitle, $chapterNumber);
    if (!is_dir($chapterDir) && !mkdir($chapterDir, 0755, true) && !is_dir($chapterDir)) {
        throw new Exception(sprintf('Unable to create directory "%s".', $chapterDir));
    }

    $totalPages = count($pages);
    $pageData = [];

    foreach (array_values($pages) as $index => $page) {
        $pageNumber = $index + 1;
        $pageUrl = $results['baseUrl'] . '/data-saver/' . $results['chapter']['hash'] . '/' . $page;

        // Determine the image extension from the URL path.
        $imageExtension = pathinfo(parse_url($pageUrl, PHP_URL_PATH), PATHINFO_EXTENSION);

        // Build the zero-padded local file name.
        $imageName = sprintf('%03d.%s', $pageNumber, $imageExtension);
        $imagePath = sprintf('%s/%s', $chapterDir, $imageName);

        $this->downloadAndSaveImage($pageUrl, $imagePath);

        $progressEvent = new PageScrappingProgressEvent($chapter->getId(), $pageNumber, $totalPages);
        $this->eventDispatcher->dispatch($progressEvent, PageScrappingProgressEvent::NAME);

        $pageData[] = [
            'image_url' => $pageUrl,
            'local_image_url' => sprintf('/manga-images/%s/%s/%s', $mangaTitle, $chapterNumber, $imageName),
            'page_number' => $pageNumber,
        ];
    }

    $event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData, $chapterDir);
    $this->eventDispatcher->dispatch($event, MangaScrapedEvent::NAME);

    return $pageData;
}
|
|
|
|
/**
 * Scrapes a chapter whose pages are rendered client-side.
 *
 * The image URLs are collected by the Puppeteer helper and then downloaded
 * and stored by saveChapterImages().
 *
 * @param Manga         $manga       Manga the chapter belongs to.
 * @param Chapter       $chapter     Chapter to scrape.
 * @param ContentSource $mangaSource Source providing the chapter URL and the selectors.
 *
 * @return array|bool List of pages as returned by saveChapterImages().
 *
 * @throws GuzzleException
 * @throws Exception
 */
private function scrapeChapterJavaScript(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
{
    // downloadAndSaveImage() sends MangaDex reports whenever the remembered
    // type is 'mangadex'. Record the current type so a value left over from
    // an earlier MangaDex scrape cannot trigger reports for this source.
    $this->scrapingType = 'javascript';

    // NOTE(review): the HTML strategy builds its URL from getSlug(), this one
    // from getTitle(); confirm which one getChapterUrl() expects.
    $chapterUrl = $mangaSource->getChapterUrl($manga->getTitle(), $chapter->getNumber());
    $imgUrls = $this->fetchImagesUsingPuppeteer(
        $chapterUrl,
        $mangaSource->getImageSelector(),
        $mangaSource->getNextPageSelector(),
    );

    return $this->saveChapterImages($manga, $chapter, $imgUrls);
}
|
|
|
|
/**
 * Runs the Puppeteer helper script and returns the image URLs it prints as JSON.
 *
 * The script is invoked as
 * `node puppeteer-script.js <url> <imageSelector> <nextButtonSelector>`
 * with stderr merged into stdout, and its output is decoded as JSON.
 *
 * NOTE(review): the exec() call was commented out in the previous revision,
 * so the method always decoded an empty string and returned null, violating
 * its `array` return type. The call is restored here with escaped arguments;
 * confirm that node and puppeteer-script.js are available in the working
 * directory of the PHP process.
 *
 * @param string $url                Chapter URL to open.
 * @param string $imageSelector      CSS selector of the page image.
 * @param string $nextButtonSelector CSS selector of the "next page" control.
 *
 * @return array Decoded JSON output of the script.
 *
 * @throws Exception When the script exits with a non-zero code or does not print a JSON array/object.
 */
private function fetchImagesUsingPuppeteer(string $url, string $imageSelector, string $nextButtonSelector): array
{
    // Every argument is shell-escaped: URLs and selectors may contain quotes,
    // spaces or shell metacharacters.
    $command = sprintf(
        'node puppeteer-script.js %s %s %s 2>&1', // redirect stderr to stdout
        escapeshellarg($url),
        escapeshellarg($imageSelector),
        escapeshellarg($nextButtonSelector),
    );

    $output = [];
    $exitCode = 0;
    exec($command, $output, $exitCode);
    $rawOutput = implode('', $output);

    if ($exitCode !== 0) {
        throw new Exception(sprintf('Puppeteer script exited with code %d: %s', $exitCode, $rawOutput));
    }

    // Convert the JSON output into a PHP array.
    $decoded = json_decode($rawOutput, true);
    if (!is_array($decoded)) {
        throw new Exception('Puppeteer script did not return valid JSON: ' . $rawOutput);
    }

    return $decoded;
}
|
|
|
|
/**
 * Scrapes a chapter by following "next page" links through static HTML pages.
 *
 * Starting at the chapter URL built from the manga slug and chapter number,
 * each page is fetched, its image is downloaded to
 * `<projectDir>/public/manga-images/<title>/<chapter number>/NNN.<ext>`, and a
 * progress event is dispatched (with a total of 0, since the page count is
 * not known up front). The walk ends when a page has no next link, or when
 * the next link points to a URL already fetched during this run. A
 * MangaScrapedEvent is dispatched once the walk ends.
 *
 * @param Manga         $manga       Manga the chapter belongs to.
 * @param Chapter       $chapter     Chapter to scrape.
 * @param ContentSource $mangaSource Source providing the chapter URL and the selectors.
 *
 * @return array|bool List of pages, each with `image_url`, `local_image_url` and `page_number`.
 *
 * @throws GuzzleException
 * @throws Exception When the target directory cannot be created, or a page or image cannot be fetched.
 */
private function scrapeChapterHtml(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
{
    $this->scrapingType = 'html';

    $mangaTitle = $manga->getTitle();
    $chapterNumber = $chapter->getNumber();
    $currentPageUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapterNumber);

    // The recursive mkdir creates the manga directory and the chapter directory in one call.
    // The trailing is_dir() tolerates another process creating it concurrently.
    $chapterDir = sprintf('%s/%s/%s', $this->projectDir . self::IMG_BASE_DIR, $mangaTitle, $chapterNumber);
    if (!is_dir($chapterDir) && !mkdir($chapterDir, 0755, true) && !is_dir($chapterDir)) {
        throw new Exception(sprintf('Unable to create directory "%s".', $chapterDir));
    }

    $pageData = [];
    $visitedUrls = [];

    do {
        $visitedUrls[$currentPageUrl] = true;
        $pageNumber = count($pageData) + 1;

        $html = $this->fetchHtml($currentPageUrl);
        $page = $this->extractMangaPageData($html, $mangaSource);

        // Determine the image extension from the URL path.
        $imageExtension = pathinfo(parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION);

        // Build the zero-padded local file name.
        $imageName = sprintf('%03d.%s', $pageNumber, $imageExtension);
        $imagePath = sprintf('%s/%s', $chapterDir, $imageName);

        $this->downloadAndSaveImage($page['image_url'], $imagePath);

        $progressEvent = new PageScrappingProgressEvent($chapter->getId(), $pageNumber, 0);
        $this->eventDispatcher->dispatch($progressEvent, PageScrappingProgressEvent::NAME);

        $pageData[] = [
            'image_url' => $page['image_url'],
            'local_image_url' => sprintf('/manga-images/%s/%s/%s', $mangaTitle, $chapterNumber, $imageName),
            'page_number' => $pageNumber,
        ];

        // A next link that points back to a page already fetched would make
        // this loop run forever; treat it as the end of the chapter.
        $nextPageUrl = $page['next_page_url'];
        $currentPageUrl = ($nextPageUrl !== null && !isset($visitedUrls[$nextPageUrl])) ? $nextPageUrl : null;
    } while ($currentPageUrl);

    $event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData, $chapterDir);
    $this->eventDispatcher->dispatch($event, MangaScrapedEvent::NAME);

    return $pageData;
}
|
|
|
|
/**
 * Fetches a page and returns its body as a string.
 *
 * Redirects are not followed: a 3xx response is reported as a missing
 * chapter. HTTP error statuses (4xx/5xx, including 404) and transport
 * failures surface as Guzzle exceptions, because `http_errors` is enabled,
 * and are re-thrown as a "Bad Request" exception that keeps the original
 * exception as its previous one.
 *
 * @param string $url Absolute URL of the page to fetch.
 *
 * @return string Response body.
 *
 * @throws Exception When the request fails or the server answers with a redirect.
 */
private function fetchHtml(string $url): string
{
    try {
        $response = (new Client())->get($url, [
            'http_errors' => true,
            'allow_redirects' => false,
        ]);
    } catch (GuzzleException $e) {
        // Only Guzzle failures are wrapped here, so the redirect error below
        // is not relabelled as a "Bad Request".
        throw new Exception('Bad Request: ' . $e->getMessage(), previous: $e);
    }

    $statusCode = $response->getStatusCode();
    if ($statusCode >= 300 && $statusCode < 400) {
        throw new Exception('Chapter Not Found at ' . $url);
    }

    return (string) $response->getBody();
}
|
|
|
|
/**
 * Downloads one image and writes it to disk.
 *
 * The response is accepted only when its Content-Type starts with "image/".
 * While the current scraping type is 'mangadex', a report is sent after each
 * download with the outcome, whether the X-Cache header starts with "HIT",
 * the Content-Length value and the request duration in milliseconds.
 *
 * @param string $imageUrl        URL of the image to download.
 * @param string $destinationPath File path the image is written to.
 *
 * @throws GuzzleException
 * @throws \Exception When the request fails, the response is not an image,
 *                    or the file cannot be written.
 */
private function downloadAndSaveImage(string $imageUrl, string $destinationPath): void
{
    $startTime = microtime(true);

    try {
        $response = (new Client())->get($imageUrl);
    } catch (RequestException $e) {
        throw new \Exception('Erreur lors de la récupération de l\'image : ' . $e->getMessage(), previous: $e);
    }

    $durationMs = (microtime(true) - $startTime) * 1000;
    $contentType = $response->getHeaderLine('Content-Type');
    $isImage = str_starts_with($contentType, 'image/');

    // file_put_contents() returns false on failure; without this check a
    // page would be recorded as saved although no file exists.
    if ($isImage && file_put_contents($destinationPath, $response->getBody()->getContents()) === false) {
        throw new \Exception(sprintf('Unable to write image to "%s".', $destinationPath));
    }

    if ($this->scrapingType === 'mangadex') {
        $this->sendReport(
            $imageUrl,
            $isImage,
            str_starts_with($response->getHeaderLine('X-Cache'), 'HIT'),
            (int) $response->getHeaderLine('Content-Length'),
            $durationMs,
        );
    }

    if (!$isImage) {
        throw new \Exception('Le contenu récupéré n\'est pas une image. Type de contenu : ' . $contentType);
    }
}
|
|
|
|
/**
 * Downloads a list of image URLs into the chapter directory and announces the result.
 *
 * Images are stored as
 * `<projectDir>/public/manga-images/<title>/<chapter number>/NNN.<ext>`,
 * numbered sequentially from 001 in the order given. A MangaScrapedEvent is
 * dispatched after all images have been saved.
 *
 * @param Manga   $manga   Manga the chapter belongs to.
 * @param Chapter $chapter Chapter the images belong to.
 * @param array   $imgUrls Image URLs in reading order.
 *
 * @return array List of pages, each with `image_url`, `local_image_url` and `page_number`.
 *
 * @throws GuzzleException
 * @throws Exception When the target directory cannot be created or an image download fails.
 */
private function saveChapterImages(Manga $manga, Chapter $chapter, array $imgUrls): array
{
    $mangaTitle = $manga->getTitle();
    $chapterNumber = $chapter->getNumber();

    // The recursive mkdir creates the manga directory and the chapter directory in one call.
    // The trailing is_dir() tolerates another process creating it concurrently.
    $chapterDir = sprintf('%s/%s/%s', $this->projectDir . self::IMG_BASE_DIR, $mangaTitle, $chapterNumber);
    if (!is_dir($chapterDir) && !mkdir($chapterDir, 0755, true) && !is_dir($chapterDir)) {
        throw new Exception(sprintf('Unable to create directory "%s".', $chapterDir));
    }

    $pageData = [];

    // array_values() guarantees 0-based integer keys, so page numbers stay
    // sequential even when the input has string or sparse keys.
    foreach (array_values($imgUrls) as $index => $imgUrl) {
        $pageNumber = $index + 1;
        $imageExtension = pathinfo(parse_url($imgUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
        $imageName = sprintf('%03d.%s', $pageNumber, $imageExtension);
        $imagePath = sprintf('%s/%s', $chapterDir, $imageName);

        $this->downloadAndSaveImage($imgUrl, $imagePath);

        $pageData[] = [
            'image_url' => $imgUrl,
            'local_image_url' => sprintf('/manga-images/%s/%s/%s', $mangaTitle, $chapterNumber, $imageName),
            'page_number' => $pageNumber,
        ];
    }

    $event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData, $chapterDir);
    $this->eventDispatcher->dispatch($event, MangaScrapedEvent::NAME);

    return $pageData;
}
|
|
|
|
/**
 * Checks whether the "next page" link of a chapter page stays within the given chapter.
 *
 * The page is fetched, the href of its next-page link is matched against the
 * path pattern `/scan-{manga}/{chapter}/{page}`, and the `{chapter}` segment
 * is compared with the expected chapter number.
 *
 * @param string        $chapterUrl    URL of the chapter page to inspect.
 * @param float         $chapterNumber Chapter number the next link is expected to carry.
 * @param ContentSource $mangaSource   Source providing the next-page selector.
 *
 * @return bool True when the next link's chapter segment equals $chapterNumber.
 *              False when there is no next link, the link has no usable path,
 *              or the path does not follow the expected pattern.
 *
 * @throws GuzzleException
 * @throws Exception When the page cannot be fetched.
 */
private function isChapterAvailable(string $chapterUrl, float $chapterNumber, ContentSource $mangaSource): bool
{
    $crawler = new Crawler($this->fetchHtml($chapterUrl));
    $nextLink = $crawler->filter($mangaSource->getNextPageSelector());

    if ($nextLink->count() === 0) {
        return false;
    }

    // parse_url() yields null or false when the href is missing, malformed or has no path.
    $path = parse_url((string) $nextLink->attr('href'), PHP_URL_PATH);
    if (!is_string($path) || $path === '') {
        return false;
    }

    $routeCollection = new RouteCollection();
    $routeCollection->add('manga_chapter', new Route('/scan-{manga}/{chapter}/{page}'));
    $matcher = new UrlMatcher($routeCollection, new RequestContext('/'));

    try {
        $parameters = $matcher->match($path);
    } catch (\Symfony\Component\Routing\Exception\ResourceNotFoundException) {
        // The link does not follow the expected pattern, so the chapter
        // cannot be confirmed; answer "no" rather than abort the caller.
        return false;
    }

    return (float) $parameters['chapter'] === $chapterNumber;
}
|
|
|
|
/**
 * Posts the outcome of one image download to https://api.mangadex.network/report.
 *
 * @param string $imageUrl URL of the image that was requested.
 * @param bool   $success  Whether the download produced an image.
 * @param bool   $cached   Whether the response was flagged as a cache hit.
 * @param int    $bytes    Size of the response in bytes.
 * @param float  $duration Time the request took; the caller in this class passes milliseconds.
 *
 * @throws \Exception When posting the report fails with a Guzzle RequestException.
 */
private function sendReport(string $imageUrl, bool $success, bool $cached, int $bytes, float $duration): void
{
    $payload = [
        'url' => $imageUrl,
        'success' => $success,
        'cached' => $cached,
        'bytes' => $bytes,
        'duration' => $duration,
    ];

    $requestOptions = [
        'headers' => [
            'Content-Type' => 'application/json',
        ],
        'json' => $payload,
    ];

    try {
        (new Client())->post('https://api.mangadex.network/report', $requestOptions);
    } catch (RequestException $requestError) {
        // Surface reporting failures to the caller as a plain exception.
        throw new \Exception('Erreur lors de l\'envoi du rapport : ' . $requestError->getMessage());
    }
}
|
|
}
|