- Messenger, Mercure
- chapter download flow (lelscan only)
This commit is contained in:
Jérémy Guillot
2024-06-13 18:11:11 +02:00
parent f88fa2c232
commit bc85649789
24 changed files with 744 additions and 78 deletions

View File

@@ -8,7 +8,11 @@ use App\Entity\ContentSource;
use App\EventSubscriber\MangaScrapedEvent;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use GuzzleHttp\Exception\RequestException;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\HttpKernel\Exception\BadRequestHttpException;
use Symfony\Component\HttpKernel\Exception\HttpException;
use Symfony\Component\HttpKernel\Exception\NotFoundHttpException;
use Symfony\Component\Routing\Matcher\UrlMatcher;
use Symfony\Component\Routing\RequestContext;
use Symfony\Component\Routing\Route;
@@ -27,42 +31,31 @@ class MangaScraperService
$this->eventDispatcher = $eventDispatcher;
}
/**
 * Extracts the page image URL and the next-page link from a reader page.
 *
 * The image element is located with the source's image selector; its `src`
 * attribute is used, falling back to `data-src` when `src` is missing or
 * empty. A URL that is not already absolute (http/https) is resolved against
 * the scheme and host of the source's base URL.
 *
 * @param string        $html        Raw HTML of the reader page.
 * @param ContentSource $mangaSource Source providing the CSS selectors and base URL.
 *
 * @return array{image_url: string, next_page_url: ?string}
 *
 * @throws \RuntimeException When no image URL can be found on the page, or when
 *                           a relative image URL cannot be resolved because the
 *                           base URL has no scheme/host.
 */
private function extractMangaPageData(string $html, ContentSource $mangaSource): array
{
    $crawler = new Crawler($html);

    // Crawler::attr() throws on an empty node list, so check the match count
    // before reading any attribute.
    $imageNode = $crawler->filter($mangaSource->getImageSelector());
    $imgUrl = null;
    if ($imageNode->count() > 0) {
        $imgUrl = $imageNode->attr('src');
        if ($imgUrl === null || trim($imgUrl) === '') {
            // An empty `src` should not hide a usable `data-src`.
            $imgUrl = $imageNode->attr('data-src');
        }
    }

    if ($imgUrl === null || trim($imgUrl) === '') {
        throw new \RuntimeException('No valid image found on the page.');
    }
    $imgUrl = trim($imgUrl);

    $nextLink = $crawler->filter($mangaSource->getNextPageSelector());
    $nextUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;

    // Convert a relative image URL into an absolute one.
    if (!preg_match('/^https?:\/\//i', $imgUrl)) {
        $urlComponents = parse_url((string) $mangaSource->getBaseUrl());
        if (!is_array($urlComponents) || !isset($urlComponents['scheme'], $urlComponents['host'])) {
            throw new \RuntimeException('Cannot resolve relative image URL: base URL has no scheme or host.');
        }

        if (str_starts_with($imgUrl, '//')) {
            // Protocol-relative URL: it already carries its own host.
            $imgUrl = $urlComponents['scheme'] . ':' . $imgUrl;
        } else {
            $authority = $urlComponents['host']
                . (isset($urlComponents['port']) ? ':' . $urlComponents['port'] : '');
            $imgUrl = $urlComponents['scheme'] . '://' . $authority . '/' . ltrim($imgUrl, '/');
        }
    }

    return [
        'image_url' => $imgUrl,
        'next_page_url' => $nextUrl,
    ];
}
@@ -75,7 +68,7 @@ class MangaScraperService
$allChaptersData = [];
foreach ($manga->getChapters() as $chapter) {
$chapterData = $this->scrapeChapter($manga, $chapter, $mangaSource);
$chapterData = $this->scrapeChapter($chapter, $mangaSource);
if ($chapterData !== false) {
$allChaptersData[$chapter->getNumber()] = $chapterData;
}
@@ -84,13 +77,13 @@ class MangaScraperService
return $allChaptersData;
}
private function scrapeChapter(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
public function scrapeChapter(Chapter $chapter, ContentSource $mangaSource): array|bool
{
switch ($mangaSource->getScrapingType()) {
case 'html':
return $this->scrapeChapterHtml($manga, $chapter, $mangaSource);
return $this->scrapeChapterHtml($chapter->getManga(), $chapter, $mangaSource);
case 'javascript':
return $this->scrapeChapterJavaScript($manga, $chapter, $mangaSource);
return $this->scrapeChapterJavaScript($chapter->getManga(), $chapter, $mangaSource);
// case 'api':
// // Implémentez la méthode de scraping par API si nécessaire
// return $this->scrapeChapterApi($manga, $chapter, $mangaSource);
@@ -121,10 +114,10 @@ class MangaScraperService
// Appeler le script Puppeteer avec les paramètres nécessaires
$output = [];
$command = sprintf('node puppeteer-script.js "%s" "%s" "%s" 2>&1', $url, $imageSelector, $nextButtonSelector); // Redirect stderr to stdout
dump($command);
// dump($command);
// exec($command, $output, $return_var);
dd($command, $output);
// dd($command, $output);
// Convertir la sortie JSON en tableau PHP
return json_decode(implode("", $output), true);
@@ -156,34 +149,25 @@ class MangaScraperService
$html = $this->fetchHtml($currentPageUrl);
$page = $this->extractMangaPageData($html, $mangaSource);
foreach ($page['image_urls'] as $imgUrl) {
dump($imgUrl);
dump(base64_decode($imgUrl));
// Déterminer l'extension de l'image
$imageExtension = pathinfo(parse_url($imgUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
// Déterminer l'extension de l'image
$imageExtension = pathinfo(parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION);
// Construire le nom de fichier de l'image
$imageName = sprintf('%03d.%s', count($pageData) + 1, $imageExtension);
$imagePath = sprintf('%s/%s', $chapterDir, $imageName);
// Construire le nom de fichier de l'image
$imageName = sprintf('%03d.%s', count($pageData) + 1, $imageExtension);
$imagePath = sprintf('%s/%s', $chapterDir, $imageName);
$this->downloadAndSaveImage($imgUrl, $imagePath);
$this->downloadAndSaveImage($page['image_url'], $imagePath);
$pageData[] = [
'image_url' => $imgUrl,
'local_image_url' => sprintf('/manga-images/%s/%s/%s', $mangaTitle, $chapterNumber, $imageName),
'page_number' => count($pageData) + 1,
];
}
// Si plus d'une image a été trouvée, ne pas chercher la page suivante
if (count($page['image_urls']) > 1) {
break;
}
$pageData[] = [
'image_url' => $page['image_url'],
'local_image_url' => sprintf('/manga-images/%s/%s/%s', $mangaTitle, $chapterNumber, $imageName),
'page_number' => count($pageData) + 1,
];
$currentPageUrl = $page['next_page_url'];
} while ($currentPageUrl);
$event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData);
$event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData, $chapterDir);
$this->eventDispatcher->dispatch($event, MangaScrapedEvent::NAME);
return $pageData;
@@ -195,9 +179,25 @@ class MangaScraperService
/**
 * Downloads the HTML of a page.
 *
 * Redirects are not followed: a 3xx response is reported as a missing
 * chapter, the same as a 404 (presumably the source redirects unknown
 * chapters elsewhere — confirm against the scraped sites).
 *
 * @param string $url Absolute URL of the page to fetch.
 *
 * @return string The response body.
 *
 * @throws NotFoundHttpException   When the server answers with a redirect or a 404.
 * @throws BadRequestHttpException When the request fails at transport level or the
 *                                 server answers with any other 4xx/5xx status.
 */
private function fetchHtml(string $url): string
{
    $client = new Client();

    try {
        // http_errors is disabled so the status code can be inspected here;
        // with it enabled Guzzle throws before the 404 check is ever reached.
        $response = $client->get($url, [
            'http_errors' => false,
            'allow_redirects' => false,
        ]);
    } catch (GuzzleException $e) {
        // Only transport failures are wrapped; the HTTP exceptions thrown
        // below stay outside this try so they are not turned into a 400.
        throw new BadRequestHttpException('Bad Request: ' . $e->getMessage(), $e);
    }

    $statusCode = $response->getStatusCode();

    if (($statusCode >= 300 && $statusCode < 400) || $statusCode === 404) {
        throw new NotFoundHttpException('Chapter Not Found at ' . $url);
    }

    if ($statusCode >= 400) {
        throw new BadRequestHttpException(
            sprintf('Bad Request: unexpected HTTP status %d at %s', $statusCode, $url)
        );
    }

    return (string) $response->getBody();
}
/**
@@ -240,7 +240,7 @@ class MangaScraperService
];
}
$event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData);
$event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData, $chapterDir);
$this->eventDispatcher->dispatch($event, MangaScrapedEvent::NAME);
return $pageData;