- trop de trucs d'un coup... je vais faire attention ensuite ^^'

This commit is contained in:
Jérémy Guillot
2024-06-10 13:57:50 +02:00
parent 9595831aa3
commit c46e1a0a5c
69 changed files with 4004 additions and 385 deletions

View File

@@ -2,6 +2,9 @@
namespace App\Service;
use App\Entity\Chapter;
use App\Entity\Manga;
use App\Entity\ContentSource;
use App\EventSubscriber\MangaScrapedEvent;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
@@ -14,144 +17,256 @@ use Symfony\Contracts\EventDispatcher\EventDispatcherInterface;
class MangaScraperService
{
const string IMG_BASE_DIR = '/public/manga-images';
private string $projectDir;
private EventDispatcherInterface $eventDispatcher;
const IMG_BASE_DIR = '/public/manga-images';
private string $projectDir;
private EventDispatcherInterface $eventDispatcher;
/**
 * Stores the project directory and the event dispatcher on the service.
 *
 * @param mixed                    $projectDir      Assigned to the string-typed $projectDir property;
 *                                                  IMG_BASE_DIR is appended to it when image paths are built.
 * @param EventDispatcherInterface $eventDispatcher Dispatcher used to publish MangaScrapedEvent.
 */
public function __construct($projectDir, EventDispatcherInterface $eventDispatcher)
{
$this->projectDir = $projectDir;
$this->eventDispatcher = $eventDispatcher;
}
/**
 * Stores the project directory and the event dispatcher on the service.
 *
 * @param string                   $projectDir      Base directory; IMG_BASE_DIR is appended to it
 *                                                  when image paths are built.
 * @param EventDispatcherInterface $eventDispatcher Dispatcher used to publish MangaScrapedEvent.
 */
public function __construct(string $projectDir, EventDispatcherInterface $eventDispatcher)
{
    // The parameter is typed to match the string-typed property it is assigned to.
    $this->projectDir = $projectDir;
    $this->eventDispatcher = $eventDispatcher;
}
/**
 * Single-image variant: reads an <img> src and the a[title="Suivant"] href from the page.
 *
 * NOTE(review): two variants of extractMangaPageData are interleaved in this span (this one,
 * built around a hard-coded base URL, and a ContentSource-driven one that returns several
 * image URLs) — confirm which lines are meant to be live.
 */
public function extractMangaPageData(string $html): array
{
$baseUrl = 'https://lelscans.net';
// Keeps PhpStorm from complaining...
$selector = 'img';
$crawler = new Crawler($html);
$imgUrl = $crawler->filter($selector)->attr('src');
$nextLink = $crawler->filter('a[title="Suivant"]');
/**
 * ContentSource-driven variant: collects image URLs found in $html plus the next-page link.
 *
 * @param string        $html        Markup handed to the Crawler.
 * @param ContentSource $mangaSource Supplies the next-page selector and the base URL.
 *
 * @return array 'image_urls' (URLs made absolute against the source base URL) and
 *               'next_page_url' (href of the next-page link, or null when there is none).
 *
 * @throws \Exception When no src/data-src value was collected.
 */
public function extractMangaPageData(string $html, ContentSource $mangaSource): array
{
$crawler = new Crawler($html);
$imgUrls = [];
// Single-image variant: a src that does not start with http(s):// is treated as relative.
if (!preg_match('/^https?:\/\//', $imgUrl)) {
$urlComponents = parse_url($baseUrl);
$scheme = $urlComponents['scheme'];
$host = $urlComponents['host'];
// Search for images with different extensions
// NOTE(review): the trailing 'img' selector matches every <img>, including the ones the
// extension selectors already matched, so those URLs are appended more than once. Callers
// that test count($page['image_urls']) > 1 are affected — confirm this is intended.
foreach (['img[src$=".jpg"]', 'img[src$=".jpeg"]', 'img[src$=".png"]', 'img'] as $selector) {
$crawler->filter($selector)->each(function (Crawler $node) use (&$imgUrls) {
// Prefer src; fall back to data-src when src is null.
$src = $node->attr('src') ?? $node->attr('data-src');
if ($src) {
$imgUrls[] = $src;
}
});
}
// Build the absolute image URL
$imgUrl = $scheme . '://' . $host . '/' . ltrim($imgUrl, '/');
}
if (empty($imgUrls)) {
throw new \Exception('No valid image found on the page.');
}
if($nextLink->count() > 0){
$nextUrl = $nextLink->attr('href');
}else{
$nextUrl = null;
}
// The next-page link is located with the selector configured on the source.
$nextLink = $crawler->filter($mangaSource->getNextPageSelector());
$nextUrl = $nextLink->count() > 0 ? $nextLink->attr('href') : null;
return [
'image_url' => $imgUrl,
'next_page_url' => $nextUrl,
];
}
// Convert relative URLs to absolute URLs
// NOTE(review): only the scheme and host of the base URL are reused, and protocol-relative
// URLs ("//host/...") are not recognised as absolute by the regex — verify against real sources.
$baseUrl = $mangaSource->getBaseUrl();
$imgUrls = array_map(function ($imgUrl) use ($baseUrl) {
if (!preg_match('/^https?:\/\//', $imgUrl)) {
$urlComponents = parse_url($baseUrl);
$scheme = $urlComponents['scheme'];
$host = $urlComponents['host'];
$imgUrl = $scheme . '://' . $host . '/' . ltrim($imgUrl, '/');
}
return $imgUrl;
}, $imgUrls);
return [
'image_urls' => $imgUrls,
'next_page_url' => $nextUrl,
];
}
/**
 * Scrapes chapters and returns the collected page data.
 *
 * NOTE(review): the opening of scrapeMangaChapter() and the body of scrapeManga() are
 * interleaved in this span — confirm which lines are meant to be live.
 *
 * @throws GuzzleException
 */
public function scrapeMangaChapter(string $chapterUrl, string $mangaTitle, float $chapterNumber): array|bool
{
// Bail out with false when the chapter is reported as unavailable.
if(!$this->isChapterAvailable($chapterUrl, $chapterNumber)){
return false;
}
/**
 * Scrapes every chapter of $manga through scrapeChapter().
 *
 * @return array Chapter data keyed by chapter number; chapters for which scrapeChapter()
 *               returned false are left out.
 */
public function scrapeManga(Manga $manga, ContentSource $mangaSource): array
{
$allChaptersData = [];
$pageData = [];
$currentPageUrl = $chapterUrl;
foreach ($manga->getChapters() as $chapter) {
$chapterData = $this->scrapeChapter($manga, $chapter, $mangaSource);
// false marks a chapter that could not be scraped; it is skipped.
if ($chapterData !== false) {
$allChaptersData[$chapter->getNumber()] = $chapterData;
}
}
// Create the manga directory under {projectDir}{IMG_BASE_DIR} if it does not exist.
$mangaDir = sprintf('%s/%s', $this->projectDir . self::IMG_BASE_DIR, $mangaTitle);
if (!is_dir($mangaDir)) {
mkdir($mangaDir, 0755, true);
}
return $allChaptersData;
}
// Create the chapter directory if it does not exist
$chapterDir = sprintf('%s/%s', $mangaDir, $chapterNumber);
if (!is_dir($chapterDir)) {
mkdir($chapterDir, 0755, true);
}
/**
 * Scrapes one chapter using the strategy named by the source's scraping type.
 *
 * @param Manga         $manga       Manga the chapter belongs to.
 * @param Chapter       $chapter     Chapter to scrape.
 * @param ContentSource $mangaSource Source whose getScrapingType() selects the strategy.
 *
 * @return array|bool Whatever the selected strategy returns.
 *
 * @throws \Exception      When the scraping type is neither 'html' nor 'javascript'.
 * @throws GuzzleException When the HTML strategy fails to fetch a page or an image.
 */
private function scrapeChapter(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
{
    $scrapingType = $mangaSource->getScrapingType();

    // match compares with ===, so only these exact strings select a strategy.
    // An 'api' strategy is not implemented yet; such sources end up in the default arm.
    return match ($scrapingType) {
        'html' => $this->scrapeChapterHtml($manga, $chapter, $mangaSource),
        'javascript' => $this->scrapeChapterJavaScript($manga, $chapter, $mangaSource),
        default => throw new \Exception('Unsupported scraping type: ' . $scrapingType),
    };
}
do {
$html = $this->fetchHtml($currentPageUrl);
$page = $this->extractMangaPageData($html);
$pageData[] = $page;
$currentPageUrl = $page['next_page_url'];
// private function scrapeChapterHtml(Manga $manga, Chapter $chapter, MangaSource $mangaSource): array|bool
// {
// $chapterUrl = $mangaSource->getChapterUrl($manga->getTitle(), $chapter->getChapterNumber());
// $html = $this->fetchHtml($chapterUrl);
// $imgUrls = $this->extractMangaPageData($html);
//
// return $this->saveChapterImages($manga, $chapter, $imgUrls);
// }
// Construisez le nom de fichier de l'image
$imageName = sprintf('%03d.jpg', count($pageData));
/**
 * Scrapes a chapter by collecting its image URLs through the Puppeteer helper and handing
 * them to saveChapterImages().
 *
 * NOTE(review): the chapter URL is built from $manga->getTitle() here, while
 * scrapeChapterHtml() passes $manga->getSlug() to the same getChapterUrl() — confirm which
 * one the source expects.
 *
 * @return array|bool Page data as returned by saveChapterImages().
 */
private function scrapeChapterJavaScript(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
{
$chapterUrl = $mangaSource->getChapterUrl($manga->getTitle(), $chapter->getNumber());
$imgUrls = $this->fetchImagesUsingPuppeteer($chapterUrl, $mangaSource->getImageSelector(), $mangaSource->getNextPageSelector());
// Build the image file path
// NOTE(review): $chapterDir and $imageName are not defined in this method; this line looks
// like a leftover from another method — confirm.
$imagePath = sprintf('%s/%s', $chapterDir, $imageName);
return $this->saveChapterImages($manga, $chapter, $imgUrls);
}
// Téléchargez et enregistrez l'image
$this->downloadAndSaveImage($page['image_url'], $imagePath);
/**
 * Builds the command line for the Node "puppeteer-script.js" helper and decodes its JSON output.
 *
 * NOTE(review): the exec() call is commented out, so $output stays empty, and dd() sits before
 * the return (presumably Symfony's dump-and-die helper — confirm); as written this looks like
 * work-in-progress debugging code.
 * NOTE(review): $url and both selectors are interpolated into the shell command without
 * escapeshellarg(); escape them before re-enabling exec().
 *
 * @param string $url                Page URL passed as the first script argument.
 * @param string $imageSelector      Selector passed as the second script argument.
 * @param string $nextButtonSelector Selector passed as the third script argument.
 *
 * @return array Declared return type; json_decode() yields null on invalid JSON, which would
 *               violate it.
 */
private function fetchImagesUsingPuppeteer(string $url, string $imageSelector, string $nextButtonSelector): array
{
// Call the Puppeteer script with the required parameters
$output = [];
$command = sprintf('node puppeteer-script.js "%s" "%s" "%s" 2>&1', $url, $imageSelector, $nextButtonSelector); // Redirect stderr to stdout
dump($command);
// exec($command, $output, $return_var);
// Update the page data to include the URL of the locally stored image
// NOTE(review): $pageData, $mangaTitle, $chapterNumber and $imageName are not defined in this
// method; these lines look like leftovers from another method — confirm.
$pageData[count($pageData) - 1]['local_image_url'] = sprintf('/manga-images/%s/%s/%s', $mangaTitle, $chapterNumber, $imageName);
$pageData[count($pageData) - 1]['page_number'] = count($pageData);
dd($command, $output);
} while ($currentPageUrl);
// Convert the JSON output into a PHP array
return json_decode(implode("", $output), true);
}
$event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData);
$this->eventDispatcher->dispatch($event, MangaScrapedEvent::NAME);
/**
 * Scrapes a chapter by walking its HTML pages: every page is fetched, its images are
 * downloaded into {projectDir}{IMG_BASE_DIR}/{manga title}/{chapter number}/, and the
 * next-page link is followed until none is left. A MangaScrapedEvent carrying the page data
 * is dispatched at the end.
 *
 * @return array|bool One entry per saved image with 'image_url', 'local_image_url' and
 *                    'page_number'.
 *
 * @throws GuzzleException
 */
private function scrapeChapterHtml(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
{
$chapterUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapter->getNumber());
// NOTE(review): the next two lines return before $pageData exists and close a method body;
// they look like leftovers from another method — confirm.
return $pageData;
}
$pageData = [];
$currentPageUrl = $chapterUrl;
$mangaTitle = $manga->getTitle();
$chapterNumber = $chapter->getNumber();
// Make sure the manga directory, then the chapter directory, exist.
$mangaDir = sprintf('%s/%s', $this->projectDir . self::IMG_BASE_DIR, $mangaTitle);
if (!is_dir($mangaDir)) {
mkdir($mangaDir, 0755, true);
}
$chapterDir = sprintf('%s/%s', $mangaDir, $chapterNumber);
if (!is_dir($chapterDir)) {
mkdir($chapterDir, 0755, true);
}
do {
$html = $this->fetchHtml($currentPageUrl);
$page = $this->extractMangaPageData($html, $mangaSource);
foreach ($page['image_urls'] as $imgUrl) {
// NOTE(review): debugging output; base64_decode() is applied to a URL here, apparently to
// inspect it — remove or confirm before release.
dump($imgUrl);
dump(base64_decode($imgUrl));
// Determine the image extension
$imageExtension = pathinfo(parse_url($imgUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
// Build the image file name
// Files are numbered by the count of images saved so far, zero-padded to three digits.
$imageName = sprintf('%03d.%s', count($pageData) + 1, $imageExtension);
$imagePath = sprintf('%s/%s', $chapterDir, $imageName);
$this->downloadAndSaveImage($imgUrl, $imagePath);
$pageData[] = [
'image_url' => $imgUrl,
'local_image_url' => sprintf('/manga-images/%s/%s/%s', $mangaTitle, $chapterNumber, $imageName),
'page_number' => count($pageData) + 1,
];
}
// If more than one image was found, do not look for the next page
if (count($page['image_urls']) > 1) {
break;
}
$currentPageUrl = $page['next_page_url'];
} while ($currentPageUrl);
$event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData);
$this->eventDispatcher->dispatch($event, MangaScrapedEvent::NAME);
return $pageData;
}
/**
 * Fetches $url with a fresh Guzzle client and returns the response body as a string.
 *
 * NOTE(review): the body statements appear twice in this span — confirm which copy is live.
 *
 * @throws GuzzleException
 */
private function fetchHtml(string $url): string
{
$client = new Client();
$response = $client->get($url);
{
$client = new Client();
$response = $client->get($url);
return (string) $response->getBody();
}
return (string)$response->getBody();
}
/**
 * Downloads $imageUrl with a fresh Guzzle client and writes the body to $destinationPath.
 *
 * NOTE(review): the body statements appear twice in this span — confirm which copy is live.
 * The result of file_put_contents() is not checked.
 *
 * @throws GuzzleException
 */
private function downloadAndSaveImage(string $imageUrl, string $destinationPath): void
{
$client = new Client();
$response = $client->get($imageUrl);
{
$client = new Client();
$response = $client->get($imageUrl);
file_put_contents($destinationPath, $response->getBody()->getContents());
}
file_put_contents($destinationPath, $response->getBody()->getContents());
}
/**
 * Downloads every image of a chapter into the chapter's directory and announces the result.
 *
 * Files are stored under {projectDir}{IMG_BASE_DIR}/{manga title}/{chapter number}/ and are
 * named after their 1-based position, zero-padded to three digits. Once all images are
 * saved, a MangaScrapedEvent carrying the page data is dispatched.
 *
 * @param Manga   $manga   Manga whose title names the first directory level.
 * @param Chapter $chapter Chapter whose number names the second directory level.
 * @param array   $imgUrls Image URLs in reading order; array keys are ignored.
 *
 * @return array One entry per image with 'image_url', 'local_image_url' and 'page_number'.
 *
 * @throws \RuntimeException When the chapter directory cannot be created.
 * @throws GuzzleException   When an image download fails.
 */
private function saveChapterImages(Manga $manga, Chapter $chapter, array $imgUrls): array
{
    $mangaTitle = $manga->getTitle();
    $chapterNumber = $chapter->getNumber();

    // A recursive mkdir creates the manga directory along the way. The second is_dir() check
    // covers another process creating the directory between the test and the call.
    $chapterDir = sprintf('%s/%s/%s', $this->projectDir . self::IMG_BASE_DIR, $mangaTitle, $chapterNumber);
    if (!is_dir($chapterDir) && !mkdir($chapterDir, 0755, true) && !is_dir($chapterDir)) {
        throw new \RuntimeException(sprintf('Unable to create directory "%s".', $chapterDir));
    }

    $pageData = [];

    // array_values() guarantees 0..n-1 keys, so page numbers stay sequential even when the
    // caller passes a filtered or string-keyed array.
    foreach (array_values($imgUrls) as $index => $imgUrl) {
        $pageNumber = $index + 1;

        // parse_url() returns null/false when the URL has no usable path; an empty extension
        // would produce a file named "001.", so fall back to "jpg" in both cases.
        $urlPath = parse_url($imgUrl, PHP_URL_PATH);
        $extension = is_string($urlPath) ? pathinfo($urlPath, PATHINFO_EXTENSION) : '';
        if ($extension === '') {
            $extension = 'jpg';
        }

        $imageName = sprintf('%03d.%s', $pageNumber, $extension);
        $this->downloadAndSaveImage($imgUrl, sprintf('%s/%s', $chapterDir, $imageName));

        $pageData[] = [
            'image_url' => $imgUrl,
            'local_image_url' => sprintf('/manga-images/%s/%s/%s', $mangaTitle, $chapterNumber, $imageName),
            'page_number' => $pageNumber,
        ];
    }

    $event = new MangaScrapedEvent($mangaTitle, $chapterNumber, $pageData);
    $this->eventDispatcher->dispatch($event, MangaScrapedEvent::NAME);

    return $pageData;
}
/**
 * Tells whether the page at $chapterUrl belongs to chapter $chapterNumber: the page's
 * next-page link is matched against the route /scan-{manga}/{chapter}/{page} and its
 * {chapter} segment, cast to float, is compared with $chapterNumber.
 *
 * NOTE(review): two variants of this method are interleaved in this span (a fixed
 * a[title="Suivant"] selector versus the selector configured on ContentSource) — confirm
 * which lines are meant to be live.
 * NOTE(review): the route pattern is hard-coded in both variants, and UrlMatcher::match()
 * presumably throws when the path does not fit it — confirm how other sources are handled.
 *
 * @return bool false when the page has no next-page link or the chapter segment differs.
 *
 * @throws GuzzleException
 */
private function isChapterAvailable(string $chapterUrl, float $chapterNumber): bool
{
$html = $this->fetchHtml($chapterUrl);
$crawler = new Crawler($html);
$nextLink = $crawler->filter('a[title="Suivant"]');
private function isChapterAvailable(string $chapterUrl, float $chapterNumber, ContentSource $mangaSource): bool
{
$html = $this->fetchHtml($chapterUrl);
$crawler = new Crawler($html);
$nextLink = $crawler->filter($mangaSource->getNextPageSelector());
if($nextLink->count() === 0){
return false;
}else{
$nextUrl = $nextLink->attr('href');
}
// No next-page link: the chapter is reported as unavailable.
if ($nextLink->count() === 0) {
return false;
}
$routeCollection = new RouteCollection();
$routeCollection->add('manga_chapter', new Route('/scan-{manga}/{chapter}/{page}'));
$context = new RequestContext('/');
$matcher = new UrlMatcher($routeCollection, $context);
$path = parse_url($nextUrl, PHP_URL_PATH);
$parameters = $matcher->match($path);
$nextUrl = $nextLink->attr('href');
// Match only the path part of the next-page URL against the chapter route.
$routeCollection = new RouteCollection();
$routeCollection->add('manga_chapter', new Route('/scan-{manga}/{chapter}/{page}'));
$context = new RequestContext('/');
$matcher = new UrlMatcher($routeCollection, $context);
$path = parse_url($nextUrl, PHP_URL_PATH);
$parameters = $matcher->match($path);
if((float) $parameters['chapter'] !== $chapterNumber){
return false;
}
return true;
}
// Strict comparison of two floats: the route segment is cast before comparing.
return (float)$parameters['chapter'] === $chapterNumber;
}
}