- Refactor MangaScraperService (not used everywhere now)
- Added JavascriptScraper.php
- Added alternative slugs in Manga.php
- Improvement in manga edit form
This commit is contained in:
Jérémy Guillot
2024-07-21 19:08:46 +02:00
parent ff59aa5d77
commit fafff5014c
21 changed files with 1180 additions and 28 deletions

View File

@@ -6,8 +6,12 @@ use App\Entity\Chapter;
use App\Entity\Manga;
use App\Entity\ContentSource;
use App\Event\PageScrappingProgressEvent;
use App\Repository\ChapterRepository;
use App\Repository\MangaRepository;
use Doctrine\ORM\EntityManagerInterface;
use Exception;
use Facebook\WebDriver\Remote\RemoteWebElement;
use Facebook\WebDriver\WebDriverExpectedCondition;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use GuzzleHttp\Exception\RequestException;
@@ -18,6 +22,8 @@ use Symfony\Component\Routing\Route;
use Symfony\Component\Routing\RouteCollection;
use Symfony\Contracts\EventDispatcher\EventDispatcherInterface;
use Symfony\Component\Panther\Client as PantherClient;
class MangaScraperService
{
const string PUBLIC_CBZ = '/public/cbz';
@@ -25,7 +31,8 @@ class MangaScraperService
/**
 * @param string                   $projectDir      Injected directory path (presumably the project root used
 *                                                  to resolve paths such as PUBLIC_CBZ — confirm against the
 *                                                  service configuration).
 * @param EventDispatcherInterface $eventDispatcher Dispatches PageScrappingProgressEvent while pages are scraped.
 * @param EntityManagerInterface   $entityManager   Doctrine entity manager.
 * @param MangaRepository          $mangaRepository Used to look up a manga by slug when testing a content source.
 */
public function __construct(
    private readonly string $projectDir,
    private readonly EventDispatcherInterface $eventDispatcher,
    private readonly EntityManagerInterface $entityManager,
    private readonly MangaRepository $mangaRepository,
) {
}
@@ -140,12 +147,160 @@ class MangaScraperService
return true;
}
/**
 * Scrapes the page images of one chapter from a JavaScript-driven reader using a Chrome (Panther) client.
 *
 * When the page exposes a `#selectChapitres` dropdown, the option whose text contains the chapter number
 * is selected and the method waits for the lazy-loaded scans before collecting images. The reader is then
 * scraped as a vertical reader when the source has no next-page selector, and page by page otherwise.
 *
 * @return array|bool Page entries built by the reader-specific scraper, or false when the chapter
 *                    could not be selected in the dropdown.
 *
 * @throws \Exception When loading the chapter page or scraping the reader fails.
 */
private function scrapeChapterJavascript(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
{
    $pantherClient = PantherClient::createChromeClient();

    // A single finally guarantees the browser window is closed on every exit path,
    // including a failure while loading the chapter page.
    try {
        $chapterUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapter->getNumber());
        $pantherClient->request('GET', $chapterUrl);

        try {
            $this->selectChapterFromDropdown($pantherClient, $chapter);
        } catch (\Exception) {
            // Chapter selection is best-effort: callers receive false instead of an exception.
            // NOTE(review): no logger is injected in this service, so the cause is not recorded.
            return false;
        }

        if ($mangaSource->getNextPageSelector() === null) {
            // Vertical reader: every page is present on a single document.
            return $this->scrapeVerticalReaderJavascript($pantherClient, $mangaSource, $chapter);
        }

        // Horizontal reader: pages are reached one at a time through the "next" control.
        return $this->scrapeHorizontalReaderJavascript($pantherClient, $mangaSource, $chapter);
    } finally {
        $pantherClient->close();
    }
}

/**
 * Selects the requested chapter in the `#selectChapitres` dropdown, when the page has one, and waits
 * until the reader has finished loading its scans. Does nothing when the dropdown is absent.
 *
 * @throws \Exception When no option matches the chapter number or the scans do not load in time.
 */
private function selectChapterFromDropdown(PantherClient $pantherClient, Chapter $chapter): void
{
    $crawler = $pantherClient->waitFor('body');
    $select = $crawler->filter('#selectChapitres');
    if ($select->count() === 0) {
        return;
    }

    $chapterNumber = $chapter->getNumber();
    // Quote the number so that a decimal chapter such as "10.5" is matched literally.
    $pattern = '/\b' . preg_quote((string) $chapterNumber, '/') . '\b/';

    $targetIndex = null;
    /** @var RemoteWebElement $option */
    foreach ($select->filter('option')->getIterator() as $index => $option) {
        if (preg_match($pattern, $option->getText()) === 1) {
            $targetIndex = $index;
            break;
        }
    }

    if ($targetIndex === null) {
        throw new \Exception("Chapitre $chapterNumber non trouvé dans le menu déroulant");
    }

    $pantherClient->executeScript("
        var select = document.querySelector('#selectChapitres');
        select.selectedIndex = $targetIndex;
        select.dispatchEvent(new Event('change'));
    ");

    // Wait for the page to update after the selection. Panther's wait() takes its timeout in
    // seconds, so 60 means one minute.
    $pantherClient->wait(60)->until(
        static function ($driver) {
            return $driver->executeScript("
                var scansPlacement = document.querySelector('#scansPlacement');
                if (!scansPlacement) return false;
                var lazyImages = scansPlacement.querySelectorAll('img.lazy');
                var loadingGif = scansPlacement.querySelector('img[src*=\"loading_scans.gif\"]');
                // Vérifier que toutes les images lazy sont chargées et que le GIF de chargement n'est plus présent
                var allImagesLoaded = Array.from(lazyImages).every(img => img.complete && img.naturalWidth > 0);
                return lazyImages.length > 0 && allImagesLoaded && !loadingGif;
            ");
        }
    );
}
/**
 * Collects the images of a vertical reader, where all pages of the chapter are on one document.
 *
 * Waits for the source's image selector to be present, then reads each matched element's `src`
 * (falling back to `data-src` when `src` is empty) and dispatches a progress event per image.
 *
 * @return array List of ['image_url' => ..., 'page_number' => int] entries, numbered from 1 in document order.
 */
private function scrapeVerticalReaderJavascript(PantherClient $pantherClient, ContentSource $mangaSource, Chapter $chapter): array
{
    $images = $pantherClient
        ->waitFor($mangaSource->getImageSelector())
        ->filter($mangaSource->getImageSelector());

    $collected = [];
    $position = 0;

    foreach ($images->getIterator() as $imageNode) {
        ++$position;

        $source = $imageNode->getAttribute('src');
        if (!$source) {
            // Lazy-loaded images keep their real address in data-src.
            $source = $imageNode->getAttribute('data-src');
        }

        $collected[] = [
            'image_url' => $this->cleanImageUrl($source),
            'page_number' => $position,
        ];

        $this->eventDispatcher->dispatch(
            new PageScrappingProgressEvent($chapter->getId(), $position, $images->count()),
            PageScrappingProgressEvent::NAME
        );
    }

    return $collected;
}
/**
 * Collects the images of a horizontal reader, where each page is reached through a "next" control.
 *
 * On every iteration the current image is read (`src`, falling back to `data-src`), a progress event
 * is dispatched, and the next-page element is clicked. The loop ends when no image or no next-page
 * element is found.
 *
 * @return array List of ['image_url' => ..., 'page_number' => int] entries, numbered from 1.
 *
 * @throws \Exception Propagated from the Panther client, e.g. when a wait times out.
 */
private function scrapeHorizontalReaderJavascript(PantherClient $pantherClient, ContentSource $mangaSource, Chapter $chapter): array
{
    $imageSelector = $mangaSource->getImageSelector();
    $nextPageSelector = $mangaSource->getNextPageSelector();

    $pageData = [];
    $pageNumber = 1;

    while (true) {
        // waitFor() hands back a crawler for the current document, so each iteration works on fresh content.
        $crawler = $pantherClient->waitFor($imageSelector);
        $imageElement = $crawler->filter($imageSelector)->first();
        if ($imageElement->count() === 0) {
            break; // End of the chapter.
        }

        $imageUrl = $imageElement->attr('src') ?: $imageElement->attr('data-src');
        $pageData[] = [
            'image_url' => $this->cleanImageUrl($imageUrl),
            'page_number' => $pageNumber,
        ];

        // The total page count is not known up front for this reader type, hence 0.
        $event = new PageScrappingProgressEvent($chapter->getId(), $pageNumber, 0);
        $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);

        // Look up the next-page control on the crawler of the page just scraped.
        $nextButton = $crawler->filter($nextPageSelector);
        if ($nextButton->count() === 0) {
            break; // No next button: end of the chapter.
        }
        $nextButton->click();

        // Give the reader up to 10 seconds to show an image again after the click.
        $pantherClient->waitFor($imageSelector, 10);

        $pageNumber++;
    }

    return $pageData;
}
private function fetchImagesUsingPuppeteer(string $url, string $imageSelector, string $nextButtonSelector): array
@@ -162,6 +317,26 @@ class MangaScraperService
return json_decode(implode("", $output), true);
}
/**
 * Runs a trial scrape of one chapter with the scraper matching the content source's scraping type.
 *
 * @return array Result of the type-specific test scraper ('html' or 'javascript').
 *
 * @throws Exception When the content source declares any other scraping type.
 */
public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
{
    $scrapingType = $contentSource->getScrapingType();

    if ($scrapingType === 'html') {
        return $this->testScrapingHtml($mangaSlug, $chapterNumber, $contentSource);
    }

    if ($scrapingType === 'javascript') {
        return $this->testScrapingJavascript($mangaSlug, $chapterNumber, $contentSource);
    }

    throw new Exception('Unsupported scraping type: ' . $contentSource->getScrapingType());
}
/**
 * Runs the JavaScript scraper for one chapter of the manga identified by its slug.
 *
 * @return array Page entries returned by scrapeChapterJavascript().
 *
 * @throws Exception When the manga or the chapter cannot be found, or when the scraper
 *                   reports a failure instead of returning page data.
 */
public function testScrapingJavascript(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
{
    $manga = $this->mangaRepository->findOneBy(['slug' => $mangaSlug]);
    if (!$manga instanceof Manga) {
        throw new Exception(sprintf('Manga with slug "%s" not found', $mangaSlug));
    }

    $chapter = $manga->getChapterByNumber($chapterNumber);
    if (!$chapter instanceof Chapter) {
        throw new Exception(sprintf('Chapter %s not found for manga "%s"', $chapterNumber, $mangaSlug));
    }

    // scrapeChapterJavascript() is declared array|bool; turn a non-array result into an
    // explicit exception instead of letting the array return type raise a TypeError.
    $pageData = $this->scrapeChapterJavascript($manga, $chapter, $contentSource);
    if (!is_array($pageData)) {
        throw new Exception(sprintf('Scraping chapter %s of manga "%s" failed', $chapterNumber, $mangaSlug));
    }

    return $pageData;
}
/**
* @throws GuzzleException
*/
@@ -231,9 +406,9 @@ class MangaScraperService
$pageData = [];
foreach ($images as $index => $image) {
if($image->getAttribute('src') === ''){
if ($image->getAttribute('src') === '') {
$imgUrl = $image->getAttribute('data-src');
}else{
} else {
$imgUrl = $image->getAttribute('src');
}
$pageData[] = [