Added:
- Refactor MangaScraperService (not used everywhere now) - Added JavascriptScraper.php - Added alternative slugs in Manga.php - Improvement in manga edit form
This commit is contained in:
@@ -6,8 +6,12 @@ use App\Entity\Chapter;
|
||||
use App\Entity\Manga;
|
||||
use App\Entity\ContentSource;
|
||||
use App\Event\PageScrappingProgressEvent;
|
||||
use App\Repository\ChapterRepository;
|
||||
use App\Repository\MangaRepository;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
use Exception;
|
||||
use Facebook\WebDriver\Remote\RemoteWebElement;
|
||||
use Facebook\WebDriver\WebDriverExpectedCondition;
|
||||
use GuzzleHttp\Client;
|
||||
use GuzzleHttp\Exception\GuzzleException;
|
||||
use GuzzleHttp\Exception\RequestException;
|
||||
@@ -18,6 +22,8 @@ use Symfony\Component\Routing\Route;
|
||||
use Symfony\Component\Routing\RouteCollection;
|
||||
use Symfony\Contracts\EventDispatcher\EventDispatcherInterface;
|
||||
|
||||
use Symfony\Component\Panther\Client as PantherClient;
|
||||
|
||||
class MangaScraperService
|
||||
{
|
||||
const string PUBLIC_CBZ = '/public/cbz';
|
||||
@@ -25,7 +31,8 @@ class MangaScraperService
|
||||
/**
 * Wires the collaborators used by the scraper.
 *
 * @param string                   $projectDir      Injected project directory
 *                                                  (presumably the base for PUBLIC_CBZ — confirm against the service config).
 * @param EventDispatcherInterface $eventDispatcher Dispatches PageScrappingProgressEvent while pages are collected.
 * @param EntityManagerInterface   $entityManager   Doctrine entity manager.
 * @param MangaRepository          $mangaRepository Looks mangas up by slug for the test-scraping entry points.
 */
public function __construct(
    private readonly string $projectDir,
    private readonly EventDispatcherInterface $eventDispatcher,
    private readonly EntityManagerInterface $entityManager,
    private readonly MangaRepository $mangaRepository,
) {
}
|
||||
@@ -140,12 +147,160 @@ class MangaScraperService
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Scrapes one chapter from a JavaScript-rendered reader through a Chrome Panther client.
 *
 * The chapter page is opened, the chapter is selected in the "#selectChapitres"
 * dropdown when the page has one, and the pages are then read with the vertical
 * reader strategy (no next-page selector configured) or the horizontal one.
 *
 * @param Manga         $manga       Manga whose slug is used to build the chapter URL.
 * @param Chapter       $chapter     Chapter to scrape.
 * @param ContentSource $mangaSource Source providing the chapter URL and the CSS selectors.
 *
 * @return array|bool List of ['image_url' => ..., 'page_number' => int] entries,
 *                    or false when the chapter could not be selected in the dropdown.
 *
 * @throws \Exception When reading the pages fails after the chapter was selected.
 */
private function scrapeChapterJavascript(Manga $manga, Chapter $chapter, ContentSource $mangaSource): array|bool
{
    $pantherClient = PantherClient::createChromeClient();

    // A single try/finally guarantees the browser window is released on every
    // path, including a failing initial request.
    try {
        $chapterUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapter->getNumber());
        $pantherClient->request('GET', $chapterUrl);

        try {
            $this->selectChapterInDropdown($pantherClient, $chapter);
        } catch (\Exception $e) {
            // TODO: log the selection failure once a logger is injected into this service.
            return false;
        }

        if ($mangaSource->getNextPageSelector() === null) {
            // Vertical reader: every page image is present on the same document.
            return $this->scrapeVerticalReaderJavascript($pantherClient, $mangaSource, $chapter);
        }

        // Horizontal reader: one image per page, navigated with a "next" button.
        return $this->scrapeHorizontalReaderJavascript($pantherClient, $mangaSource, $chapter);
    } finally {
        // NOTE(review): WebDriver close() closes the current window; confirm whether quit() is intended here.
        $pantherClient->close();
    }
}

/**
 * Selects the chapter in the "#selectChapitres" dropdown and waits for its scans to load.
 *
 * Does nothing when the page has no such dropdown.
 *
 * @throws \Exception When no dropdown option matches the chapter number, or when waiting for the scans fails.
 */
private function selectChapterInDropdown(PantherClient $pantherClient, Chapter $chapter): void
{
    $crawler = $pantherClient->waitFor('body');
    $select = $crawler->filter('#selectChapitres');

    if ($select->count() === 0) {
        return;
    }

    $chapterNumber = $chapter->getNumber();
    // Quote the number so that characters such as "." in "10.5" are matched literally.
    $pattern = '/\b' . preg_quote((string) $chapterNumber, '/') . '\b/';
    $targetIndex = null;

    /** @var RemoteWebElement $option */
    foreach ($select->filter('option')->getIterator() as $index => $option) {
        if (preg_match($pattern, $option->getText()) === 1) {
            $targetIndex = $index;
            break;
        }
    }

    if ($targetIndex === null) {
        throw new \Exception("Chapitre $chapterNumber non trouvé dans le menu déroulant");
    }

    // Pass the index as a script argument instead of interpolating it into the JavaScript source.
    $pantherClient->executeScript(
        <<<'JS'
        var select = document.querySelector('#selectChapitres');
        select.selectedIndex = arguments[0];
        select.dispatchEvent(new Event('change'));
        JS,
        [(int) $targetIndex],
    );

    // Wait (up to 60 seconds; wait() takes seconds, not milliseconds) until the
    // page has been updated after the selection.
    $pantherClient->wait(60)->until(
        static function ($driver) {
            // True once every lazy image is loaded and the loading GIF is gone.
            return $driver->executeScript(
                <<<'JS'
                var scansPlacement = document.querySelector('#scansPlacement');
                if (!scansPlacement) return false;

                var lazyImages = scansPlacement.querySelectorAll('img.lazy');
                var loadingGif = scansPlacement.querySelector('img[src*="loading_scans.gif"]');

                var allImagesLoaded = Array.from(lazyImages).every(img => img.complete && img.naturalWidth > 0);

                return lazyImages.length > 0 && allImagesLoaded && !loadingGif;
                JS
            );
        }
    );
}
|
||||
|
||||
/**
 * Collects every page image of a vertical reader, where all pages sit on one document.
 *
 * Waits for the source's image selector, then records one entry per matched
 * image (its "src", or "data-src" when "src" is empty) and dispatches a
 * PageScrappingProgressEvent after each one.
 *
 * @return array List of ['image_url' => ..., 'page_number' => int] entries, numbered from 1.
 */
private function scrapeVerticalReaderJavascript(PantherClient $pantherClient, ContentSource $mangaSource, Chapter $chapter): array
{
    $pageCrawler = $pantherClient->waitFor($mangaSource->getImageSelector());
    $imageNodes = $pageCrawler->filter($mangaSource->getImageSelector());

    $collected = [];
    $currentPage = 0;

    foreach ($imageNodes->getIterator() as $imageNode) {
        ++$currentPage;

        // Lazy-loaded images may expose their URL through "data-src" only.
        $source = $imageNode->getAttribute('src');
        if (!$source) {
            $source = $imageNode->getAttribute('data-src');
        }

        $collected[] = [
            'image_url' => $this->cleanImageUrl($source),
            'page_number' => $currentPage,
        ];

        $this->eventDispatcher->dispatch(
            new PageScrappingProgressEvent($chapter->getId(), $currentPage, $imageNodes->count()),
            PageScrappingProgressEvent::NAME
        );
    }

    return $collected;
}
|
||||
|
||||
/**
 * Collects the page images of a horizontal reader, which shows one image per page.
 *
 * Reads the first image matching the source's image selector, dispatches a
 * PageScrappingProgressEvent, clicks the next-page element and repeats until
 * either no image or no next-page element is found.
 *
 * @return array List of ['image_url' => ..., 'page_number' => int] entries, numbered from 1.
 *
 * @throws \Exception Propagated from the Panther client, e.g. when waiting for the image times out.
 */
private function scrapeHorizontalReaderJavascript(PantherClient $pantherClient, ContentSource $mangaSource, Chapter $chapter): array
{
    $imageSelector = $mangaSource->getImageSelector();
    $nextPageSelector = $mangaSource->getNextPageSelector();

    $pageData = [];
    $pageNumber = 1;

    $crawler = $pantherClient->waitFor($imageSelector);

    // NOTE(review): if the reader keeps a clickable "next" element on its last
    // page, this loop has no other stop condition — confirm against the sources.
    while (true) {
        $imageElement = $crawler->filter($imageSelector)->first();
        if ($imageElement->count() === 0) {
            break; // End of the chapter.
        }

        $imageUrl = $imageElement->attr('src') ?: $imageElement->attr('data-src');

        $pageData[] = [
            'image_url' => $this->cleanImageUrl($imageUrl),
            'page_number' => $pageNumber,
        ];

        // The total page count is unknown in this mode, hence 0.
        $event = new PageScrappingProgressEvent($chapter->getId(), $pageNumber, 0);
        $this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);

        // Look the next-page element up on the crawler of the page currently shown.
        $nextButton = $crawler->filter($nextPageSelector);
        if ($nextButton->count() === 0) {
            break; // No next button: end of the chapter.
        }

        $nextButton->click();

        // Wait for the image after the click and continue with the fresh crawler.
        $crawler = $pantherClient->waitFor($imageSelector, 10);

        $pageNumber++;
    }

    return $pageData;
}
|
||||
|
||||
private function fetchImagesUsingPuppeteer(string $url, string $imageSelector, string $nextButtonSelector): array
|
||||
@@ -162,6 +317,26 @@ class MangaScraperService
|
||||
return json_decode(implode("", $output), true);
|
||||
}
|
||||
|
||||
/**
 * Runs a test scrape using the strategy that matches the content source's scraping type.
 *
 * @throws Exception When the scraping type is neither 'html' nor 'javascript'.
 */
public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
{
    $scrapingType = $contentSource->getScrapingType();

    if ($scrapingType === 'html') {
        return $this->testScrapingHtml($mangaSlug, $chapterNumber, $contentSource);
    }

    if ($scrapingType === 'javascript') {
        return $this->testScrapingJavascript($mangaSlug, $chapterNumber, $contentSource);
    }

    throw new Exception('Unsupported scraping type: ' . $scrapingType);
}
|
||||
|
||||
/**
 * Runs a test scrape of one chapter through the JavaScript (Panther) scraper.
 *
 * The manga is looked up by slug and the chapter by number before scraping.
 *
 * @return array Page entries as returned by scrapeChapterJavascript().
 *
 * @throws Exception When the manga or the chapter cannot be found, or when the scraper reports a failure.
 */
public function testScrapingJavascript(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
{
    $manga = $this->mangaRepository->findOneBy(['slug' => $mangaSlug]);
    if (!$manga instanceof Manga) {
        throw new Exception(sprintf('Manga with slug "%s" not found.', $mangaSlug));
    }

    $chapter = $manga->getChapterByNumber($chapterNumber);
    if (!$chapter instanceof Chapter) {
        throw new Exception(sprintf('Chapter "%s" not found for manga "%s".', $chapterNumber, $mangaSlug));
    }

    // The scraper returns false when the chapter could not be selected; turn that
    // into the documented exception instead of a TypeError on the array return type.
    $pageData = $this->scrapeChapterJavascript($manga, $chapter, $contentSource);
    if (!is_array($pageData)) {
        throw new Exception(sprintf('Scraping chapter "%s" of manga "%s" failed.', $chapterNumber, $mangaSlug));
    }

    return $pageData;
}
|
||||
|
||||
/**
|
||||
* @throws GuzzleException
|
||||
*/
|
||||
@@ -231,9 +406,9 @@ class MangaScraperService
|
||||
|
||||
$pageData = [];
|
||||
foreach ($images as $index => $image) {
|
||||
if($image->getAttribute('src') === ''){
|
||||
if ($image->getAttribute('src') === '') {
|
||||
$imgUrl = $image->getAttribute('data-src');
|
||||
}else{
|
||||
} else {
|
||||
$imgUrl = $image->getAttribute('src');
|
||||
}
|
||||
$pageData[] = [
|
||||
|
||||
Reference in New Issue
Block a user