Added:
- Refactor MangaScraperService (not used everywhere now)
- Added JavascriptScraper.php
- Added alternative slugs in Manga.php
- Improvement in manga edit form
This commit is contained in:
188
src/Service/Scraper/JavascriptScraper.php
Normal file
188
src/Service/Scraper/JavascriptScraper.php
Normal file
@@ -0,0 +1,188 @@
|
||||
<?php
|
||||
|
||||
namespace App\Service\Scraper;
|
||||
|
||||
use App\Entity\Chapter;
|
||||
use App\Entity\ContentSource;
|
||||
use Exception;
|
||||
use Symfony\Component\Panther\Client as PantherClient;
|
||||
|
||||
class JavascriptScraper extends AbstractScraper
|
||||
{
|
||||
/**
 * Scrapes a chapter with a Chrome-driven Panther client and packages its pages as a CBZ.
 *
 * The chapter page is opened, the chapter is selected in the source's drop-down
 * (when a chapter selector is configured), the page images are downloaded into a
 * temporary directory, archived, and the archive path is stored on the chapter.
 *
 * @return array|bool The page entries (each with 'image_url' and 'local_image_url',
 *                    plus whatever the reader-specific scraper added) on success,
 *                    or false when scraping, downloading or archiving fails.
 *
 * @throws Exception When no valid chapter URL can be resolved, or when the initial
 *                   page request itself fails.
 */
public function scrapeChapter(Chapter $chapter, ContentSource $contentSource): array|bool
{
    $manga = $chapter->getManga();
    $chapterUrl = $this->getValidChapterUrl($contentSource, $manga, $chapter->getNumber());

    // Validate the URL before creating the client so the early throw leaves nothing to release.
    if (!$chapterUrl) {
        throw new Exception("Aucune URL valide trouvée pour le chapitre {$chapter->getNumber()} du manga {$manga->getTitle()}");
    }

    $pantherClient = PantherClient::createChromeClient();
    $tempDir = null;

    try {
        // Kept outside the inner try so navigation errors still propagate to the caller,
        // while the outer finally guarantees the browser is closed.
        $pantherClient->request('GET', $chapterUrl);

        try {
            $this->selectChapter($pantherClient, $chapter, $contentSource);

            $pageData = $contentSource->getNextPageSelector() === null
                ? $this->scrapeVerticalReaderJavascript($pantherClient, $contentSource, $chapter)
                : $this->scrapeHorizontalReaderJavascript($pantherClient, $contentSource, $chapter);

            $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
            if (!mkdir($tempDir) && !is_dir($tempDir)) {
                throw new Exception("Unable to create temporary directory {$tempDir}");
            }

            // Download every page image. Entries are updated by key rather than by
            // reference so no dangling reference outlives the loop.
            foreach ($pageData as $index => $page) {
                $extension = pathinfo((string) parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION);
                $imagePath = $tempDir . '/' . sprintf('%03d.%s', $index + 1, $extension);

                $imageContent = file_get_contents($page['image_url']);
                if ($imageContent === false) {
                    // Without this check a failed download would be archived as an empty page.
                    throw new Exception("Unable to download image {$page['image_url']}");
                }

                if (file_put_contents($imagePath, $imageContent) === false) {
                    throw new Exception("Unable to write image to {$imagePath}");
                }

                $pageData[$index]['local_image_url'] = $imagePath;
            }

            $cbzFilePath = $this->generateCbzPath($manga, $chapter);
            $this->createCbzFile($tempDir, $pageData, $cbzFilePath);

            $chapter->setCbzPath($cbzFilePath);
            $this->entityManager->persist($chapter);
            $this->entityManager->flush();

            return $pageData;
        } catch (Exception $e) {
            // Log the error
            return false;
        }
    } finally {
        try {
            // Remove the temporary directory on failure paths as well as on success.
            if ($tempDir !== null && is_dir($tempDir)) {
                $this->cleanupTempFiles($tempDir);
            }
        } finally {
            $pantherClient->close();
        }
    }
}
|
||||
|
||||
/**
 * Dry-runs the scraper against one chapter of a source, without downloading
 * images or persisting anything.
 *
 * A transient Chapter carrying only the requested number is built so the same
 * chapter-selection and reader-scraping code paths as a real scrape are used.
 *
 * @return array The page entries produced by the reader-specific scraper.
 *
 * @throws Exception When the chapter URL is not valid, or when loading or scraping the page fails.
 */
public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
{
    $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber);

    if (!$this->isChapterUrlValid($chapterUrl)) {
        throw new Exception("Invalid URL, check format and slug");
    }

    $pantherClient = PantherClient::createChromeClient();

    try {
        // Inside the try so the browser is closed even when navigation fails.
        $pantherClient->request('GET', $chapterUrl);

        $chapter = new Chapter();
        $chapter->setNumber((float) $chapterNumber);

        $this->selectChapter($pantherClient, $chapter, $contentSource);

        return $contentSource->getNextPageSelector() === null
            ? $this->scrapeVerticalReaderJavascript($pantherClient, $contentSource, $chapter)
            : $this->scrapeHorizontalReaderJavascript($pantherClient, $contentSource, $chapter);
    } finally {
        // No catch block: exceptions propagate unchanged to the caller.
        $pantherClient->close();
    }
}
|
||||
|
||||
/**
 * Tells whether this scraper handles the given scraping type.
 *
 * @return bool True only for the exact, case-sensitive type 'javascript'.
 */
public function supports(string $scrapingType): bool
{
    return 'javascript' === $scrapingType;
}
|
||||
|
||||
/**
 * Selects the requested chapter in the reader's chapter drop-down, if the source defines one.
 *
 * Returns immediately when the content source has no chapter selector. Otherwise it
 * waits for the drop-down, picks the first option whose text contains the chapter
 * number as a standalone number, selects it through JavaScript and waits for the
 * page images to settle.
 *
 * @throws Exception When no option of the drop-down matches the chapter number.
 */
private function selectChapter(PantherClient $pantherClient, Chapter $chapter, ContentSource $contentSource): void
{
    $chapterSelector = $contentSource->getChapterSelector();
    if (!$chapterSelector) {
        return; // No selector configured: nothing to select.
    }

    $crawler = $pantherClient->waitFor($chapterSelector);
    $select = $crawler->filter($chapterSelector);

    if ($select->count() === 0) {
        return;
    }

    $chapterNumber = $chapter->getNumber();

    // The number is quoted so the "." of e.g. 10.5 is literal, and the lookarounds keep
    // it from matching inside a longer number: 1 must not match "10", "11" or "1.5",
    // and 5 must not match "10.5".
    $pattern = '/(?<!\d)(?<!\d[.,])' . preg_quote((string) $chapterNumber, '/') . '(?![.,]?\d)/';

    $targetIndex = null;
    foreach ($select->filter('option') as $index => $option) {
        if (preg_match($pattern, $option->getText()) === 1) {
            $targetIndex = $index;
            break;
        }
    }

    if ($targetIndex === null) {
        throw new Exception("Chapitre $chapterNumber non trouvé dans le menu déroulant");
    }

    // The selector and index are passed as script arguments instead of being
    // interpolated, so selectors containing quotes cannot break the script.
    // The event bubbles, as a native "change" does, so delegated listeners see it.
    $pantherClient->executeScript(
        <<<'JS'
        var select = document.querySelector(arguments[0]);
        select.selectedIndex = arguments[1];
        select.dispatchEvent(new Event('change', { bubbles: true }));
        JS,
        [$chapterSelector, $targetIndex],
    );

    $this->waitForImagesLoaded($pantherClient, $contentSource);
}
|
||||
|
||||
/**
 * Blocks until the images matched by the source's image selector have stopped loading.
 *
 * An in-page script polls every 200 ms and counts the matched images that are fully
 * loaded (complete, with a non-zero natural width). It resolves once that count has
 * stayed unchanged for 10 consecutive checks. The surrounding wait is created with
 * a value of 30.
 *
 * NOTE(review): the count starts at 0, so a page on which no image ever loads is also
 * reported as stable. Confirm whether that is intended.
 * NOTE(review): this relies on the driver awaiting the Promise returned by the script.
 */
private function waitForImagesLoaded(PantherClient $pantherClient, ContentSource $contentSource): void
{
    $imageSelector = $contentSource->getImageSelector();

    $pantherClient->wait(30)->until(
        static function ($driver) use ($imageSelector) {
            // The selector is passed as a script argument rather than interpolated into
            // the JavaScript source, so selectors containing quotes cannot break the script.
            return $driver->executeScript(
                <<<'JS'
                const selector = arguments[0];

                return new Promise((resolve) => {
                    let lastImageCount = 0;
                    let stableCount = 0;
                    const stableThreshold = 10;

                    function checkImages() {
                        const images = document.querySelectorAll(selector);
                        const loadedImages = Array.from(images).filter(img => img.complete && img.naturalWidth > 0);

                        if (loadedImages.length === lastImageCount) {
                            stableCount++;
                        } else {
                            stableCount = 0;
                            lastImageCount = loadedImages.length;
                        }

                        if (stableCount >= stableThreshold) {
                            resolve(true);
                        } else {
                            setTimeout(checkImages, 200);
                        }
                    }

                    checkImages();
                });
                JS,
                [$imageSelector],
            );
        }
    );
}
|
||||
|
||||
/**
 * Collects the page images of a vertical reader, where all pages are on one document.
 *
 * Waits for the source's image selector, then records one entry per matched image.
 * The URL comes from the "src" attribute, falling back to "data-src" when "src" is
 * empty, and is passed through cleanImageUrl().
 *
 * @return array Entries with 'image_url' and a 1-based 'page_number', in iteration order.
 */
private function scrapeVerticalReaderJavascript(PantherClient $pantherClient, ContentSource $contentSource, Chapter $chapter): array
{
    $selector = $contentSource->getImageSelector();
    $imageElements = $pantherClient->waitFor($selector)->filter($selector);

    $pages = [];

    foreach ($imageElements as $position => $element) {
        $source = $element->getAttribute('src');
        if (!$source) {
            $source = $element->getAttribute('data-src');
        }

        $pages[] = [
            'image_url' => $this->cleanImageUrl($source),
            'page_number' => $position + 1,
        ];
    }

    return $pages;
}
|
||||
|
||||
/**
 * Placeholder for readers that show one page at a time behind a "next page" control.
 *
 * Not implemented: no page is visited and an empty list is always returned.
 *
 * @return array Always an empty array.
 */
private function scrapeHorizontalReaderJavascript(PantherClient $pantherClient, ContentSource $contentSource, Chapter $chapter): array
{
    return [];
}
|
||||
}
|
||||
Reference in New Issue
Block a user