Files
Mangarr/src/Service/Scraper/JavascriptScraper.php
ext.jeremy.guillot@maxicoffee.domains 5f15d14ae1 Convertion des images webp et png vers jpeg
2024-09-30 22:16:20 +02:00

191 lines
6.8 KiB
PHP

<?php
namespace App\Service\Scraper;
use App\Entity\Chapter;
use App\Entity\ContentSource;
use Exception;
use GuzzleHttp\Exception\GuzzleException;
use Symfony\Component\Panther\Client as PantherClient;
/**
 * Scraper for content sources whose reader is rendered by JavaScript.
 *
 * Pages are loaded in a Chrome instance driven by Symfony Panther so that
 * images injected by client-side scripts are present in the DOM before their
 * URLs are collected.
 */
class JavascriptScraper extends AbstractScraper
{
    /**
     * Scrapes a chapter, downloads its images and packs them into a CBZ file.
     *
     * On success the CBZ path is stored on the chapter and flushed through the
     * entity manager. The browser session and the temporary download directory
     * are released whether the scrape succeeds or fails.
     *
     * @return array|bool List of pages, each with 'image_url', 'page_number' and
     *                    'local_image_url' keys. This implementation never
     *                    returns a bool; failures are reported by exception.
     *
     * @throws Exception When no valid chapter URL exists, when no page could be
     *                   scraped, or when the temporary directory cannot be created.
     */
    public function scrapeChapter(Chapter $chapter, ContentSource $contentSource): array|bool
    {
        $manga = $chapter->getManga();
        $chapterUrl = $this->getValidChapterUrl($contentSource, $manga, $chapter->getNumber());
        if (!$chapterUrl) {
            throw new Exception("Aucune URL valide trouvée pour le chapitre {$chapter->getNumber()} du manga {$manga->getTitle()}");
        }

        $pantherClient = PantherClient::createChromeClient();
        $tempDir = null;

        try {
            // Navigation is inside the try so a failed page load still tears the browser down.
            $pantherClient->request('GET', $chapterUrl);
            $this->selectChapter($pantherClient, $chapter, $contentSource);

            $pageData = $contentSource->getNextPageSelector() === null
                ? $this->scrapeVerticalReaderJavascript($pantherClient, $contentSource, $chapter)
                : $this->scrapeHorizontalReaderJavascript($pantherClient, $contentSource, $chapter);

            // Do not record a CBZ path for a chapter that yielded no page at all.
            if ($pageData === []) {
                throw new Exception("Aucune page trouvée pour le chapitre {$chapter->getNumber()} du manga {$manga->getTitle()}");
            }

            $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
            if (!mkdir($tempDir) && !is_dir($tempDir)) {
                throw new Exception("Impossible de créer le répertoire temporaire {$tempDir}");
            }

            // Download and save the images.
            $totalPages = count($pageData);
            foreach ($pageData as $index => $page) {
                // parse_url() may return null/false; fall back to "jpg" so the
                // file name never ends with a bare dot.
                $urlPath = (string) parse_url($page['image_url'], PHP_URL_PATH);
                $extension = pathinfo($urlPath, PATHINFO_EXTENSION) ?: 'jpg';

                $imagePath = $tempDir . '/' . sprintf('%03d.%s', $index + 1, $extension);
                $destinationPath = $this->downloadAndSaveImage($page['image_url'], $imagePath);
                $this->dispatchProgressEvent($chapter, $index + 1, $totalPages);

                // Write through the key instead of iterating by reference, which
                // would leave a dangling reference to the last element.
                $pageData[$index]['local_image_url'] = $destinationPath;
            }

            $cbzFilePath = $this->generateCbzPath($manga, $chapter);
            $this->createCbzFile($pageData, $cbzFilePath);

            $chapter->setCbzPath($cbzFilePath);
            $this->entityManager->persist($chapter);
            $this->entityManager->flush();

            return $pageData;
        } finally {
            try {
                // Remove downloaded files on failure as well as on success.
                if ($tempDir !== null && is_dir($tempDir)) {
                    $this->cleanupTempFiles($tempDir);
                }
            } finally {
                // quit() ends the whole browser session; close() would only
                // close the current window.
                $pantherClient->quit();
            }
        }
    }

    /**
     * Dry-run of the scraping configuration: collects page data for a chapter
     * without downloading images or persisting anything.
     *
     * @return array List of pages with 'image_url' and 'page_number' keys.
     *
     * @throws Exception When the chapter URL is invalid or the chapter cannot
     *                   be selected in the reader.
     */
    public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
    {
        $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber);
        if (!$this->isChapterUrlValid($chapterUrl)) {
            throw new Exception("Invalid URL, check format and slug");
        }

        $pantherClient = PantherClient::createChromeClient();

        try {
            $pantherClient->request('GET', $chapterUrl);

            // Transient chapter, used only to carry the number to the helpers.
            $chapter = new Chapter();
            $chapter->setNumber((float)$chapterNumber);
            $this->selectChapter($pantherClient, $chapter, $contentSource);

            return $contentSource->getNextPageSelector() === null
                ? $this->scrapeVerticalReaderJavascript($pantherClient, $contentSource, $chapter)
                : $this->scrapeHorizontalReaderJavascript($pantherClient, $contentSource, $chapter);
        } finally {
            $pantherClient->quit();
        }
    }

    /**
     * Tells whether this scraper handles the given scraping type.
     */
    public function supports(string $scrapingType): bool
    {
        return $scrapingType === 'javascript';
    }

    /**
     * Picks the requested chapter in the reader's chapter drop-down, if the
     * content source defines a chapter selector, then waits for the images.
     *
     * Does nothing when no selector is configured or when the selector matches
     * no element.
     *
     * @throws Exception When no option of the drop-down mentions the chapter number.
     */
    private function selectChapter(PantherClient $pantherClient, Chapter $chapter, ContentSource $contentSource): void
    {
        $chapterSelector = $contentSource->getChapterSelector();
        if (!$chapterSelector) {
            return;
        }

        $crawler = $pantherClient->waitFor($chapterSelector);
        $select = $crawler->filter($chapterSelector);
        if ($select->count() === 0) {
            return;
        }

        $chapterNumber = $chapter->getNumber();

        // The number is quoted so that the "." of "10.5" is literal, and the
        // look-arounds stop "1" from matching inside "1.5" (or "5" inside "1.5").
        $pattern = '/(?<!\d\.)\b' . preg_quote((string) $chapterNumber, '/') . '\b(?!\.\d)/';

        $targetIndex = null;
        foreach ($select->filter('option') as $index => $option) {
            if (preg_match($pattern, $option->getText()) === 1) {
                $targetIndex = $index;
                break;
            }
        }

        if ($targetIndex === null) {
            throw new Exception("Chapitre $chapterNumber non trouvé dans le menu déroulant");
        }

        // Selector and index are passed as script arguments rather than
        // interpolated, so selectors containing quotes cannot break the script.
        $script = <<<'JS'
            var select = document.querySelector(arguments[0]);
            select.selectedIndex = arguments[1];
            select.dispatchEvent(new Event('change'));
            JS;
        $pantherClient->executeScript($script, [$chapterSelector, $targetIndex]);

        $this->waitForImagesLoaded($pantherClient, $contentSource);
    }

    /**
     * Waits (up to the 30 second wait timeout) until the number of fully loaded
     * images matching the source's image selector stops changing.
     *
     * The in-page script polls every 200 ms and resolves once the loaded-image
     * count has been identical for 10 consecutive polls.
     */
    private function waitForImagesLoaded(PantherClient $pantherClient, ContentSource $contentSource): void
    {
        $imageSelector = $contentSource->getImageSelector();

        // The selector is read from arguments[0] at the top level of the script
        // because the nested functions have their own (or no) `arguments`.
        $script = <<<'JS'
            const imageSelector = arguments[0];
            return new Promise((resolve) => {
                let lastImageCount = 0;
                let stableCount = 0;
                const stableThreshold = 10;
                function checkImages() {
                    const images = document.querySelectorAll(imageSelector);
                    const loadedImages = Array.from(images).filter(img => img.complete && img.naturalWidth > 0);
                    if (loadedImages.length === lastImageCount) {
                        stableCount++;
                    } else {
                        stableCount = 0;
                        lastImageCount = loadedImages.length;
                    }
                    if (stableCount >= stableThreshold) {
                        resolve(true);
                    } else {
                        setTimeout(checkImages, 200);
                    }
                }
                checkImages();
            });
            JS;

        $pantherClient->wait(30)->until(
            static function ($driver) use ($script, $imageSelector) {
                return $driver->executeScript($script, [$imageSelector]);
            }
        );
    }

    /**
     * Collects the image URLs of a reader that shows every page at once.
     *
     * Each image's URL is taken from `src`, falling back to `data-src`.
     * Elements exposing neither attribute are skipped, and page numbers stay
     * contiguous over the images that were kept.
     *
     * @return array List of ['image_url' => string, 'page_number' => int].
     */
    private function scrapeVerticalReaderJavascript(PantherClient $pantherClient, ContentSource $contentSource, Chapter $chapter): array
    {
        $imageSelector = $contentSource->getImageSelector();
        $crawler = $pantherClient->waitFor($imageSelector);

        $pageData = [];
        foreach ($crawler->filter($imageSelector) as $image) {
            $imageUrl = $image->getAttribute('src') ?: $image->getAttribute('data-src');
            if (!$imageUrl) {
                // Nothing to download for this element.
                continue;
            }

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($imageUrl),
                'page_number' => count($pageData) + 1,
            ];
        }

        return $pageData;
    }

    /**
     * Placeholder for readers that show one page at a time behind a
     * "next page" control.
     *
     * TODO: not implemented yet; always returns an empty list.
     *
     * @return array Always empty for now.
     */
    private function scrapeHorizontalReaderJavascript(PantherClient $pantherClient, ContentSource $contentSource, Chapter $chapter): array
    {
        return [];
    }
}