- ContentSource handling in message
- ContentSource list, add/update ui
- nextPageSelector and imageSelector can be null
- cleanup
This commit is contained in:
Jérémy Guillot
2024-06-30 20:47:27 +02:00
parent ba30d3102d
commit 3012adfee7
24 changed files with 762 additions and 707 deletions

View File

@@ -166,6 +166,21 @@ class MangaScraperService
return json_decode(implode("", $output), true);
}
/**
 * Scrapes the page list of a single chapter from the given content source.
 *
 * The reader type is chosen from the source configuration: a source without a
 * next-page selector is scraped as a vertical reader (one HTML document holding
 * every image), any other source is scraped as a horizontal (paginated) reader.
 *
 * @return array list of entries, each holding 'image_url' and 'page_number'
 *
 * @throws GuzzleException
 */
public function testScrapingHtml(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
{
    $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber);

    if ($contentSource->getNextPageSelector() !== null) {
        // The horizontal scraper fetches every page itself, starting with the
        // chapter URL, so fetching it here as well would request it twice.
        return $this->scrapeHorizontalReader($chapterUrl, $contentSource);
    }

    return $this->scrapeVerticalReader($this->fetchHtml($chapterUrl), $contentSource);
}
/**
* @throws GuzzleException
*/
@@ -173,32 +188,32 @@ class MangaScraperService
{
$chapterUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapter->getNumber());
$pageData = [];
$currentPageUrl = $chapterUrl;
$tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
mkdir($tempDir);
do {
$html = $this->fetchHtml($currentPageUrl);
$page = $this->extractMangaPageData($html, $mangaSource);
$pageData = [];
$imageName = sprintf('%03d.%s', count($pageData) + 1, pathinfo(parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION));
if ($mangaSource->getNextPageSelector() === null) {
// Lecteur vertical
$html = $this->fetchHtml($chapterUrl);
$pageData = $this->scrapeVerticalReader($html, $mangaSource);
} else {
// Lecteur horizontal (paginé)
$pageData = $this->scrapeHorizontalReader($chapterUrl, $mangaSource);
}
// Télécharger et sauvegarder les images
foreach ($pageData as $index => &$page) {
$imageName = sprintf('%03d.%s', $index + 1, pathinfo(parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION));
$imagePath = $tempDir . '/' . $imageName;
$this->downloadAndSaveImage($page['image_url'], $imagePath);
$event = new PageScrappingProgressEvent($chapter->getId(), count($pageData) + 1, 0);
$event = new PageScrappingProgressEvent($chapter->getId(), $index + 1, count($pageData));
$this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);
$pageData[] = [
'image_url' => $page['image_url'],
'local_image_url' => $imagePath,
'page_number' => count($pageData) + 1,
];
$currentPageUrl = $page['next_page_url'];
} while ($currentPageUrl);
$page['local_image_url'] = $imagePath;
}
$cbzFilePath = $this->generateCbzPath($manga, $chapter);
$this->createCbzFile($tempDir, $pageData, $cbzFilePath);
@@ -210,7 +225,78 @@ class MangaScraperService
// Nettoyage du répertoire temporaire
$this->cleanupTempFiles($tempDir);
return true;
return $pageData;
}
/**
 * Collects the image URLs of a vertical reader, where one HTML document
 * contains every page of the chapter.
 *
 * For each node matched by the source's image selector the `src` attribute is
 * used; when it is empty the `data-src` attribute is used instead.
 *
 * @return array list of entries, each holding 'image_url' and 'page_number'
 */
private function scrapeVerticalReader(string $html, ContentSource $contentSource): array
{
    $imageNodes = (new Crawler($html))->filter($contentSource->getImageSelector());

    $pages = [];
    foreach ($imageNodes as $position => $node) {
        $source = $node->getAttribute('src');
        $rawUrl = $source !== '' ? $source : $node->getAttribute('data-src');

        $pages[] = [
            'image_url' => $this->cleanImageUrl($rawUrl),
            'page_number' => $position + 1,
        ];
    }

    return $pages;
}
/**
 * Collects the image URLs of a horizontal (paginated) reader by following the
 * next-page link from the chapter URL until a page reports no further link.
 *
 * The chapter URL is always fetched, so the result holds at least one entry.
 *
 * @return array list of entries, each holding 'image_url' and 'page_number'
 *
 * @throws GuzzleException
 */
private function scrapeHorizontalReader(string $chapterUrl, ContentSource $contentSource): array
{
    $pages = [];
    $url = $chapterUrl;

    while (true) {
        $extracted = $this->extractMangaPageData($this->fetchHtml($url), $contentSource);

        $pages[] = [
            'image_url' => $this->cleanImageUrl($extracted['image_url']),
            // Evaluated before the entry is appended, so numbering starts at 1.
            'page_number' => count($pages) + 1,
        ];

        $url = $extracted['next_page_url'];
        if (!$url) {
            break;
        }
    }

    return $pages;
}
/**
 * Downloads one image into the temporary directory and appends its entry to
 * the page list.
 *
 * The local file is named after the 1-based page number, zero-padded to three
 * digits, followed by the extension found in the URL path.
 *
 * No progress event is dispatched from here, so $chapter is currently unused.
 *
 * @param array $pageData page list, extended in place with the new entry
 * @param int   $index    0-based position of the image in the chapter
 *
 * @throws GuzzleException
 */
private function processImage(string $imgUrl, string $tempDir, array &$pageData, int $index, Chapter $chapter): void
{
    $pageNumber = $index + 1;
    $imgUrl = $this->cleanImageUrl($imgUrl);

    $extension = pathinfo(parse_url($imgUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
    $imagePath = $tempDir . '/' . sprintf('%03d.%s', $pageNumber, $extension);

    $this->downloadAndSaveImage($imgUrl, $imagePath);

    $pageData[] = [
        'image_url' => $imgUrl,
        'local_image_url' => $imagePath,
        'page_number' => $pageNumber,
    ];
}
/**
 * Removes ASCII control characters (0x00-0x1F and 0x7F) from a scraped URL
 * and trims the surrounding whitespace.
 *
 * Control characters are stripped before trimming: trim() only removes a few
 * of them, so a space hidden behind a leading or trailing control byte would
 * otherwise survive in the result.
 */
private function cleanImageUrl(string $url): string
{
    return trim(preg_replace('/[\x00-\x1F\x7F]/', '', $url));
}
/**