Files
Mangarr/src/Service/Scraper/HtmlScraper.php
Jérémy Guillot fafff5014c Added:
- Refactor MangaScraperService (not used everywhere now)
- Added JavascriptScraper.php
- Added alternatives slugs in Manga.php
- Improvement in manga edit form
2024-07-21 19:08:46 +02:00

198 lines
6.5 KiB
PHP

<?php
namespace App\Service\Scraper;
use App\Entity\Chapter;
use App\Entity\ContentSource;
use Doctrine\ORM\EntityManagerInterface;
use Exception;
use GuzzleHttp\Client;
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
use Symfony\Component\DomCrawler\Crawler;
/**
 * Scraper for content sources whose chapter pages are served as plain HTML.
 *
 * Two reader layouts are handled:
 *  - "vertical" readers, where every page image is present in a single HTML
 *    document (the content source has no next-page selector);
 *  - "horizontal" (paginated) readers, where each HTML document holds one
 *    image plus a link to the next page.
 */
class HtmlScraper extends AbstractScraper
{
    /**
     * Upper bound on the number of pages followed in a paginated reader.
     * Protects against "next" links that never terminate.
     */
    private const MAX_PAGINATED_PAGES = 1000;

    /** Extension used when the image URL path does not carry one. */
    private const DEFAULT_IMAGE_EXTENSION = 'jpg';

    private Client $client;

    public function __construct(
        string $projectDir,
        EventDispatcherInterface $eventDispatcher,
        EntityManagerInterface $entityManager
    ) {
        parent::__construct($projectDir, $eventDispatcher, $entityManager);
        $this->client = new Client();
    }

    /**
     * Scrapes every page of a chapter, downloads the images into a temporary
     * directory, packs them into a CBZ archive and stores the archive path on
     * the chapter entity.
     *
     * The temporary directory is removed whether the scrape succeeds or fails.
     *
     * @return array|bool The page descriptors (image_url, page_number,
     *                    local_image_url); this implementation never returns a bool.
     *
     * @throws Exception When no valid chapter URL exists, when no image is
     *                   found, or when fetching/downloading/writing fails.
     */
    public function scrapeChapter(Chapter $chapter, ContentSource $contentSource): array|bool
    {
        $manga = $chapter->getManga();
        $chapterUrl = $this->getValidChapterUrl($contentSource, $manga, $chapter->getNumber());
        if (!$chapterUrl) {
            throw new Exception("Aucune URL valide trouvée pour le chapitre {$chapter->getNumber()} du manga {$manga->getTitle()}");
        }

        $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
        if (!mkdir($tempDir) && !is_dir($tempDir)) {
            throw new Exception("Unable to create temporary directory {$tempDir}");
        }

        try {
            if ($contentSource->getNextPageSelector() === null) {
                // Vertical reader: all images live in one document.
                $pageData = $this->scrapeVerticalReader($this->fetchHtml($chapterUrl), $contentSource);
            } else {
                // Horizontal (paginated) reader: follow the "next page" links.
                $pageData = $this->scrapeHorizontalReader($chapterUrl, $contentSource);
            }

            if ($pageData === []) {
                // Persisting a path to an empty archive would mark the chapter as downloaded.
                throw new Exception("No image found for chapter {$chapter->getNumber()} of manga {$manga->getTitle()}");
            }

            // Download and save the images.
            $pageCount = count($pageData);
            foreach ($pageData as $index => $page) {
                $extension = pathinfo((string) parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION);
                if ($extension === '') {
                    $extension = self::DEFAULT_IMAGE_EXTENSION;
                }

                $imagePath = $tempDir . '/' . sprintf('%03d.%s', $index + 1, $extension);
                $this->downloadAndSaveImage($page['image_url'], $imagePath);
                $this->dispatchProgressEvent($chapter, $index + 1, $pageCount);

                // Write through the key instead of iterating by reference, so no
                // dangling reference to the last element survives the loop.
                $pageData[$index]['local_image_url'] = $imagePath;
            }

            $cbzFilePath = $this->generateCbzPath($manga, $chapter);
            $this->createCbzFile($tempDir, $pageData, $cbzFilePath);

            $chapter->setCbzPath($cbzFilePath);
            $this->entityManager->persist($chapter);
            $this->entityManager->flush();

            return $pageData;
        } finally {
            // Clean up the temporary directory, including on failure.
            $this->cleanupTempFiles($tempDir);
        }
    }

    /**
     * Dry-run of the scraping configuration: resolves the page images for the
     * given slug/chapter without downloading or persisting anything.
     *
     * @return array The page descriptors (image_url, page_number).
     *
     * @throws Exception When the chapter URL is invalid or a page cannot be fetched.
     */
    public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
    {
        $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber);
        if (!$this->isChapterUrlValid($chapterUrl)) {
            throw new Exception("Invalid URL, check format and slug");
        }

        if ($contentSource->getNextPageSelector() === null) {
            return $this->scrapeVerticalReader($this->fetchHtml($chapterUrl), $contentSource);
        }

        // The paginated reader fetches the first page itself; fetching it here
        // as well would request the same document twice.
        return $this->scrapeHorizontalReader($chapterUrl, $contentSource);
    }

    /**
     * Tells whether this scraper handles the given scraping type.
     */
    public function supports(string $scrapingType): bool
    {
        return $scrapingType === 'html';
    }

    /**
     * Extracts every image matched by the content source's image selector from
     * a single HTML document.
     *
     * Elements without a usable `src`/`data-src` are skipped, so page numbers
     * stay contiguous.
     *
     * @return array List of ['image_url' => string, 'page_number' => int].
     *
     * @throws Exception When a relative image URL cannot be made absolute.
     */
    private function scrapeVerticalReader(string $html, ContentSource $contentSource): array
    {
        $crawler = new Crawler($html);
        $pageData = [];

        foreach ($crawler->filter($contentSource->getImageSelector()) as $image) {
            if (!$image instanceof \DOMElement) {
                continue;
            }

            $imgUrl = trim($image->getAttribute('src') ?: $image->getAttribute('data-src'));
            if ($imgUrl === '') {
                continue;
            }

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($this->toAbsoluteUrl($imgUrl, $contentSource)),
                'page_number' => count($pageData) + 1,
            ];
        }

        return $pageData;
    }

    /**
     * Walks a paginated reader starting at $chapterUrl, collecting one image
     * per page until there is no next page.
     *
     * Pagination also stops when a "next" link leads back to an already
     * visited URL, which would otherwise loop forever.
     *
     * @return array List of ['image_url' => string, 'page_number' => int].
     *
     * @throws Exception When a page cannot be fetched or parsed, or when the
     *                   page cap is exceeded.
     */
    private function scrapeHorizontalReader(string $chapterUrl, ContentSource $contentSource): array
    {
        $pageData = [];
        $visitedUrls = [];
        $currentPageUrl = $chapterUrl;

        while ($currentPageUrl !== null && !isset($visitedUrls[$currentPageUrl])) {
            if (count($pageData) >= self::MAX_PAGINATED_PAGES) {
                throw new Exception(
                    'Pagination exceeded ' . self::MAX_PAGINATED_PAGES . ' pages starting at ' . $chapterUrl
                );
            }
            $visitedUrls[$currentPageUrl] = true;

            $page = $this->extractMangaPageData($this->fetchHtml($currentPageUrl), $contentSource);
            $pageData[] = [
                'image_url' => $this->cleanImageUrl($page['image_url']),
                'page_number' => count($pageData) + 1,
            ];

            $currentPageUrl = $page['next_page_url'];
        }

        return $pageData;
    }

    /**
     * Downloads an HTML document. Redirects are deliberately not followed: a
     * redirect is treated as "chapter not found".
     *
     * @throws Exception On transport/HTTP errors, or when the server answers
     *                   with a redirect or a 404.
     */
    private function fetchHtml(string $url): string
    {
        try {
            $response = $this->client->get($url, [
                'http_errors' => true,
                'allow_redirects' => false,
            ]);
            $statusCode = $response->getStatusCode();
            $body = (string) $response->getBody();
        } catch (Exception $e) {
            // Keep the original exception reachable for debugging.
            throw new Exception('Bad Request: ' . $e->getMessage(), previous: $e);
        }

        if (($statusCode >= 300 && $statusCode < 400) || $statusCode === 404) {
            throw new Exception('Bad Request: Chapter Not Found at ' . $url);
        }

        return $body;
    }

    /**
     * Downloads an image and writes it to $destinationPath.
     *
     * @throws Exception When the request fails, the response is not an image,
     *                   or the file cannot be written.
     */
    private function downloadAndSaveImage(string $imageUrl, string $destinationPath): void
    {
        try {
            $response = $this->client->get($imageUrl);
            $contentType = $response->getHeaderLine('Content-Type');

            // Media types are case-insensitive.
            if (!str_starts_with(strtolower(trim($contentType)), 'image/')) {
                throw new Exception('Le contenu récupéré n\'est pas une image. Type de contenu : ' . $contentType);
            }

            if (file_put_contents($destinationPath, $response->getBody()->getContents()) === false) {
                throw new Exception("Unable to write image to {$destinationPath}");
            }
        } catch (Exception $e) {
            throw new Exception('Erreur lors de la récupération de l\'image : ' . $e->getMessage(), previous: $e);
        }
    }

    /**
     * Extracts the page image URL and the next-page URL from one page of a
     * paginated reader.
     *
     * @return array ['image_url' => string, 'next_page_url' => string|null],
     *               both absolute; next_page_url is null on the last page.
     *
     * @throws Exception When no image matches the selector, the matched
     *                   element has no URL, or a relative URL cannot be resolved.
     */
    private function extractMangaPageData(string $html, ContentSource $mangaSource): array
    {
        $crawler = new Crawler($html);

        $imageNodes = $crawler->filter($mangaSource->getImageSelector());
        if ($imageNodes->count() === 0) {
            throw new Exception('No element matches the image selector of the content source');
        }

        // `?:` rather than `??` so an empty `src` falls back to `data-src`,
        // consistent with the vertical reader.
        $imgUrl = trim((string) ($imageNodes->attr('src') ?: $imageNodes->attr('data-src')));
        if ($imgUrl === '') {
            throw new Exception('The element matched by the image selector has no src or data-src attribute');
        }

        $nextLink = $crawler->filter($mangaSource->getNextPageSelector());
        $nextHref = $nextLink->count() > 0 ? trim((string) $nextLink->attr('href')) : '';

        // An empty, fragment-only or javascript: href never designates another page.
        $hasNextPage = $nextHref !== ''
            && !str_starts_with($nextHref, '#')
            && stripos($nextHref, 'javascript:') !== 0;

        return [
            'image_url' => $this->toAbsoluteUrl($imgUrl, $mangaSource),
            'next_page_url' => $hasNextPage ? $this->toAbsoluteUrl($nextHref, $mangaSource) : null,
        ];
    }

    /**
     * Converts a URL found in a page into an absolute http(s) URL.
     *
     * Absolute URLs are returned untouched, protocol-relative URLs (`//host/…`)
     * receive the base URL's scheme, and any other value is treated as a path
     * from the root of the content source's host.
     *
     * @throws Exception When a relative URL is given and the content source's
     *                   base URL has no scheme or host.
     */
    private function toAbsoluteUrl(string $url, ContentSource $contentSource): string
    {
        if (preg_match('/^https?:\/\//i', $url) === 1) {
            return $url;
        }

        $base = parse_url((string) $contentSource->getBaseUrl());
        $scheme = is_array($base) ? ($base['scheme'] ?? null) : null;

        if (str_starts_with($url, '//')) {
            return ($scheme ?? 'https') . ':' . $url;
        }

        if ($scheme === null || !isset($base['host'])) {
            throw new Exception("Cannot resolve relative URL \"{$url}\": the content source base URL has no scheme or host");
        }

        $authority = $base['host'] . (isset($base['port']) ? ':' . $base['port'] : '');

        return $scheme . '://' . $authority . '/' . ltrim($url, '/');
    }
}