GenshinUID/utils/download_resource/download_from_fandom.py
2022-08-29 23:26:46 +08:00

157 lines
5.2 KiB
Python

import re
import random
import asyncio
from pathlib import Path
import aiofiles
from bs4 import BeautifulSoup
from nonebot.log import logger
from aiohttp.client import ClientSession
from .RESOURCE_PATH import GACHA_IMG_PATH, CHAR_STAND_PATH, CHAR_NAMECARD_PATH
# Size of the download semaphore and the number of queued download tasks
# at which a batch is flushed.
MAX_TASKS = 4
# Wiki landing page that is fetched and scraped for the character cards.
baseurl = 'https://genshin-impact.fandom.com/wiki/Genshin_Impact_Wiki'
async def get_url(url: str, sess: ClientSession) -> bytes:
    """Fetch *url* through *sess* and return the raw response body.

    The request runs inside ``async with`` so the underlying connection is
    released back to the session even if reading the body raises.

    The HTTP status code is not inspected: the body of an error response is
    returned exactly like the body of a successful one.

    Args:
        url: Absolute URL to request.
        sess: Open aiohttp client session used to perform the request.

    Returns:
        The response payload as bytes.
    """
    async with sess.get(url=url) as resp:
        return await resp.read()
async def _download(
    url: str,
    sess: ClientSession,
    sem: asyncio.Semaphore,
    file_name: str,
    file_path: Path,
    log_prefix: str,
) -> None:
    """Download *url* and store it as ``file_path / file_name``.

    The whole operation runs while holding *sem*. Nothing is written to disk
    unless the server answers with HTTP 200; a non-200 response is logged as
    a failure and the function returns without touching the target file.

    Args:
        url: Address of the resource to download.
        sess: Open aiohttp client session used for the request.
        sem: Semaphore bounding how many downloads run at once.
        file_name: Name of the file to create inside *file_path*.
        file_path: Directory the file is written to.
        log_prefix: Text prepended to every log message of this download.
    """
    async with sem:
        logger.info(f'{log_prefix}正在下载 {file_name} ,URL为{url}')
        async with sess.get(url, timeout=60) as res:
            status = res.status
            if status != 200:
                # Do not persist an error page as if it were the image.
                logger.warning(f"{log_prefix}{file_name} 下载失败: {status}")
                return
            content = await res.read()

        # Random 0-3 s pause taken while still holding the semaphore slot;
        # done before opening the file so no handle is kept open meanwhile.
        await asyncio.sleep(random.randint(0, 3))
        async with aiofiles.open(file_path / file_name, "wb") as f:
            await f.write(content)
        logger.info(f"{log_prefix}{file_name} 下载成功: {status}")
async def get_char_url_list() -> dict:
    """Scrape the wiki landing page for character cards.

    Cards are the ``div`` elements whose class attribute is exactly
    ``card_container card_5 hidden``, ``card_container card_4 hidden`` or
    ``card_container card_5a hidden`` (collected in that order). For each
    card the first ``<a>`` supplies the character title and the relative
    link; the card titled ``Traveler`` is left out.

    Cards without an anchor, ``href`` or ``title`` are skipped with a warning
    instead of aborting the scrape.

    Returns:
        A dict mapping the anchor title to
        ``https://genshin-impact.fandom.com<href>/Media``.
    """
    async with ClientSession() as sess:
        base_data = await get_url(baseurl, sess)

    content_bs = BeautifulSoup(base_data, 'lxml')
    char_list = {}
    for card_class in ('card_5', 'card_4', 'card_5a'):
        cards = content_bs.find_all(
            "div", class_=f'card_container {card_class} hidden'
        )
        for card in cards:
            # Look the anchor up once instead of once per attribute access.
            anchor = card.find("a")
            title = anchor.get("title") if anchor is not None else None
            href = anchor.get("href") if anchor is not None else None
            if not title or not href:
                logger.warning('[fandom] 跳过无法解析的角色卡片')
                continue
            if title == "Traveler":
                continue
            char_list[title] = (
                "https://genshin-impact.fandom.com" + href + "/Media"
            )
    return char_list
async def download_by_fandom(char_list: dict) -> str:
    """Download name cards and wish art when the local folders look incomplete.

    A download pass is started when either ``CHAR_NAMECARD_PATH`` or
    ``CHAR_STAND_PATH`` contains fewer entries than *char_list* has
    characters; otherwise only a log line is emitted.

    Args:
        char_list: Mapping of character title to media-page URL.

    Returns:
        Always the empty string, on both paths.
    """
    expected = len(char_list)

    def _entry_count(directory) -> int:
        # Count directory entries without building a throwaway list.
        return sum(1 for _ in directory.iterdir())

    # Decide whether name cards / wish images have to be downloaded.
    # NOTE(review): this inspects CHAR_STAND_PATH, while the downloader in
    # this file writes wish art to GACHA_IMG_PATH - confirm which directory
    # is the intended one.
    needs_download = (
        _entry_count(CHAR_NAMECARD_PATH) < expected
        or _entry_count(CHAR_STAND_PATH) < expected
    )
    if needs_download:
        logger.info('[fandom] 本次需要下载图片')
        await get_namecard_and_gacha_pic(char_list)
    else:
        logger.info('[fandom] 无需下载名片和抽卡图片!')
    return ''
def _cut_after_png(src: str):
    """Return *src* truncated right after its last ``.png``, or ``None``."""
    # The dot is escaped so only a literal ".png" terminates the match.
    match = re.search(r"[\s\S]+\.png", src)
    return match.group(0) if match else None


def _parse_char_assets(char_name: str, info_html, media_html):
    """Extract ``(chinese_name, namecard_src, gacha_src)`` from the two pages.

    Raises IndexError/KeyError when an expected element or attribute is
    absent from the HTML.
    """
    info_bs = BeautifulSoup(info_html, 'lxml')
    chinese_name = info_bs.find_all("span", lang='zh-Hans')[0].text

    media_bs = BeautifulSoup(media_html, 'lxml')
    gacha_imgs = media_bs.find_all("img", {'data-caption': 'Full Wish'})
    gallery_items = media_bs.find_all("div", class_='wikia-gallery-item')

    # Special case: for Gorou the gallery item used as the name card is the
    # third from the end rather than the second from the end.
    namecard_index = -3 if char_name == "Gorou" else -2
    namecard_src = gallery_items[namecard_index].find_all("img")[0]["src"]
    gacha_src = gacha_imgs[0]['src']
    return chinese_name, namecard_src, gacha_src


async def _run_batch(tasks: list) -> None:
    """Await every queued download and log, rather than raise, failures."""
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for result in results:
        if isinstance(result, Exception):
            logger.warning(f'[fandom] 下载任务失败: {result!r}')


async def get_namecard_and_gacha_pic(char_list: dict) -> str:
    """Download the name card and the "Full Wish" art of every character.

    For each entry of *char_list* the media page and the character page (the
    media URL without its trailing ``/Media``) are fetched. The Chinese name
    is read from the first ``span`` with ``lang='zh-Hans'`` and is used as
    the file name (``<name>.png``) in both ``CHAR_NAMECARD_PATH`` and
    ``GACHA_IMG_PATH``.

    Downloads are queued two per character, each limited to 30 seconds, and
    flushed whenever at least ``MAX_TASKS`` are pending. A character whose
    pages cannot be parsed is skipped, and a failed download is logged; in
    neither case is the remaining work aborted.

    Args:
        char_list: Mapping of character title to media-page URL.

    Returns:
        The status message ``'资源下载成功!'``.
    """
    media_suffix = '/Media'
    tasks = []
    sem = asyncio.Semaphore(MAX_TASKS)
    total = len(char_list)
    async with ClientSession() as sess:
        for index, (char_name, media_url) in enumerate(
            char_list.items(), start=1
        ):
            log_prefix = f'[fandom {index}/{total}] '
            char_data = await get_url(media_url, sess)
            # The character page is the media URL minus the '/Media' suffix.
            char_info_data = await get_url(
                media_url[: -len(media_suffix)], sess
            )

            try:
                chinese_name, namecard_src, gacha_src = _parse_char_assets(
                    char_name, char_info_data, char_data
                )
            except (IndexError, KeyError) as e:
                # Same policy as an unmatched URL below: skip this character.
                logger.warning(
                    f'{log_prefix}{char_name} 页面解析失败, 已跳过: {e!r}'
                )
                continue
            logger.info(f'{log_prefix}正在下载{chinese_name}的图片资源...')

            # Drop whatever follows ".png" in the image source URLs.
            gachaImg_url = _cut_after_png(gacha_src)
            if gachaImg_url is None:
                continue
            namecard_url = _cut_after_png(namecard_src)
            if namecard_url is None:
                continue

            # Queue the download tasks.
            logger.info(f'{log_prefix}添加{chinese_name}的名片资源下载任务...')
            tasks.append(
                asyncio.wait_for(
                    _download(
                        namecard_url,
                        sess,
                        sem,
                        f'{chinese_name}.png',
                        CHAR_NAMECARD_PATH,
                        log_prefix,
                    ),
                    timeout=30,
                )
            )
            logger.info(f'{log_prefix}添加{chinese_name}的抽卡图片资源下载任务...')
            tasks.append(
                asyncio.wait_for(
                    _download(
                        gachaImg_url,
                        sess,
                        sem,
                        f'{chinese_name}.png',
                        GACHA_IMG_PATH,
                        log_prefix,
                    ),
                    timeout=30,
                )
            )
            if len(tasks) >= MAX_TASKS:
                await _run_batch(tasks)
                tasks = []
        # Flush whatever is left while the session is still open.
        await _run_batch(tasks)
    logger.info('全部下载完成!')
    return '资源下载成功!'