feat: added module for work with howlongtobeat.com

Signed-off-by: Boris Yumankulov <boria138@altlinux.org>
This commit is contained in:
2025-07-13 08:52:15 +05:00
parent 700a478598
commit 233dab1269
8 changed files with 902 additions and 401 deletions

View File

@ -0,0 +1,461 @@
import orjson
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Any
from difflib import SequenceMatcher
import requests
from bs4 import BeautifulSoup, Tag
from portprotonqt.config_utils import read_proxy_config
class SearchModifiers(Enum):
    """Search modifiers used to filter the result set.

    The value of each member is the string placed into the ``modifier``
    field of the search request.
    """

    # No filtering.
    NONE = ""

    # Restrict the results to one category.
    ONLY_DLC = "only_dlc"
    ONLY_MODS = "only_mods"
    ONLY_HACKS = "only_hacks"

    # Exclude DLC entries from the results.
    HIDE_DLC = "hide_dlc"
@dataclass
class GameEntry:
"""Информация об игре из HowLongToBeat."""
# Основная информация
game_id: int = -1
game_name: str | None = None
game_alias: str | None = None
game_type: str | None = None
game_image_url: str | None = None
game_web_link: str | None = None
review_score: float | None = None
developer: str | None = None
platforms: list[str] = field(default_factory=list)
release_year: int | None = None
similarity: float = -1.0
# Времена прохождения (в часах)
main_story: float | None = None
main_extra: float | None = None
completionist: float | None = None
all_styles: float | None = None
coop_time: float | None = None
multiplayer_time: float | None = None
# Флаги сложности
has_single_player: bool = False
has_coop: bool = False
has_multiplayer: bool = False
has_combined_complexity: bool = False
# Исходные данные JSON
raw_data: dict[str, Any] = field(default_factory=dict)
@dataclass
class SearchConfig:
"""Конфигурация для поиска."""
api_key: str | None = None
search_url: str | None = None
class APIKeyExtractor:
    """Extracts the API key and the search URL from the site's scripts."""

    @staticmethod
    def extract_from_script(script_content: str) -> SearchConfig:
        """Build a ``SearchConfig`` from the text of one script.

        Args:
            script_content: Source text of a JavaScript file.

        Returns:
            A ``SearchConfig`` whose ``api_key`` and ``search_url`` are
            ``None`` when the corresponding value could not be found.
        """
        config = SearchConfig()
        config.api_key = APIKeyExtractor._extract_api_key(script_content)
        config.search_url = APIKeyExtractor._extract_search_url(script_content, config.api_key)
        return config

    @staticmethod
    def _extract_api_key(script_content: str) -> str | None:
        """Return the API key found in the script, or ``None``.

        Two layouts are recognised, tried in this order:

        1. a ``users: { id: "<key>" }`` object literal; all captured ids
           are joined together;
        2. a ``/api/<name>/"`` literal followed by one or more
           ``.concat("<part>")`` calls; the parts of the first such
           occurrence are joined together.
        """
        # Pattern for the user ID layout.
        user_id_pattern = r'users\s*:\s*{\s*id\s*:\s*"([^"]+)"'
        user_ids = re.findall(user_id_pattern, script_content)
        if user_ids:
            return ''.join(user_ids)

        # Pattern for the concatenated API key layout.
        concat_pattern = r'\/api\/\w+\/"(?:\.concat\("[^"]*"\))+'
        concat_match = re.search(concat_pattern, script_content)
        if concat_match:
            # Read the arguments straight out of the matched text. Only the
            # first occurrence is used, so a second "/api/.../" chain in the
            # same script cannot leak separators or its path into the key.
            parts = re.findall(r'\.concat\("([^"]*)"\)', concat_match.group(0))
            return ''.join(parts)

        return None

    @staticmethod
    def _extract_search_url(script_content: str, api_key: str | None) -> str | None:
        """Return the endpoint path whose ``.concat`` chain spells ``api_key``.

        Args:
            script_content: Source text of a JavaScript file.
            api_key: Key previously extracted from the same script.

        Returns:
            The ``/api/...`` path passed to the matching ``fetch(`` call,
            or ``None`` when ``api_key`` is empty or no call matches.
        """
        if not api_key:
            return None

        pattern = re.compile(
            r'fetch\(\s*["\'](\/api\/[^"\']*)["\']'
            r'((?:\s*\.concat\(\s*["\']([^"\']*)["\']\s*\))+)'
            r'\s*,',
            re.DOTALL
        )
        for match in pattern.finditer(script_content):
            endpoint = match.group(1)
            concat_calls = match.group(2)
            # Group 3 only keeps the last repetition, so re-scan the whole
            # chain to collect every argument.
            concat_strings = re.findall(r'\.concat\(\s*["\']([^"\']*)["\']\s*\)', concat_calls)
            if ''.join(concat_strings) == api_key:
                return endpoint

        return None
class HTTPClient:
    """HTTP client for the HowLongToBeat website and its search API."""

    BASE_URL = 'https://howlongtobeat.com/'
    GAME_URL = BASE_URL + "game"
    SEARCH_URL = BASE_URL + "api/s/"

    def __init__(self, timeout: int = 60):
        """Create a session with browser-like headers and proxy settings.

        Args:
            timeout: Timeout passed to every request made by this client.
        """
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'referer': self.BASE_URL
        })
        # Apply proxy settings from config
        proxy_config = read_proxy_config()
        if proxy_config:
            self.session.proxies.update(proxy_config)

    def get_search_config(self, parse_all_scripts: bool = False) -> SearchConfig | None:
        """Fetch the main page and extract the search configuration.

        Args:
            parse_all_scripts: When False only scripts whose ``src``
                contains ``_app-`` are inspected; when True every script
                with a ``src`` attribute is inspected.

        Returns:
            The first configuration that contains an API key, or ``None``
            when the main page cannot be fetched or no script yields a key.
        """
        try:
            response = self.session.get(self.BASE_URL, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException:
            # Best effort: without the main page there is nothing to scan.
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        script_urls = []
        for script in soup.find_all('script', src=True):
            # Filter for Tag objects and ensure src is a string
            if not isinstance(script, Tag):
                continue
            src = script.get('src')
            if not isinstance(src, str):
                continue
            if parse_all_scripts or '_app-' in src:
                script_urls.append(src)

        for script_url in script_urls:
            # BASE_URL already ends with '/', so drop leading slashes to
            # avoid a doubled '//' (same approach as in search_games).
            full_url = self.BASE_URL + script_url.lstrip('/')
            try:
                script_response = self.session.get(full_url, timeout=self.timeout)
            except requests.RequestException:
                # One unreachable script must not stop the scan of the rest.
                continue
            if script_response.status_code != 200:
                continue
            config = APIKeyExtractor.extract_from_script(script_response.text)
            if config.api_key:
                return config

        return None

    def search_games(self, game_name: str, search_modifiers: SearchModifiers = SearchModifiers.NONE,
                     page: int = 1, config: SearchConfig | None = None) -> str | None:
        """Run a game search and return the raw response body.

        Args:
            game_name: Text to search for.
            search_modifiers: Filter applied to the results.
            page: Result page number sent in the payload.
            config: Search configuration; fetched from the site when omitted.

        Returns:
            The response text of the first request answered with HTTP 200,
            or ``None`` when no configuration with an API key is available
            or both attempts fail.
        """
        if not config:
            config = self.get_search_config()
            if not config:
                # Fall back to scanning every script on the page.
                config = self.get_search_config(parse_all_scripts=True)
        if not config or not config.api_key:
            return None

        search_url = self.SEARCH_URL
        if config.search_url:
            search_url = self.BASE_URL + config.search_url.lstrip('/')

        payload = self._build_search_payload(game_name, search_modifiers, page, config)
        headers = {
            'content-type': 'application/json',
            'accept': '*/*'
        }
        body = orjson.dumps(payload)

        # First attempt: API key appended to the URL.
        # Second attempt: plain URL, API key only inside the payload.
        for url in (search_url + config.api_key, search_url):
            try:
                response = self.session.post(
                    url,
                    headers=headers,
                    data=body,
                    timeout=self.timeout
                )
            except requests.RequestException:
                continue
            if response.status_code == 200:
                return response.text

        return None

    def get_game_title(self, game_id: int) -> str | None:
        """Fetch a game's page by ID and return the name from its title.

        Returns:
            The trimmed title, or ``None`` when the request fails or the
            page has no usable ``<title>``.
        """
        try:
            params = {'id': str(game_id)}
            response = self.session.get(self.GAME_URL, params=params, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.title
        if title_tag and title_tag.string:
            # Cut off the fixed parts of the title: the first 12 and the
            # last 17 characters.
            # NOTE(review): presumably "How long is " and
            # "? | HowLongToBeat" — confirm against the live page.
            return title_tag.string[12:-17].strip()
        return None

    def _build_search_payload(self, game_name: str, search_modifiers: SearchModifiers,
                              page: int, config: SearchConfig) -> dict[str, Any]:
        """Build the JSON payload of a search request.

        The search terms are ``game_name`` split on whitespace. When
        ``config.api_key`` is set it is stored under
        ``searchOptions.users.id``.
        """
        payload = {
            'searchType': "games",
            'searchTerms': game_name.split(),
            'searchPage': page,
            'size': 20,
            'searchOptions': {
                'games': {
                    'userId': 0,
                    'platform': "",
                    'sortCategory': "popular",
                    'rangeCategory': "main",
                    'rangeTime': {'min': 0, 'max': 0},
                    'gameplay': {
                        'perspective': "",
                        'flow': "",
                        'genre': "",
                        "difficulty": ""
                    },
                    'rangeYear': {'max': "", 'min': ""},
                    'modifier': search_modifiers.value,
                },
                'users': {'sortCategory': "postcount"},
                'lists': {'sortCategory': "follows"},
                'filter': "",
                'sort': 0,
                'randomizer': 0
            },
            'useCache': True
        }
        if config.api_key:
            payload['searchOptions']['users']['id'] = config.api_key
        return payload
class ResultParser:
    """Parser for search results."""

    IMAGE_URL_PREFIX = "https://howlongtobeat.com/games/"
    GAME_URL_PREFIX = "https://howlongtobeat.com/game/"

    # (JSON key holding seconds, GameEntry attribute receiving hours)
    _TIME_FIELDS = (
        ("comp_main", "main_story"),
        ("comp_plus", "main_extra"),
        ("comp_100", "completionist"),
        ("comp_all", "all_styles"),
        ("invested_co", "coop_time"),
        ("invested_mp", "multiplayer_time"),
    )

    def __init__(self, search_query: str, minimum_similarity: float = 0.4,
                 case_sensitive: bool = True, auto_filter_times: bool = False):
        """Store the parsing options.

        Args:
            search_query: Text the results are compared against.
            minimum_similarity: Results scoring below this are dropped;
                ``0.0`` disables the filter.
            case_sensitive: Compare names with their original case.
            auto_filter_times: Clear the times of play modes the game is
                not flagged for.
        """
        self.search_query = search_query
        self.minimum_similarity = minimum_similarity
        self.case_sensitive = case_sensitive
        self.auto_filter_times = auto_filter_times
        self.search_numbers = self._extract_numbers(search_query)

    def parse_results(self, json_response: str, target_game_id: int | None = None) -> list[GameEntry]:
        """Parse a JSON response into a list of games.

        Args:
            json_response: Raw JSON text of the search response.
            target_game_id: When given, only entries with this ID are
                returned and the similarity filter is not applied.

        Returns:
            The matching entries. The list is empty when the text is not
            valid JSON or does not have the expected object layout.
        """
        try:
            data = orjson.loads(json_response)
        except orjson.JSONDecodeError:
            return []
        if not isinstance(data, dict):
            return []

        games = []
        # "data" may be missing or null; both are treated as no results.
        for game_data in data.get("data") or []:
            if not isinstance(game_data, dict):
                continue
            game = self._parse_game_entry(game_data)
            if target_game_id is not None:
                if game.game_id == target_game_id:
                    games.append(game)
            elif self.minimum_similarity == 0.0 or game.similarity >= self.minimum_similarity:
                games.append(game)
        return games

    def _parse_game_entry(self, game_data: dict[str, Any]) -> GameEntry:
        """Convert the JSON object of one game into a ``GameEntry``.

        Fields that are missing, null or of an unexpected type leave the
        corresponding attribute at its default instead of raising.
        """
        game = GameEntry()

        # General information
        game.game_id = game_data.get("game_id", -1)
        game.game_name = game_data.get("game_name")
        game.game_alias = game_data.get("game_alias")
        game.game_type = game_data.get("game_type")
        game.review_score = game_data.get("review_score")
        game.developer = game_data.get("profile_dev")
        game.release_year = game_data.get("release_world")
        game.raw_data = game_data

        # Image URL
        image = game_data.get("game_image")
        if isinstance(image, str):
            game.game_image_url = self.IMAGE_URL_PREFIX + image

        # Link to the game page
        game.game_web_link = f"{self.GAME_URL_PREFIX}{game.game_id}"

        # Platforms
        platforms = game_data.get("profile_platform")
        if isinstance(platforms, str):
            game.platforms = platforms.split(", ")

        # Completion times (converted from seconds to hours)
        for json_field, attr_name in self._TIME_FIELDS:
            seconds = game_data.get(json_field)
            if isinstance(seconds, (int, float)):
                setattr(game, attr_name, round(seconds / 3600, 2))

        # Complexity flags
        game.has_combined_complexity = bool(game_data.get("comp_lvl_combine", 0))
        game.has_single_player = bool(game_data.get("comp_lvl_sp", 0))
        game.has_coop = bool(game_data.get("comp_lvl_co", 0))
        game.has_multiplayer = bool(game_data.get("comp_lvl_mp", 0))

        # Automatic filtering of times
        if self.auto_filter_times:
            if not game.has_single_player:
                game.main_story = None
                game.main_extra = None
                game.completionist = None
                game.all_styles = None
            if not game.has_coop:
                game.coop_time = None
            if not game.has_multiplayer:
                game.multiplayer_time = None

        # Similarity to the search query
        game.similarity = self._calculate_similarity(game)
        return game

    def _calculate_similarity(self, game: GameEntry) -> float:
        """Return the better of the name and alias similarity scores."""
        name_similarity = self._compare_strings(self.search_query, game.game_name)
        alias_similarity = self._compare_strings(self.search_query, game.game_alias)
        return max(name_similarity, alias_similarity)

    def _compare_strings(self, a: str | None, b: str | None) -> float:
        """Compare two strings and return a similarity ratio, never negative.

        Returns ``0.0`` when either string is empty or ``None``. The
        ``SequenceMatcher`` ratio is reduced by ``0.1`` when the search
        query contains numbers and ``b`` contains none of them.
        """
        if not a or not b:
            return 0.0

        if self.case_sensitive:
            similarity = SequenceMatcher(None, a, b).ratio()
        else:
            similarity = SequenceMatcher(None, a.lower(), b.lower()).ratio()

        # Penalty for missing the numbers of the original query
        if self.search_numbers and not self._contains_numbers(b, self.search_numbers):
            similarity -= 0.1

        return max(0.0, similarity)

    @staticmethod
    def _extract_numbers(text: str) -> list[str]:
        """Return the whitespace-separated words of ``text`` that are digits."""
        return [word for word in text.split() if word.isdigit()]

    @staticmethod
    def _contains_numbers(text: str, numbers: list[str]) -> bool:
        """Tell whether ``text`` contains at least one of ``numbers``.

        Punctuation and underscores are removed from ``text`` before it is
        split into words. An empty ``numbers`` list always yields ``True``.
        """
        if not numbers:
            return True
        cleaned_text = re.sub(r'([^\s\w]|_)+', '', text)
        text_numbers = {word for word in cleaned_text.split() if word.isdigit()}
        return any(num in text_numbers for num in numbers)
class HowLongToBeat:
    """Main entry point for working with the HowLongToBeat API."""

    def __init__(self, minimum_similarity: float = 0.4, auto_filter_times: bool = False,
                 timeout: int = 60):
        """Remember the search options and create the HTTP client.

        Args:
            minimum_similarity: Similarity threshold used by ``search``.
            auto_filter_times: Forwarded to the result parser.
            timeout: Timeout handed to the HTTP client.
        """
        self.http_client = HTTPClient(timeout)
        self.auto_filter_times = auto_filter_times
        self.minimum_similarity = minimum_similarity

    def search(self, game_name: str, search_modifiers: SearchModifiers = SearchModifiers.NONE,
               case_sensitive: bool = True) -> list[GameEntry] | None:
        """Search for games by name.

        Returns:
            The parsed entries, or ``None`` when the name is blank or the
            HTTP client returned no response.
        """
        # Reject empty and whitespace-only names before touching the network.
        if not game_name or not game_name.strip():
            return None

        raw_json = self.http_client.search_games(game_name, search_modifiers)
        if not raw_json:
            return None

        return ResultParser(
            game_name,
            self.minimum_similarity,
            case_sensitive,
            self.auto_filter_times,
        ).parse_results(raw_json)

    def search_by_id(self, game_id: int) -> GameEntry | None:
        """Look up a single game by its ID.

        The title is resolved from the game page first, then searched for;
        the entry whose ID equals ``game_id`` is returned.

        Returns:
            The matching entry, or ``None`` when the ID is not positive,
            the title cannot be resolved, the search fails, or no result
            carries that ID.
        """
        if not game_id or game_id <= 0:
            return None

        title = self.http_client.get_game_title(game_id)
        if not title:
            return None

        raw_json = self.http_client.search_games(title)
        if not raw_json:
            return None

        # No similarity threshold and case-insensitive comparison: the
        # entries are selected by ID only.
        id_parser = ResultParser(title, 0.0, False, self.auto_filter_times)
        matches = id_parser.parse_results(raw_json, target_game_id=game_id)
        if matches:
            return matches[0]
        return None