Source code for ogu_api.resources.feed

from __future__ import annotations

import re
from typing import Any
from urllib.parse import urlsplit

import tls_client.response

from ..models import ThreadSummary
from ._base import ResourceBase

# Names exported by ``from ... import *``; only the resource class is public.
__all__ = ['FeedResource']


# Matches a ``showthread.php`` URL whose query string carries a numeric
# ``tid`` parameter and captures the digits as group 1. ``tid`` must be either
# the first parameter (directly after ``?``) or directly preceded by ``&``, so
# a parameter that merely ends in "tid" (e.g. ``ptid=``) is not matched. The
# ``[^#]*`` keeps the search from running into the URL fragment.
_NUMERIC_TID_PATTERN = re.compile(r'showthread\.php\?(?:[^#]*&)?tid=(\d+)')


class FeedResource(ResourceBase):
    """Access to the forum landing pages: explore (marketplace) and home.

    Paths:

    - ``/explore.php`` — marketplace landing (slug-rewritten URLs).
    - ``/index.php`` — home / activity feed (numeric tid URLs).

    :meth:`explore` and :meth:`home` return parsed
    :class:`~ogu_api.ThreadSummary` lists and are what most callers want.
    The ``get_*`` and ``extract_*`` methods are lower-level escape hatches.
    """

    async def get_explore(self) -> tls_client.response.Response:
        """Fetch ``/explore.php`` and return the raw response."""
        raw = await self._http.get('/explore.php')
        return raw

    async def get_home(self) -> tls_client.response.Response:
        """Fetch ``/index.php`` and return the raw response."""
        raw = await self._http.get('/index.php')
        return raw

    async def explore(self) -> list[ThreadSummary]:
        """Fetch the explore page and parse it into thread summaries.

        Returns:
            List of :class:`~ogu_api.ThreadSummary`. ``link`` is the
            slug-rewritten path (``/Thread-...``). ``tid`` is ``None`` for
            entries whose only link form is slug-only.
        """
        page = await self.get_explore()
        return self.extract_thread_summaries(page.text)

    async def home(self) -> list[ThreadSummary]:
        """Fetch the home page and parse it into thread summaries.

        Returns:
            List of :class:`~ogu_api.ThreadSummary` for every thread on
            ``/index.php``. The "last post" jump-link in each row is merged
            with the title link, so each thread appears exactly once.
        """
        page = await self.get_home()
        return self.extract_thread_summaries(page.text)

    @staticmethod
    def extract_thread_summaries(page_html: str) -> list[ThreadSummary]:
        """Collapse every thread anchor in a page into one summary per thread.

        A title link (``/Thread-Slug``) and a "last post" jump-link
        (``/showthread.php?tid=N&action=lastpost``) pointing at the same
        thread are merged into a single :class:`~ogu_api.ThreadSummary`.
        An anchor without a tid of its own borrows the tid of a sibling
        anchor in the same ``<tr>`` row, which is what lets the two link
        forms share a key.

        Args:
            page_html: HTML of any page containing thread links — explore,
                home, ``/forumdisplay.php?fid=...``, search results, etc.

        Returns:
            One :class:`~ogu_api.ThreadSummary` per unique thread, in
            document order of first appearance.
        """
        lastpost_marker = 'action=lastpost'
        document = ResourceBase._soup(page_html)

        # Insertion order of a dict is the order in which each key was first
        # stored; overwriting an entry later does not move it. That gives
        # document order without a separate bookkeeping list.
        summaries: dict[str, ThreadSummary] = {}

        for anchor in document.find_all('a', href=True):
            link = _normalize_thread_link(anchor['href'])
            if link is None:
                continue

            title = anchor.get_text(' ', strip=True)
            if not title:
                heading = anchor.find('h3')
                if heading:
                    title = heading.get_text(' ', strip=True)

            tid = _extract_tid(link)
            if tid is None:
                tid = _row_sibling_tid(anchor)

            # Threads with a known tid are keyed by it; slug-only threads
            # fall back to the link itself.
            key = link if tid is None else f'tid:{tid}'

            previous = summaries.get(key)
            if previous is None:
                summaries[key] = ThreadSummary(title=title, link=link, tid=tid)
                continue

            is_jump = lastpost_marker in link
            previous_is_jump = lastpost_marker in previous.link

            # A real thread link replaces a previously stored jump-link.
            merged_link = link if (previous_is_jump and not is_jump) else previous.link

            # A title from a non-jump anchor wins when nothing is stored yet,
            # when the stored one came from a jump-link, or when it is longer.
            merged_title = previous.title
            if title and not is_jump:
                should_replace = (
                    not merged_title
                    or previous_is_jump
                    or len(title) > len(merged_title)
                )
                if should_replace:
                    merged_title = title

            summaries[key] = ThreadSummary(title=merged_title, link=merged_link, tid=tid)

        return list(summaries.values())

    @staticmethod
    def extract_thread_ids(page_html: str) -> list[str]:
        """Return the unique numeric thread ids linked from a page.

        Only hrefs matching the numeric ``showthread.php?...tid=N`` form are
        considered; slug-rewritten links are skipped. Ids are returned as
        strings, in order of first appearance.
        """
        document = ResourceBase._soup(page_html)
        matches = (
            _NUMERIC_TID_PATTERN.search(anchor['href'])
            for anchor in document.find_all('a', href=True)
        )
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys(found.group(1) for found in matches if found))
def _extract_tid(link: str) -> int | None: match = _NUMERIC_TID_PATTERN.search(link) if not match: return None try: return int(match.group(1)) except ValueError: return None def _row_sibling_tid(anchor: Any) -> int | None: row = anchor.find_parent('tr') if row is None: return None for sibling in row.find_all('a', href = True): if sibling is anchor: continue sibling_link = _normalize_thread_link(sibling.get('href') or '') if sibling_link is None: continue sibling_tid = _extract_tid(sibling_link) if sibling_tid is not None: return sibling_tid return None def _normalize_thread_link(href: str) -> str | None: href = href.strip() if not href: return None if href.startswith(('http://', 'https://')): if 'oguser.com' not in href: return None _, _, tail = href.partition('oguser.com') href = tail or '/' if not href.startswith('/'): href = '/' + href path = href.split('#', 1)[0] base = path.split('?', 1)[0] if base.startswith('/showthread.php') and 'tid=' in path: return path if base.startswith('/Thread-'): return path return None