Source code for ogu_api.resources.feed
from __future__ import annotations

import re
from typing import Any
from urllib.parse import urlsplit

import tls_client.response

from ..models import ThreadSummary
from ._base import ResourceBase
# Names exported by ``from ... import *``.
__all__ = ['FeedResource']
# Matches a ``showthread.php`` URL whose query string carries a numeric
# ``tid`` parameter, either directly after the ``?`` or after an ``&`` that
# appears before any ``#`` fragment. Group 1 captures the digits.
_NUMERIC_TID_PATTERN = re.compile(r'showthread\.php\?(?:[^#]*&)?tid=(\d+)')
[docs]
class FeedResource(ResourceBase):
    """Access to the forum landing pages: explore (marketplace) and home.

    Paths:
        - ``/explore.php`` — marketplace landing (slug-rewritten URLs).
        - ``/index.php`` — home / activity feed (numeric tid URLs).

    :meth:`explore` and :meth:`home` are the usual entry points; both return
    parsed :class:`~ogu_api.ThreadSummary` lists. The ``get_*`` fetchers and
    the ``extract_*`` static parsers are lower-level escape hatches.
    """

    async def get_explore(self) -> tls_client.response.Response:
        """Request ``/explore.php`` and return the unparsed response."""
        return await self._http.get('/explore.php')

    async def get_home(self) -> tls_client.response.Response:
        """Request ``/index.php`` and return the unparsed response."""
        return await self._http.get('/index.php')

    async def explore(self) -> list[ThreadSummary]:
        """Fetch the explore page and parse it into thread summaries.

        Returns:
            List of :class:`~ogu_api.ThreadSummary`. ``link`` is the
            slug-rewritten path (``/Thread-...``). ``tid`` is ``None`` for
            entries that only ever appear in slug form.
        """
        page = await self.get_explore()
        return self.extract_thread_summaries(page.text)

    async def home(self) -> list[ThreadSummary]:
        """Fetch the home page and parse it into thread summaries.

        Returns:
            List of :class:`~ogu_api.ThreadSummary` for the threads found on
            ``/index.php``. A row's "last post" jump-link is folded into the
            entry of its title link, so a thread is listed once.
        """
        page = await self.get_home()
        return self.extract_thread_summaries(page.text)

    @staticmethod
    def extract_thread_summaries(page_html: str) -> list[ThreadSummary]:
        """Collapse every thread anchor in ``page_html`` into one summary per thread.

        Anchors are grouped by thread id when one is known, either from the
        anchor's own URL or from another thread anchor in the same ``<tr>``,
        and by normalized link otherwise. When several anchors land in the
        same group, a non-"last post" link and title win over a
        ``action=lastpost`` jump-link, and among title candidates the longer
        text is kept.

        Args:
            page_html: HTML of any page containing thread links — explore,
                home, ``/forumdisplay.php?fid=...``, search results, etc.

        Returns:
            One :class:`~ogu_api.ThreadSummary` per unique thread, ordered by
            the first appearance of each thread in the document.
        """
        document = ResourceBase._soup(page_html)
        # A dict keeps first-insertion order and re-assigning an existing key
        # does not move it, so this mapping also records document order.
        summaries: dict[str, ThreadSummary] = {}
        for anchor in document.find_all('a', href=True):
            link = _normalize_thread_link(anchor['href'])
            if link is None:
                continue

            text = anchor.get_text(' ', strip=True)
            if not text:
                heading = anchor.find('h3')
                if heading:
                    text = heading.get_text(' ', strip=True)

            tid = _extract_tid(link)
            if tid is None:
                # Slug-only link: borrow the id from a neighbour in the row.
                tid = _row_sibling_tid(anchor)
            key = link if tid is None else f'tid:{tid}'

            is_jump = 'action=lastpost' in link
            previous = summaries.get(key)
            if previous is None:
                summaries[key] = ThreadSummary(title=text, link=link, tid=tid)
                continue

            previous_is_jump = 'action=lastpost' in previous.link
            # Swap out a stored jump-link as soon as a regular link shows up.
            if previous_is_jump and not is_jump:
                merged_link = link
            else:
                merged_link = previous.link
            merged_title = previous.title
            if text and not is_jump and (
                not merged_title
                or previous_is_jump
                or len(text) > len(merged_title)
            ):
                merged_title = text
            summaries[key] = ThreadSummary(
                title=merged_title, link=merged_link, tid=tid
            )
        return list(summaries.values())

    @staticmethod
    def extract_thread_links(page_html: str) -> list[str]:
        """Return the unique normalized thread paths found in ``page_html``.

        Both ``/showthread.php?tid=N`` and ``/Thread-Slug`` forms are
        recognised; results keep first-seen document order. Prefer
        :meth:`extract_thread_summaries` when titles or tids are needed too.
        """
        document = ResourceBase._soup(page_html)
        normalized = (
            _normalize_thread_link(anchor['href'])
            for anchor in document.find_all('a', href=True)
        )
        # dict.fromkeys drops repeats while keeping first-seen order.
        return list(dict.fromkeys(link for link in normalized if link is not None))

    @staticmethod
    def extract_thread_ids(page_html: str) -> list[str]:
        """Return unique numeric thread ids (as strings) in first-seen order.

        Only anchors whose ``href`` carries a numeric ``tid`` are considered;
        slug-rewritten links are skipped.
        """
        document = ResourceBase._soup(page_html)
        hits = (
            _NUMERIC_TID_PATTERN.search(anchor['href'])
            for anchor in document.find_all('a', href=True)
        )
        return list(dict.fromkeys(hit.group(1) for hit in hits if hit))
def _extract_tid(link: str) -> int | None:
    """Return the numeric thread id embedded in ``link``, or ``None``.

    ``None`` is returned when ``link`` does not match the module's numeric
    ``tid`` pattern, or when the captured digits cannot be converted by
    ``int``.
    """
    found = _NUMERIC_TID_PATTERN.search(link)
    if found is None:
        return None
    digits = found.group(1)
    try:
        tid = int(digits)
    except ValueError:
        # Safety net: treat an unconvertible capture as "no id".
        return None
    return tid
def _row_sibling_tid(anchor: Any) -> int | None:
    """Find a thread id on another anchor in the same table row as ``anchor``.

    Looks up the nearest enclosing ``<tr>`` and scans its ``href``-bearing
    anchors, skipping ``anchor`` itself. The first one that normalizes to a
    thread link with a numeric tid supplies the result.

    Returns:
        The first sibling tid found, or ``None`` when ``anchor`` is not
        inside a ``<tr>`` or no other anchor in the row yields a tid.
    """
    row = anchor.find_parent('tr')
    if row is None:
        return None
    # The generators are lazy, so scanning stops at the first usable tid.
    sibling_links = (
        _normalize_thread_link(other.get('href') or '')
        for other in row.find_all('a', href=True)
        if other is not anchor
    )
    sibling_tids = (
        _extract_tid(link) for link in sibling_links if link is not None
    )
    return next((tid for tid in sibling_tids if tid is not None), None)
def _normalize_thread_link(href: str) -> str | None:
href = href.strip()
if not href:
return None
if href.startswith(('http://', 'https://')):
if 'oguser.com' not in href:
return None
_, _, tail = href.partition('oguser.com')
href = tail or '/'
if not href.startswith('/'):
href = '/' + href
path = href.split('#', 1)[0]
base = path.split('?', 1)[0]
if base.startswith('/showthread.php') and 'tid=' in path:
return path
if base.startswith('/Thread-'):
return path
return None