# Source code for ogu_api.resources.feed

from __future__ import annotations

import re
from typing import Any
from urllib.parse import urlsplit

import tls_client.response

from ..models import ThreadSummary
from ._base import ResourceBase

# Names exported by ``from ... import *``; the underscore-prefixed helpers
# further down in this module are intentionally left out.
__all__ = ['FeedResource']


# Matches ``showthread.php?`` followed by a ``tid=<digits>`` query parameter,
# either as the first parameter or after other parameters ending in ``&``.
# The ``[^#]*`` keeps the match from crossing into a URL fragment.
# Group 1 captures the digits of the thread id.
_NUMERIC_TID_PATTERN = re.compile(r'showthread\.php\?(?:[^#]*&)?tid=(\d+)')



class FeedResource(ResourceBase):
    """Forum landing pages: explore (marketplace) and home.

    Paths:

    - ``/explore.php`` — marketplace landing (slug-rewritten URLs).
    - ``/index.php`` — home / activity feed (numeric tid URLs).

    Most callers want :meth:`explore` and :meth:`home` (parsed
    :class:`~ogu_api.ThreadSummary` lists). The ``get_*`` and ``extract_*``
    methods are escape hatches.
    """

    async def get_explore(self) -> tls_client.response.Response:
        """Fetch the raw explore page (``/explore.php``)."""
        return await self._http.get('/explore.php')

    async def get_home(self) -> tls_client.response.Response:
        """Fetch the raw home page (``/index.php``)."""
        return await self._http.get('/index.php')

    async def explore(self) -> list[ThreadSummary]:
        """Parsed explore feed.

        Returns:
            List of :class:`~ogu_api.ThreadSummary`. ``link`` is the
            slug-rewritten path (``/Thread-...``). ``tid`` is ``None`` for
            entries whose only link form is slug-only.
        """
        response = await self.get_explore()
        return self.extract_thread_summaries(response.text)

    async def home(self) -> list[ThreadSummary]:
        """Parsed home feed.

        Returns:
            List of :class:`~ogu_api.ThreadSummary` for every thread on
            ``/index.php``. The "last post" jump-link in each row is merged
            with the title link, so each thread appears exactly once.
        """
        response = await self.get_home()
        return self.extract_thread_summaries(response.text)

    @staticmethod
    def extract_thread_summaries(page_html: str) -> list[ThreadSummary]:
        """Walk every thread anchor in the page and collapse rows into summaries.

        Title links (``/Thread-Slug``) and "last post" jump-links
        (``/showthread.php?tid=N&action=lastpost``) for the same thread are
        merged into one :class:`~ogu_api.ThreadSummary` by looking for a
        sibling tid anchor in the same ``<tr>`` row when the current anchor
        has no tid of its own.

        Args:
            page_html: HTML of any page containing thread links — explore,
                home, ``/forumdisplay.php?fid=...``, search results, etc.

        Returns:
            One :class:`~ogu_api.ThreadSummary` per unique thread, in
            document order.
        """
        soup = ResourceBase._soup(page_html)
        # A dict keeps insertion order, and re-assigning an existing key keeps
        # its original slot, so this mapping also records document order.
        by_key: dict[str, ThreadSummary] = {}

        for a in soup.find_all('a', href = True):
            link = _normalize_thread_link(a['href'])
            if link is None:
                continue

            title = a.get_text(' ', strip = True)
            if not title:
                # Fall back to a nested heading when the anchor text is empty.
                h3 = a.find('h3')
                if h3:
                    title = h3.get_text(' ', strip = True)

            # Slug links carry no tid; borrow one from another anchor in the
            # same table row so both link forms share one merge key.
            tid = _extract_tid(link)
            if tid is None:
                tid = _row_sibling_tid(a)

            key = f'tid:{tid}' if tid is not None else link
            current_lastpost = 'action=lastpost' in link

            existing = by_key.get(key)
            if existing is None:
                by_key[key] = ThreadSummary(title = title, link = link, tid = tid)
                continue

            existing_lastpost = 'action=lastpost' in existing.link

            # Prefer the real thread link over a "last post" jump-link.
            best_link = existing.link
            if existing_lastpost and not current_lastpost:
                best_link = link

            # Only non-lastpost anchors may supply a title; among those, take
            # it when nothing better is stored, when the stored one came from
            # a lastpost anchor, or when the new one is longer.
            best_title = existing.title
            if title and not current_lastpost:
                if not best_title or existing_lastpost or len(title) > len(best_title):
                    best_title = title

            by_key[key] = ThreadSummary(title = best_title, link = best_link, tid = tid)

        return list(by_key.values())

    @staticmethod
    def extract_thread_links(page_html: str) -> list[str]:
        """Return deduped thread URL paths in document order.

        Catches both ``/showthread.php?tid=N`` and ``/Thread-Slug`` forms.
        Use this when you don't need titles — :meth:`extract_thread_summaries`
        gives you titles + tid for free.

        Args:
            page_html: HTML of the page to scan for thread anchors.

        Returns:
            Normalized thread paths, each appearing once, in first-seen order.
        """
        soup = ResourceBase._soup(page_html)
        normalized = (
            _normalize_thread_link(a['href'])
            for a in soup.find_all('a', href = True)
        )
        # dict.fromkeys dedupes while preserving first-seen order.
        return list(dict.fromkeys(link for link in normalized if link is not None))

    @staticmethod
    def extract_thread_ids(page_html: str) -> list[str]:
        """Numeric thread ids only (skips slug-rewritten links).

        Args:
            page_html: HTML of the page to scan for thread anchors.

        Returns:
            Thread ids as digit strings, each appearing once, in first-seen
            order.
        """
        soup = ResourceBase._soup(page_html)
        matches = (
            _NUMERIC_TID_PATTERN.search(a['href'])
            for a in soup.find_all('a', href = True)
        )
        # dict.fromkeys dedupes while preserving first-seen order.
        return list(dict.fromkeys(m.group(1) for m in matches if m))




def _extract_tid(link: str) -> int | None:
    """Return the numeric thread id embedded in *link*, or ``None``.

    The id is taken from the first ``_NUMERIC_TID_PATTERN`` match; links with
    no match, or whose captured text cannot be converted by ``int``, yield
    ``None``.
    """
    found = _NUMERIC_TID_PATTERN.search(link)
    if found is None:
        return None

    digits = found.group(1)
    try:
        tid = int(digits)
    except ValueError:
        return None

    return tid


def _row_sibling_tid(anchor: Any) -> int | None:
    """Find a thread id on another anchor in the same ``<tr>`` as *anchor*.

    Returns the tid of the first other anchor in the enclosing row whose
    ``href`` normalizes to a thread link carrying a numeric tid, or ``None``
    when *anchor* is not inside a row or no such anchor exists.
    """
    enclosing_row = anchor.find_parent('tr')
    if enclosing_row is None:
        return None

    for other in enclosing_row.find_all('a', href = True):
        # Skip the anchor we started from; only its row-mates are candidates.
        if other is anchor:
            continue

        candidate_link = _normalize_thread_link(other.get('href') or '')
        found_tid = None if candidate_link is None else _extract_tid(candidate_link)
        if found_tid is not None:
            return found_tid

    return None


def _normalize_thread_link(href: str) -> str | None:
    href = href.strip()
    if not href:
        return None

    if href.startswith(('http://', 'https://')):
        if 'oguser.com' not in href:
            return None

        _, _, tail = href.partition('oguser.com')
        href = tail or '/'

    if not href.startswith('/'):
        href = '/' + href

    path = href.split('#', 1)[0]
    base = path.split('?', 1)[0]

    if base.startswith('/showthread.php') and 'tid=' in path:
        return path

    if base.startswith('/Thread-'):
        return path

    return None