Source code for prompt_risk.llm_output

# -*- coding: utf-8 -*-

"""
LLM output post-processing utilities.

Reusable helpers for cleaning and extracting structured data from raw LLM
text responses.
"""

import json
import re
import typing as T

from .exc import JsonExtractionError



[docs]
def extract_json(text: str) -> T.Any:
    """Extract and parse a single JSON object from raw LLM response text.

    Assumes the LLM output contains exactly **one** JSON value — either
    bare or wrapped in a single markdown code fence (````` ```json … ``` `````
    or ````` ``` … ``` `````).  If a code fence is present, only its content
    is parsed; any text outside the fence is ignored.  If no fence is found,
    the entire *text* is treated as JSON.

    This function does **not** handle multiple JSON values in a single
    response.  If the LLM returns more than one JSON block, only the first
    fenced block (or the full text when unfenced) is considered.

    Parameters
    ----------
    text:
        Raw LLM response text, potentially wrapped in markdown code fences.

    Returns
    -------
    Any
        The parsed JSON value (typically a ``dict`` or ``list``).

    Raises
    ------
    JsonExtractionError
        If the extracted text is not valid JSON.  The exception carries the
        full raw LLM output (``raw_output`` attribute) and the original
        parse error as ``__cause__`` for downstream inspection.
    """
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL)
    raw = match.group(1) if match else text

    try:
        return json.loads(raw)
    except (json.JSONDecodeError, ValueError) as exc:
        raise JsonExtractionError(raw_output=text, cause=exc) from exc