langfuse.experiment

Langfuse experiment functionality for running and evaluating tasks on datasets.

This module provides the core experiment functionality for the Langfuse Python SDK, allowing users to run experiments on datasets with automatic tracing, evaluation, and result formatting.

   1"""Langfuse experiment functionality for running and evaluating tasks on datasets.
   2
   3This module provides the core experiment functionality for the Langfuse Python SDK,
   4allowing users to run experiments on datasets with automatic tracing, evaluation,
   5and result formatting.
   6"""
   7
   8import asyncio
   9from datetime import datetime
  10from typing import (
  11    TYPE_CHECKING,
  12    Any,
  13    Awaitable,
  14    Dict,
  15    List,
  16    Optional,
  17    Protocol,
  18    TypedDict,
  19    Union,
  20    overload,
  21)
  22
  23from langfuse.api import DatasetItem
  24from langfuse.logger import langfuse_logger as logger
  25from langfuse.types import ExperimentScoreType
  26
  27if TYPE_CHECKING:
  28    from langfuse._client.client import Langfuse
  29    from langfuse.batch_evaluation import CompositeEvaluatorFunction
  30
  31
  32class LocalExperimentItem(TypedDict, total=False):
  33    """Structure for local experiment data items (not from Langfuse datasets).
  34
  35    This TypedDict defines the structure for experiment items when using local data
  36    rather than Langfuse-hosted datasets. All fields are optional to provide
  37    flexibility in data structure.
  38
  39    Attributes:
  40        input: The input data to pass to the task function. Can be any type that
  41            your task function can process (string, dict, list, etc.). This is
  42            typically the prompt, question, or data that your task will operate on.
  43        expected_output: Optional expected/ground truth output for evaluation purposes.
  44            Used by evaluators to assess correctness or quality. Can be None if
  45            no ground truth is available.
  46        metadata: Optional metadata dictionary containing additional context about
  47            this specific item. Can include information like difficulty level,
  48            category, source, or any other relevant attributes that evaluators
  49            might use for context-aware evaluation.
  50
  51    Examples:
  52        Simple text processing item:
  53        ```python
  54        item: LocalExperimentItem = {
  55            "input": "Summarize this article: ...",
  56            "expected_output": "Expected summary...",
  57            "metadata": {"difficulty": "medium", "category": "news"}
  58        }
  59        ```
  60
  61        Classification item:
  62        ```python
  63        item: LocalExperimentItem = {
  64            "input": {"text": "This movie is great!", "context": "movie review"},
  65            "expected_output": "positive",
  66            "metadata": {"dataset_source": "imdb", "confidence": 0.95}
  67        }
  68        ```
  69
  70        Minimal item with only input:
  71        ```python
  72        item: LocalExperimentItem = {
  73            "input": "What is the capital of France?"
  74        }
  75        ```
  76    """
  77
  78    input: Any
  79    expected_output: Any
  80    metadata: Optional[Dict[str, Any]]
  81
  82
# A single experiment item: either a plain dict following LocalExperimentItem
# or a DatasetItem (imported from langfuse.api). Code consuming this alias has
# to handle both key access (dict) and attribute access (DatasetItem).
ExperimentItem = Union[LocalExperimentItem, DatasetItem]
"""Type alias for items that can be processed in experiments.

Can be either:
- LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys
- DatasetItem: Items from Langfuse datasets with .input, .expected_output, .metadata attributes
"""

# The whole input collection for an experiment. Note this is a union of two
# homogeneous list types, not a list of the ExperimentItem union: a single
# list is typed as containing only local items or only dataset items.
ExperimentData = Union[List[LocalExperimentItem], List[DatasetItem]]
"""Type alias for experiment datasets.

Represents the collection of items to process in an experiment. Can be either:
- List[LocalExperimentItem]: Local data items as dictionaries
- List[DatasetItem]: Items from a Langfuse dataset (typically from dataset.items)
"""
  98
  99
 100class Evaluation:
 101    """Represents an evaluation result for an experiment item or an entire experiment run.
 102
 103    This class provides a strongly-typed way to create evaluation results in evaluator functions.
 104    Users must use keyword arguments when instantiating this class.
 105
 106    Attributes:
 107        name: Unique identifier for the evaluation metric. Should be descriptive
 108            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
 109            Used for aggregation and comparison across experiment runs.
 110        value: The evaluation score or result. Can be:
 111            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
 112            - String: For categorical results like "positive", "negative", "neutral"
 113            - Boolean: For binary assessments like "passes_safety_check"
 114        comment: Optional human-readable explanation of the evaluation result.
 115            Useful for providing context, explaining scoring rationale, or noting
 116            special conditions. Displayed in Langfuse UI for interpretability.
 117        metadata: Optional structured metadata about the evaluation process.
 118            Can include confidence scores, intermediate calculations, model versions,
 119            or any other relevant technical details.
 120        data_type: Optional score data type. Required if value is not NUMERIC.
 121            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
 122        config_id: Optional Langfuse score config ID.
 123
 124    Examples:
 125        Basic accuracy evaluation:
 126        ```python
 127        from langfuse import Evaluation
 128
 129        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
 130            if not expected_output:
 131                return Evaluation(name="accuracy", value=0, comment="No expected output")
 132
 133            is_correct = output.strip().lower() == expected_output.strip().lower()
 134            return Evaluation(
 135                name="accuracy",
 136                value=1.0 if is_correct else 0.0,
 137                comment="Correct answer" if is_correct else "Incorrect answer"
 138            )
 139        ```
 140
 141        Multi-metric evaluator:
 142        ```python
 143        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
 144            return [
 145                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
 146                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
 147                Evaluation(
 148                    name="quality",
 149                    value=0.85,
 150                    comment="High quality response",
 151                    metadata={"confidence": 0.92, "model": "gpt-4"}
 152                )
 153            ]
 154        ```
 155
 156        Categorical evaluation:
 157        ```python
 158        def sentiment_evaluator(*, input, output, **kwargs):
 159            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
 160            return Evaluation(
 161                name="sentiment",
 162                value=sentiment,
 163                comment=f"Response expresses {sentiment} sentiment",
 164                data_type="CATEGORICAL"
 165            )
 166        ```
 167
 168        Failed evaluation with error handling:
 169        ```python
 170        def external_api_evaluator(*, input, output, **kwargs):
 171            try:
 172                score = external_api.evaluate(output)
 173                return Evaluation(name="external_score", value=score)
 174            except Exception as e:
 175                return Evaluation(
 176                    name="external_score",
 177                    value=0,
 178                    comment=f"API unavailable: {e}",
 179                    metadata={"error": str(e), "retry_count": 3}
 180                )
 181        ```
 182
 183    Note:
 184        All arguments must be passed as keywords. Positional arguments are not allowed
 185        to ensure code clarity and prevent errors from argument reordering.
 186    """
 187
 188    def __init__(
 189        self,
 190        *,
 191        name: str,
 192        value: Union[int, float, str, bool],
 193        comment: Optional[str] = None,
 194        metadata: Optional[Dict[str, Any]] = None,
 195        data_type: Optional[ExperimentScoreType] = None,
 196        config_id: Optional[str] = None,
 197    ):
 198        """Initialize an Evaluation with the provided data.
 199
 200        Args:
 201            name: Unique identifier for the evaluation metric.
 202            value: The evaluation score or result.
 203            comment: Optional human-readable explanation of the result.
 204            metadata: Optional structured metadata about the evaluation process.
 205            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
 206            config_id: Optional Langfuse score config ID.
 207
 208        Note:
 209            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
 210        """
 211        self.name = name
 212        self.value = value
 213        self.comment = comment
 214        self.metadata = metadata
 215        self.data_type = data_type
 216        self.config_id = config_id
 217
 218
 219class ExperimentItemResult:
 220    """Result structure for individual experiment items.
 221
 222    This class represents the complete result of processing a single item
 223    during an experiment run, including the original input, task output,
 224    evaluations, and tracing information. Users must use keyword arguments when instantiating this class.
 225
 226    Attributes:
 227        item: The original experiment item that was processed. Can be either
 228            a dictionary with 'input', 'expected_output', and 'metadata' keys,
 229            or a DatasetItem from Langfuse datasets.
 230        output: The actual output produced by the task function for this item.
 231            Can be any type depending on what your task function returns.
 232        evaluations: List of evaluation results for this item. Each evaluation
 233            contains a name, value, optional comment, and optional metadata.
 234        trace_id: Optional Langfuse trace ID for this item's execution. Used
 235            to link the experiment result with the detailed trace in Langfuse UI.
 236        dataset_run_id: Optional dataset run ID if this item was part of a
 237            Langfuse dataset. None for local experiments.
 238
 239    Examples:
 240        Accessing item result data:
 241        ```python
 242        result = langfuse.run_experiment(...)
 243        for item_result in result.item_results:
 244            print(f"Input: {item_result.item}")
 245            print(f"Output: {item_result.output}")
 246            print(f"Trace: {item_result.trace_id}")
 247
 248            # Access evaluations
 249            for evaluation in item_result.evaluations:
 250                print(f"{evaluation.name}: {evaluation.value}")
 251        ```
 252
 253        Working with different item types:
 254        ```python
 255        # Local experiment item (dict)
 256        if isinstance(item_result.item, dict):
 257            input_data = item_result.item["input"]
 258            expected = item_result.item.get("expected_output")
 259
 260        # Langfuse dataset item (object with attributes)
 261        else:
 262            input_data = item_result.item.input
 263            expected = item_result.item.expected_output
 264        ```
 265
 266    Note:
 267        All arguments must be passed as keywords. Positional arguments are not allowed
 268        to ensure code clarity and prevent errors from argument reordering.
 269    """
 270
 271    def __init__(
 272        self,
 273        *,
 274        item: ExperimentItem,
 275        output: Any,
 276        evaluations: List[Evaluation],
 277        trace_id: Optional[str],
 278        dataset_run_id: Optional[str],
 279    ):
 280        """Initialize an ExperimentItemResult with the provided data.
 281
 282        Args:
 283            item: The original experiment item that was processed.
 284            output: The actual output produced by the task function for this item.
 285            evaluations: List of evaluation results for this item.
 286            trace_id: Optional Langfuse trace ID for this item's execution.
 287            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
 288
 289        Note:
 290            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
 291        """
 292        self.item = item
 293        self.output = output
 294        self.evaluations = evaluations
 295        self.trace_id = trace_id
 296        self.dataset_run_id = dataset_run_id
 297
 298
 299class ExperimentResult:
 300    """Complete result structure for experiment execution.
 301
 302    This class encapsulates the complete results of running an experiment on a dataset,
 303    including individual item results, aggregate run-level evaluations, and metadata
 304    about the experiment execution.
 305
 306    Attributes:
 307        name: The name of the experiment as specified during execution.
 308        run_name: The name of the current experiment run.
 309        description: Optional description of the experiment's purpose or methodology.
 310        item_results: List of results from processing each individual dataset item,
 311            containing the original item, task output, evaluations, and trace information.
 312        run_evaluations: List of aggregate evaluation results computed across all items,
 313            such as average scores, statistical summaries, or cross-item analyses.
 314        experiment_id: ID of the experiment run propagated across all items. For
 315            Langfuse datasets, this matches the dataset run ID. For local experiments,
 316            this is a stable SDK-generated identifier for the run.
 317        dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
 318        dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
 319
 320    Examples:
 321        Basic usage with local dataset:
 322        ```python
 323        result = langfuse.run_experiment(
 324            name="Capital Cities Test",
 325            data=local_data,
 326            task=generate_capital,
 327            evaluators=[accuracy_check]
 328        )
 329
 330        print(f"Processed {len(result.item_results)} items")
 331        print(result.format())  # Human-readable summary
 332
 333        # Access individual results
 334        for item_result in result.item_results:
 335            print(f"Input: {item_result.item}")
 336            print(f"Output: {item_result.output}")
 337            print(f"Scores: {item_result.evaluations}")
 338        ```
 339
 340        Usage with Langfuse datasets:
 341        ```python
 342        dataset = langfuse.get_dataset("qa-eval-set")
 343        result = dataset.run_experiment(
 344            name="GPT-4 QA Evaluation",
 345            task=answer_question,
 346            evaluators=[relevance_check, accuracy_check]
 347        )
 348
 349        # View in Langfuse UI
 350        if result.dataset_run_url:
 351            print(f"View detailed results: {result.dataset_run_url}")
 352        ```
 353
 354        Formatted output:
 355        ```python
 356        # Get summary view
 357        summary = result.format()
 358        print(summary)
 359
 360        # Get detailed view with individual items
 361        detailed = result.format(include_item_results=True)
 362        with open("experiment_report.txt", "w") as f:
 363            f.write(detailed)
 364        ```
 365    """
 366
 367    def __init__(
 368        self,
 369        *,
 370        name: str,
 371        run_name: str,
 372        description: Optional[str],
 373        item_results: List[ExperimentItemResult],
 374        run_evaluations: List[Evaluation],
 375        experiment_id: str,
 376        dataset_run_id: Optional[str] = None,
 377        dataset_run_url: Optional[str] = None,
 378    ):
 379        """Initialize an ExperimentResult with the provided data.
 380
 381        Args:
 382            name: The name of the experiment.
 383            run_name: The current experiment run name.
 384            description: Optional description of the experiment.
 385            item_results: List of results from processing individual dataset items.
 386            run_evaluations: List of aggregate evaluation results for the entire run.
 387            experiment_id: ID of the experiment run.
 388            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
 389            dataset_run_url: Optional URL to view results in Langfuse UI.
 390        """
 391        self.name = name
 392        self.run_name = run_name
 393        self.description = description
 394        self.item_results = item_results
 395        self.run_evaluations = run_evaluations
 396        self.experiment_id = experiment_id
 397        self.dataset_run_id = dataset_run_id
 398        self.dataset_run_url = dataset_run_url
 399
 400    def format(self, *, include_item_results: bool = False) -> str:
 401        r"""Format the experiment result for human-readable display.
 402
 403        Converts the experiment result into a nicely formatted string suitable for
 404        console output, logging, or reporting. The output includes experiment overview,
 405        aggregate statistics, and optionally individual item details.
 406
 407        This method provides a comprehensive view of experiment performance including:
 408        - Experiment metadata (name, description, item count)
 409        - List of evaluation metrics used across items
 410        - Average scores computed across all processed items
 411        - Run-level evaluation results (aggregate metrics)
 412        - Links to view detailed results in Langfuse UI (when available)
 413        - Individual item details (when requested)
 414
 415        Args:
 416            include_item_results: Whether to include detailed results for each individual
 417                item in the formatted output. When False (default), only shows aggregate
 418                statistics and summary information. When True, includes input/output/scores
 419                for every processed item, making the output significantly longer but more
 420                detailed for debugging and analysis purposes.
 421
 422        Returns:
 423            A formatted multi-line string containing:
 424            - Experiment name and description (if provided)
 425            - Total number of items successfully processed
 426            - List of all evaluation metrics that were applied
 427            - Average scores across all items for each numeric metric
 428            - Run-level evaluation results with comments
 429            - Dataset run URL for viewing in Langfuse UI (if applicable)
 430            - Individual item details including inputs, outputs, and scores (if requested)
 431
 432        Examples:
 433            Basic usage showing aggregate results only:
 434            ```python
 435            result = langfuse.run_experiment(
 436                name="Capital Cities",
 437                data=dataset,
 438                task=generate_capital,
 439                evaluators=[accuracy_evaluator]
 440            )
 441
 442            print(result.format())
 443            # Output:
 444            # ──────────────────────────────────────────────────
 445            # 📊 Capital Cities
 446            # 100 items
 447            # Evaluations:
 448            #   • accuracy
 449            # Average Scores:
 450            #   • accuracy: 0.850
 451            ```
 452
 453            Detailed output including all individual item results:
 454            ```python
 455            detailed_report = result.format(include_item_results=True)
 456            print(detailed_report)
 457            # Output includes each item:
 458            # 1. Item 1:
 459            #    Input:    What is the capital of France?
 460            #    Expected: Paris
 461            #    Actual:   The capital of France is Paris.
 462            #    Scores:
 463            #      • accuracy: 1.000
 464            #        💭 Correct answer found
 465            # [... continues for all items ...]
 466            ```
 467
 468            Saving formatted results to file for reporting:
 469            ```python
 470            with open("experiment_report.txt", "w") as f:
 471                f.write(result.format(include_item_results=True))
 472
 473            # Or create summary report
 474            summary = result.format()  # Aggregate view only
 475            print(f"Experiment Summary:\n{summary}")
 476            ```
 477
 478            Integration with logging systems:
 479            ```python
 480            import logging
 481            logger = logging.getLogger("experiments")
 482
 483            # Log summary after experiment
 484            logger.info(f"Experiment completed:\n{result.format()}")
 485
 486            # Log detailed results for failed experiments
 487            if any(eval['value'] < threshold for eval in result.run_evaluations):
 488                logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
 489            ```
 490        """
 491        if not self.item_results:
 492            return "No experiment results to display."
 493
 494        output = ""
 495
 496        # Individual results section
 497        if include_item_results:
 498            for i, result in enumerate(self.item_results):
 499                output += f"\n{i + 1}. Item {i + 1}:\n"
 500
 501                # Extract and display input
 502                item_input = None
 503                if isinstance(result.item, dict):
 504                    item_input = result.item.get("input")
 505                elif hasattr(result.item, "input"):
 506                    item_input = result.item.input
 507
 508                if item_input is not None:
 509                    output += f"   Input:    {_format_value(item_input)}\n"
 510
 511                # Extract and display expected output
 512                expected_output = None
 513                if isinstance(result.item, dict):
 514                    expected_output = result.item.get("expected_output")
 515                elif hasattr(result.item, "expected_output"):
 516                    expected_output = result.item.expected_output
 517
 518                if expected_output is not None:
 519                    output += f"   Expected: {_format_value(expected_output)}\n"
 520                output += f"   Actual:   {_format_value(result.output)}\n"
 521
 522                # Display evaluation scores
 523                if result.evaluations:
 524                    output += "   Scores:\n"
 525                    for evaluation in result.evaluations:
 526                        score = evaluation.value
 527                        if isinstance(score, (int, float)):
 528                            score = f"{score:.3f}"
 529                        output += f"     • {evaluation.name}: {score}"
 530                        if evaluation.comment:
 531                            output += f"\n       💭 {evaluation.comment}"
 532                        output += "\n"
 533
 534                # Display trace link if available
 535                if result.trace_id:
 536                    output += f"\n   Trace ID: {result.trace_id}\n"
 537        else:
 538            output += f"Individual Results: Hidden ({len(self.item_results)} items)\n"
 539            output += "💡 Set include_item_results=True to view them\n"
 540
 541        # Experiment overview section
 542        output += f"\n{'─' * 50}\n"
 543        output += f"🧪 Experiment: {self.name}"
 544        output += f"\n📋 Run name: {self.run_name}"
 545        if self.description:
 546            output += f" - {self.description}"
 547
 548        output += f"\n{len(self.item_results)} items"
 549
 550        # Collect unique evaluation names across all items
 551        evaluation_names = set()
 552        for result in self.item_results:
 553            for evaluation in result.evaluations:
 554                evaluation_names.add(evaluation.name)
 555
 556        if evaluation_names:
 557            output += "\nEvaluations:"
 558            for eval_name in evaluation_names:
 559                output += f"\n{eval_name}"
 560            output += "\n"
 561
 562        # Calculate and display average scores
 563        if evaluation_names:
 564            output += "\nAverage Scores:"
 565            for eval_name in evaluation_names:
 566                scores = []
 567                for result in self.item_results:
 568                    for evaluation in result.evaluations:
 569                        if evaluation.name == eval_name and isinstance(
 570                            evaluation.value, (int, float)
 571                        ):
 572                            scores.append(evaluation.value)
 573
 574                if scores:
 575                    avg = sum(scores) / len(scores)
 576                    output += f"\n{eval_name}: {avg:.3f}"
 577            output += "\n"
 578
 579        # Display run-level evaluations
 580        if self.run_evaluations:
 581            output += "\nRun Evaluations:"
 582            for run_eval in self.run_evaluations:
 583                score = run_eval.value
 584                if isinstance(score, (int, float)):
 585                    score = f"{score:.3f}"
 586                output += f"\n{run_eval.name}: {score}"
 587                if run_eval.comment:
 588                    output += f"\n    💭 {run_eval.comment}"
 589            output += "\n"
 590
 591        # Add dataset run URL if available
 592        if self.dataset_run_url:
 593            output += f"\n🔗 Dataset Run:\n   {self.dataset_run_url}"
 594
 595        return output
 596
 597
 598class TaskFunction(Protocol):
 599    """Protocol defining the interface for experiment task functions.
 600
 601    Task functions are the core processing functions that operate on each item
 602    in an experiment dataset. They receive an experiment item as input and
 603    produce some output that will be evaluated.
 604
 605    Task functions must:
 606    - Accept 'item' as a keyword argument
 607    - Return any type of output (will be passed to evaluators)
 608    - Can be either synchronous or asynchronous
 609    - Should handle their own errors gracefully (exceptions will be logged)
 610    """
 611
 612    def __call__(
 613        self,
 614        *,
 615        item: ExperimentItem,
 616        **kwargs: Dict[str, Any],
 617    ) -> Union[Any, Awaitable[Any]]:
 618        """Execute the task on an experiment item.
 619
 620        This method defines the core processing logic for each item in your experiment.
 621        The implementation should focus on the specific task you want to evaluate,
 622        such as text generation, classification, summarization, etc.
 623
 624        Args:
 625            item: The experiment item to process. Can be either:
 626                - Dict with keys like 'input', 'expected_output', 'metadata'
 627                - Langfuse DatasetItem object with .input, .expected_output attributes
 628            **kwargs: Additional keyword arguments that may be passed by the framework
 629
 630        Returns:
 631            Any: The output of processing the item. This output will be:
 632            - Stored in the experiment results
 633            - Passed to all item-level evaluators for assessment
 634            - Traced automatically in Langfuse for observability
 635
 636            Can return either a direct value or an awaitable (async) result.
 637
 638        Examples:
 639            Simple synchronous task:
 640            ```python
 641            def my_task(*, item, **kwargs):
 642                prompt = f"Summarize: {item['input']}"
 643                return my_llm_client.generate(prompt)
 644            ```
 645
 646            Async task with error handling:
 647            ```python
 648            async def my_async_task(*, item, **kwargs):
 649                try:
 650                    response = await openai_client.chat.completions.create(
 651                        model="gpt-4",
 652                        messages=[{"role": "user", "content": item["input"]}]
 653                    )
 654                    return response.choices[0].message.content
 655                except Exception as e:
 656                    # Log error and return fallback
 657                    print(f"Task failed for item {item}: {e}")
 658                    return "Error: Could not process item"
 659            ```
 660
 661            Task using dataset item attributes:
 662            ```python
 663            def classification_task(*, item, **kwargs):
 664                # Works with both dict items and DatasetItem objects
 665                text = item["input"] if isinstance(item, dict) else item.input
 666                return classify_text(text)
 667            ```
 668        """
 669        ...
 670
 671
 672class EvaluatorFunction(Protocol):
 673    """Protocol defining the interface for item-level evaluator functions.
 674
 675    Item-level evaluators assess the quality, correctness, or other properties
 676    of individual task outputs. They receive the input, output, expected output,
 677    and metadata for each item and return evaluation metrics.
 678
 679    Evaluators should:
 680    - Accept input, output, expected_output, and metadata as keyword arguments
 681    - Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields
 682    - Be deterministic when possible for reproducible results
 683    - Handle edge cases gracefully (missing expected output, malformed data, etc.)
 684    - Can be either synchronous or asynchronous
 685    """
 686
 687    def __call__(
 688        self,
 689        *,
 690        input: Any,
 691        output: Any,
 692        expected_output: Any,
 693        metadata: Optional[Dict[str, Any]],
 694        **kwargs: Dict[str, Any],
 695    ) -> Union[
 696        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
 697    ]:
 698        r"""Evaluate a task output for quality, correctness, or other metrics.
 699
 700        This method should implement specific evaluation logic such as accuracy checking,
 701        similarity measurement, toxicity detection, fluency assessment, etc.
 702
 703        Args:
 704            input: The original input that was passed to the task function.
 705                This is typically the item['input'] or item.input value.
 706            output: The output produced by the task function for this input.
 707                This is the direct return value from your task function.
 708            expected_output: The expected/ground truth output for comparison.
 709                May be None if not available in the dataset. Evaluators should
 710                handle this case appropriately.
 711            metadata: Optional metadata from the experiment item that might
 712                contain additional context for evaluation (categories, difficulty, etc.)
 713            **kwargs: Additional keyword arguments that may be passed by the framework
 714
 715        Returns:
 716            Evaluation results in one of these formats:
 717            - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."}
 718            - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}]
 719            - Awaitable returning either of the above (for async evaluators)
 720
 721            Each Evaluation dict should contain:
 722            - name (str): Unique identifier for this evaluation metric
 723            - value (int|float|str|bool): The evaluation score or result
 724            - comment (str, optional): Human-readable explanation of the result
 725            - metadata (dict, optional): Additional structured data about the evaluation
 726
 727        Examples:
 728            Simple accuracy evaluator:
 729            ```python
 730            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
 731                if expected_output is None:
 732                    return {"name": "accuracy", "value": 0, "comment": "No expected output"}
 733
 734                is_correct = output.strip().lower() == expected_output.strip().lower()
 735                return {
 736                    "name": "accuracy",
 737                    "value": 1.0 if is_correct else 0.0,
 738                    "comment": "Exact match" if is_correct else "No match"
 739                }
 740            ```
 741
 742            Multi-metric evaluator:
 743            ```python
 744            def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
 745                results = []
 746
 747                # Length check
 748                results.append({
 749                    "name": "output_length",
 750                    "value": len(output),
 751                    "comment": f"Output contains {len(output)} characters"
 752                })
 753
 754                # Sentiment analysis
 755                sentiment_score = analyze_sentiment(output)
 756                results.append({
 757                    "name": "sentiment",
 758                    "value": sentiment_score,
 759                    "comment": f"Sentiment score: {sentiment_score:.2f}"
 760                })
 761
 762                return results
 763            ```
 764
 765            Async evaluator using external API:
 766            ```python
 767            async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs):
 768                prompt = f"Rate the quality of this response on a scale of 1-10:\n"
 769                prompt += f"Question: {input}\nResponse: {output}"
 770
 771                response = await openai_client.chat.completions.create(
 772                    model="gpt-4",
 773                    messages=[{"role": "user", "content": prompt}]
 774                )
 775
 776                try:
 777                    score = float(response.choices[0].message.content.strip())
 778                    return {
 779                        "name": "llm_judge_quality",
 780                        "value": score,
 781                        "comment": f"LLM judge rated this {score}/10"
 782                    }
 783                except ValueError:
 784                    return {
 785                        "name": "llm_judge_quality",
 786                        "value": 0,
 787                        "comment": "Could not parse LLM judge score"
 788                    }
 789            ```
 790
 791            Context-aware evaluator:
 792            ```python
 793            def context_evaluator(*, input, output, metadata=None, **kwargs):
 794                # Use metadata for context-specific evaluation
 795                difficulty = metadata.get("difficulty", "medium") if metadata else "medium"
 796
 797                # Adjust expectations based on difficulty
 798                min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty]
 799
 800                meets_requirement = len(output) >= min_length
 801                return {
 802                    "name": f"meets_{difficulty}_requirement",
 803                    "value": meets_requirement,
 804                    "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement"
 805                }
 806            ```
 807        """
 808        ...
 809
 810
class RunEvaluatorFunction(Protocol):
    """Protocol defining the interface for run-level evaluator functions.

    Run-level evaluators assess aggregate properties of the entire experiment run,
    computing metrics that span across all items rather than individual outputs.
    They receive the complete results from all processed items and can compute
    statistics like averages, distributions, correlations, or other aggregate metrics.

    Run evaluators should:
    - Accept item_results as a keyword argument containing all item results
    - Return Evaluation object(s) or dict(s) with aggregate metrics
    - Handle runs where some items failed processing (failed items are not part
      of item_results, so the list may be shorter than the dataset, or empty)
    - Compute meaningful statistics across the dataset
    - Can be either synchronous or asynchronous
    """

    def __call__(
        self,
        *,
        item_results: List[ExperimentItemResult],
        **kwargs: Dict[str, Any],
    ) -> Union[
        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
    ]:
        r"""Evaluate the entire experiment run with aggregate metrics.

        This method should implement aggregate evaluation logic such as computing
        averages, calculating distributions, finding correlations, detecting patterns
        across items, or performing statistical analysis on the experiment results.

        Args:
            item_results: List of results from all successfully processed experiment items.
                Each item result contains:
                - item: The original experiment item
                - output: The task function's output for this item
                - evaluations: List of item-level evaluation results
                - trace_id: Langfuse trace ID for this execution
                - dataset_run_id: Dataset run ID (if using Langfuse datasets)

                Note: This list only includes items that were successfully processed.
                Failed items are excluded but logged separately.
            **kwargs: Additional keyword arguments that may be passed by the framework

        Returns:
            Evaluation results in one of these formats:
            - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."}
            - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}]
            - Awaitable returning either of the above (for async evaluators)

            Each Evaluation dict should contain:
            - name (str): Unique identifier for this run-level metric
            - value (int|float|str|bool): The aggregate evaluation result
            - comment (str, optional): Human-readable explanation of the metric
            - metadata (dict, optional): Additional structured data about the evaluation

        Examples:
            Average accuracy calculator:
            ```python
            def average_accuracy(*, item_results, **kwargs):
                if not item_results:
                    return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"}

                accuracy_values = []
                for result in item_results:
                    for evaluation in result.evaluations:
                        if evaluation.name == "accuracy":
                            accuracy_values.append(evaluation.value)

                if not accuracy_values:
                    return {"name": "avg_accuracy", "value": 0.0, "comment": "No accuracy evaluations found"}

                avg = sum(accuracy_values) / len(accuracy_values)
                return {
                    "name": "avg_accuracy",
                    "value": avg,
                    "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}"
                }
            ```

            Multiple aggregate metrics:
            ```python
            def statistical_summary(*, item_results, **kwargs):
                if not item_results:
                    return []

                results = []

                # Calculate output length statistics
                lengths = [len(str(result.output)) for result in item_results]
                results.extend([
                    {"name": "avg_output_length", "value": sum(lengths) / len(lengths)},
                    {"name": "min_output_length", "value": min(lengths)},
                    {"name": "max_output_length", "value": max(lengths)}
                ])

                # Success rate
                total_items = len(item_results)  # Only successful items are included
                results.append({
                    "name": "processing_success_rate",
                    "value": 1.0,  # All items in item_results succeeded
                    "comment": f"Successfully processed {total_items} items"
                })

                return results
            ```

            Async run evaluator with external analysis:
            ```python
            async def llm_batch_analysis(*, item_results, **kwargs):
                # Prepare batch analysis prompt
                outputs = [result.output for result in item_results]
                prompt = f"Analyze these {len(outputs)} outputs for common themes:\n"
                prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs))

                response = await openai_client.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}]
                )

                return {
                    "name": "thematic_analysis",
                    "value": response.choices[0].message.content,
                    "comment": f"LLM analysis of {len(outputs)} outputs"
                }
            ```

            Performance distribution analysis:
            ```python
            def performance_distribution(*, item_results, **kwargs):
                # Imported up front so both statistics blocks below can use it
                import statistics

                # Extract all evaluation scores
                all_scores = []
                score_by_metric = {}

                for result in item_results:
                    for evaluation in result.evaluations:
                        metric_name = evaluation.name
                        value = evaluation.value

                        if isinstance(value, (int, float)):
                            all_scores.append(value)
                            if metric_name not in score_by_metric:
                                score_by_metric[metric_name] = []
                            score_by_metric[metric_name].append(value)

                results = []

                # Overall score distribution
                if all_scores:
                    results.append({
                        "name": "score_std_dev",
                        "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0,
                        "comment": "Standard deviation across all numeric scores"
                    })

                # Per-metric statistics
                for metric, scores in score_by_metric.items():
                    if len(scores) > 1:
                        results.append({
                            "name": f"{metric}_variance",
                            "value": statistics.variance(scores),
                            "comment": f"Variance in {metric} across {len(scores)} items"
                        })

                return results
            ```
        """
        ...
 979
 980
 981def _format_value(value: Any) -> str:
 982    """Format a value for display."""
 983    if isinstance(value, str):
 984        return value[:50] + "..." if len(value) > 50 else value
 985    return str(value)
 986
 987
 988async def _run_evaluator(
 989    evaluator: Union[EvaluatorFunction, RunEvaluatorFunction], **kwargs: Any
 990) -> List[Evaluation]:
 991    """Run an evaluator function and normalize the result."""
 992    try:
 993        result = evaluator(**kwargs)
 994
 995        # Handle async evaluators
 996        if asyncio.iscoroutine(result):
 997            result = await result
 998
 999        # Normalize to list
1000        if isinstance(result, (dict, Evaluation)):
1001            return [result]  # type: ignore
1002
1003        elif isinstance(result, list):
1004            return result
1005
1006        else:
1007            return []
1008
1009    except Exception as e:
1010        evaluator_name = getattr(evaluator, "__name__", "unknown_evaluator")
1011        logger.error(f"Evaluator {evaluator_name} failed: {e}")
1012        return []
1013
1014
1015async def _run_task(task: TaskFunction, item: ExperimentItem) -> Any:
1016    """Run a task function and handle sync/async."""
1017    result = task(item=item)
1018
1019    # Handle async tasks
1020    if asyncio.iscoroutine(result):
1021        result = await result
1022
1023    return result
1024
1025
def create_evaluator_from_autoevals(
    autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]]
) -> EvaluatorFunction:
    """Adapt an autoevals evaluator to the Langfuse evaluator interface.

    Args:
        autoevals_evaluator: An autoevals evaluator instance (any callable
            accepting ``input``, ``output`` and ``expected`` keyword arguments).
        **kwargs: Extra keyword arguments passed to the autoevals evaluator on
            every invocation.

    Returns:
        A Langfuse-compatible evaluator function that calls the wrapped
        evaluator and converts its result into an ``Evaluation``.
    """

    def langfuse_evaluator(
        *,
        input: Any,
        output: Any,
        expected_output: Any,
        metadata: Optional[Dict[str, Any]],
        **langfuse_kwargs: Dict[str, Any],
    ) -> Evaluation:
        # The Langfuse `expected_output` is handed over as `expected`; the
        # item metadata and any extra Langfuse kwargs are not forwarded.
        scored = autoevals_evaluator(
            input=input, output=output, expected=expected_output, **kwargs
        )

        # The result's `score` becomes the value; the comment is read from the
        # "comment" key of its metadata when metadata is present and non-empty.
        return Evaluation(
            name=scored.name,
            value=scored.score,
            comment=scored.metadata.get("comment") if scored.metadata else None,
            metadata=scored.metadata,
        )

    return langfuse_evaluator
1059
1060
class RunnerContext:
    """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action builds a
    ``RunnerContext`` before invoking the user's ``experiment(context)``
    function. Defaults set here (dataset, metadata tags) are applied when
    the user omits them on the :meth:`run_experiment` call; users can
    override any default by passing the corresponding argument explicitly.
    """

    def __init__(
        self,
        *,
        client: "Langfuse",
        data: Optional[ExperimentData] = None,
        dataset_version: Optional[datetime] = None,
        metadata: Optional[Dict[str, str]] = None,
    ):
        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.

        Typically called by the ``langfuse/experiment-action`` GitHub Action,
        not by end users directly. Every field except ``client`` is optional:
        fields left as ``None`` simply mean the corresponding argument must be
        supplied on the :meth:`run_experiment` call.

        Args:
            client: Initialized Langfuse SDK client used to execute the
                experiment. The action creates this from the
                ``langfuse_public_key`` / ``langfuse_secret_key`` /
                ``langfuse_base_url`` inputs.
            data: Default dataset items to run the experiment on. Accepts
                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
                Injected by the action when ``dataset_name`` is configured.
                If ``None``, the user must pass ``data=`` to
                :meth:`run_experiment`.
            dataset_version: Optional pinned dataset version. Injected by the
                action when ``dataset_version`` is configured.
            metadata: Default metadata attached to every experiment trace and
                the dataset run. The action injects GitHub-sourced tags (SHA,
                PR link, workflow run link, branch, GH user, etc.). Merged
                with any ``metadata`` passed to :meth:`run_experiment`, with
                user-supplied keys winning on collision.
        """
        self.client = client
        self.data = data
        self.dataset_version = dataset_version
        self.metadata = metadata

    def run_experiment(
        self,
        *,
        name: str,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
        data: Optional[ExperimentData] = None,
        task: TaskFunction,
        evaluators: Optional[List[EvaluatorFunction]] = None,
        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
        run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
        max_concurrency: int = 50,
        metadata: Optional[Dict[str, str]] = None,
        _dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Run an experiment through the wrapped client, applying context defaults.

        Arguments not mentioned below are forwarded unchanged to the client's
        ``run_experiment``.

        Args:
            name: Experiment name.
            run_name: Optional run name.
            description: Optional description.
            data: Items to run on. Falls back to the context's ``data`` when
                omitted (``None``).
            task: Task function executed for each item.
            evaluators: Item-level evaluators. ``None`` (the default) is
                forwarded as a new empty list.
            composite_evaluator: Optional composite evaluator.
            run_evaluators: Run-level evaluators. ``None`` (the default) is
                forwarded as a new empty list.
            max_concurrency: Forwarded concurrency limit (default 50).
            metadata: Metadata merged over the context's default metadata;
                keys given here win on collision. ``None`` is forwarded only
                when neither the context nor this call supplies metadata.
            _dataset_version: Dataset version override. Falls back to the
                context's ``dataset_version`` when omitted.

        Returns:
            Whatever the client's ``run_experiment`` returns.

        Raises:
            ValueError: If ``data`` is provided neither here nor on the
                context.
        """
        resolved_data = data if data is not None else self.data
        if resolved_data is None:
            raise ValueError(
                "`data` must be provided either on the RunnerContext or the run_experiment call"
            )

        resolved_dataset_version = (
            _dataset_version if _dataset_version is not None else self.dataset_version
        )

        merged_metadata: Optional[Dict[str, str]]
        if self.metadata is None and metadata is None:
            merged_metadata = None
        else:
            # Later entries win, so call-site keys override context defaults.
            merged_metadata = {**(self.metadata or {}), **(metadata or {})}

        # `None` defaults avoid sharing one mutable default list across every
        # call; each call that omits the argument gets its own empty list.
        resolved_evaluators = evaluators if evaluators is not None else []
        resolved_run_evaluators = run_evaluators if run_evaluators is not None else []

        return self.client.run_experiment(
            name=name,
            run_name=run_name,
            description=description,
            data=resolved_data,
            task=task,
            evaluators=resolved_evaluators,
            composite_evaluator=composite_evaluator,
            run_evaluators=resolved_run_evaluators,
            max_concurrency=max_concurrency,
            metadata=merged_metadata,
            _dataset_version=resolved_dataset_version,
        )
1154
1155
class RegressionError(Exception):
    """Raised by a user's ``experiment`` function to signal a CI gate failure.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action catches this
    exception and, when ``should_fail_on_error`` is enabled, fails the
    workflow run and renders a callout in the PR comment using
    ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.

    Callers choose one of three forms:

    - ``RegressionError(result=r)`` — minimal, generic message.
    - ``RegressionError(result=r, message="...")`` — free-form message.
    - ``RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)`` —
      structured; ``metric`` and ``value`` must be provided together so the
      action can render a targeted callout without ``None`` placeholders.
      ``threshold`` is optional and is left out of the message when omitted.

    Attributes:
        result: The experiment result the regression was detected on.
        metric: Name of the regressed metric, or ``None``.
        value: Observed metric value, or ``None``.
        threshold: Threshold the value was compared against, or ``None``.
    """

    @overload
    def __init__(self, *, result: ExperimentResult) -> None: ...
    @overload
    def __init__(self, *, result: ExperimentResult, message: str) -> None: ...
    @overload
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: str,
        value: float,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ) -> None: ...
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: Optional[str] = None,
        value: Optional[float] = None,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ):
        """Store the regression details and build the exception message.

        Message precedence: an explicit ``message`` wins; otherwise a
        structured message is built when both ``metric`` and ``value`` are
        given; otherwise a generic message is used.

        Args:
            result: The experiment result the regression was detected on.
            metric: Name of the regressed metric.
            value: Observed value of the metric.
            threshold: Optional threshold the value was compared against.
            message: Optional free-form message overriding the generated one.
        """
        self.result = result
        self.metric = metric
        self.value = value
        self.threshold = threshold
        if message is not None:
            formatted = message
        elif metric is not None and value is not None:
            formatted = f"Regression on `{metric}`: {value}"
            # Only mention the threshold when one was given, so the message
            # never contains a literal "threshold None".
            if threshold is not None:
                formatted += f" (threshold {threshold})"
        else:
            formatted = "Experiment regression detected"
        super().__init__(formatted)
class LocalExperimentItem(typing.TypedDict):
class LocalExperimentItem(TypedDict, total=False):
    """Structure for local experiment data items (not from Langfuse datasets).

    This TypedDict defines the structure for experiment items when using local data
    rather than Langfuse-hosted datasets. It is declared with ``total=False``, so
    every key is optional and may be left out of the dict entirely.

    Attributes:
        input: The input data to pass to the task function. Can be any type that
            your task function can process (string, dict, list, etc.). This is
            typically the prompt, question, or data that your task will operate on.
        expected_output: Optional expected/ground truth output for evaluation purposes.
            Used by evaluators to assess correctness or quality. Can be None if
            no ground truth is available.
        metadata: Optional metadata dictionary containing additional context about
            this specific item. Can include information like difficulty level,
            category, source, or any other relevant attributes that evaluators
            might use for context-aware evaluation.

    Examples:
        Simple text processing item:
        ```python
        item: LocalExperimentItem = {
            "input": "Summarize this article: ...",
            "expected_output": "Expected summary...",
            "metadata": {"difficulty": "medium", "category": "news"}
        }
        ```

        Classification item:
        ```python
        item: LocalExperimentItem = {
            "input": {"text": "This movie is great!", "context": "movie review"},
            "expected_output": "positive",
            "metadata": {"dataset_source": "imdb", "confidence": 0.95}
        }
        ```

        Minimal item with only input:
        ```python
        item: LocalExperimentItem = {
            "input": "What is the capital of France?"
        }
        ```
    """

    # Data handed to the task function.
    input: Any
    # Ground-truth output for evaluators; may be None or absent.
    expected_output: Any
    # Free-form per-item context; may be None or absent.
    metadata: Optional[Dict[str, Any]]

Structure for local experiment data items (not from Langfuse datasets).

This TypedDict defines the structure for experiment items when using local data rather than Langfuse-hosted datasets. All fields are optional to provide flexibility in data structure.

Attributes:
  • input: The input data to pass to the task function. Can be any type that your task function can process (string, dict, list, etc.). This is typically the prompt, question, or data that your task will operate on.
  • expected_output: Optional expected/ground truth output for evaluation purposes. Used by evaluators to assess correctness or quality. Can be None if no ground truth is available.
  • metadata: Optional metadata dictionary containing additional context about this specific item. Can include information like difficulty level, category, source, or any other relevant attributes that evaluators might use for context-aware evaluation.
Examples:

Simple text processing item:

item: LocalExperimentItem = {
    "input": "Summarize this article: ...",
    "expected_output": "Expected summary...",
    "metadata": {"difficulty": "medium", "category": "news"}
}

Classification item:

item: LocalExperimentItem = {
    "input": {"text": "This movie is great!", "context": "movie review"},
    "expected_output": "positive",
    "metadata": {"dataset_source": "imdb", "confidence": 0.95}
}

Minimal item with only input:

item: LocalExperimentItem = {
    "input": "What is the capital of France?"
}
input: Any
expected_output: Any
metadata: Optional[Dict[str, Any]]
ExperimentItem = typing.Union[LocalExperimentItem, langfuse.api.DatasetItem]

Type alias for items that can be processed in experiments.

Can be either:

  • LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys
  • DatasetItem: Items from Langfuse datasets with .input, .expected_output, .metadata attributes
ExperimentData = typing.Union[typing.List[LocalExperimentItem], typing.List[langfuse.api.DatasetItem]]

Type alias for experiment datasets.

Represents the collection of items to process in an experiment. Can be either:

  • List[LocalExperimentItem]: Local data items as dictionaries
  • List[DatasetItem]: Items from a Langfuse dataset (typically from dataset.items)
class Evaluation:
class Evaluation:
    """Represents an evaluation result for an experiment item or an entire experiment run.

    This class provides a strongly-typed way to create evaluation results in evaluator functions.
    Users must use keyword arguments when instantiating this class.

    Attributes:
        name: Unique identifier for the evaluation metric. Should be descriptive
            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
            Used for aggregation and comparison across experiment runs.
        value: The evaluation score or result. Can be:
            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
            - String: For categorical results like "positive", "negative", "neutral"
            - Boolean: For binary assessments like "passes_safety_check"
        comment: Optional human-readable explanation of the evaluation result.
            Useful for providing context, explaining scoring rationale, or noting
            special conditions. Displayed in Langfuse UI for interpretability.
        metadata: Optional structured metadata about the evaluation process.
            Can include confidence scores, intermediate calculations, model versions,
            or any other relevant technical details.
        data_type: Optional score data type. Required if value is not NUMERIC.
            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
        config_id: Optional Langfuse score config ID.

    Examples:
        Basic accuracy evaluation:
        ```python
        from langfuse import Evaluation

        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
            if not expected_output:
                return Evaluation(name="accuracy", value=0, comment="No expected output")

            is_correct = output.strip().lower() == expected_output.strip().lower()
            return Evaluation(
                name="accuracy",
                value=1.0 if is_correct else 0.0,
                comment="Correct answer" if is_correct else "Incorrect answer"
            )
        ```

        Multi-metric evaluator:
        ```python
        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
            return [
                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
                Evaluation(
                    name="has_greeting",
                    value="hello" in output.lower(),
                    comment="Contains greeting",
                    data_type="BOOLEAN"  # non-numeric values need an explicit data_type
                ),
                Evaluation(
                    name="quality",
                    value=0.85,
                    comment="High quality response",
                    metadata={"confidence": 0.92, "model": "gpt-4"}
                )
            ]
        ```

        Categorical evaluation:
        ```python
        def sentiment_evaluator(*, input, output, **kwargs):
            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
            return Evaluation(
                name="sentiment",
                value=sentiment,
                comment=f"Response expresses {sentiment} sentiment",
                data_type="CATEGORICAL"
            )
        ```

        Failed evaluation with error handling:
        ```python
        def external_api_evaluator(*, input, output, **kwargs):
            try:
                score = external_api.evaluate(output)
                return Evaluation(name="external_score", value=score)
            except Exception as e:
                return Evaluation(
                    name="external_score",
                    value=0,
                    comment=f"API unavailable: {e}",
                    metadata={"error": str(e), "retry_count": 3}
                )
        ```

    Note:
        All arguments must be passed as keywords. Positional arguments are not allowed
        to ensure code clarity and prevent errors from argument reordering.
    """

    def __init__(
        self,
        *,
        name: str,
        value: Union[int, float, str, bool],
        comment: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        data_type: Optional[ExperimentScoreType] = None,
        config_id: Optional[str] = None,
    ):
        """Initialize an Evaluation with the provided data.

        The arguments are stored unchanged as same-named instance attributes;
        no validation or conversion is performed here.

        Args:
            name: Unique identifier for the evaluation metric.
            value: The evaluation score or result.
            comment: Optional human-readable explanation of the result.
            metadata: Optional structured metadata about the evaluation process.
            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
            config_id: Optional Langfuse score config ID.

        Note:
            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
        """
        self.name = name
        self.value = value
        self.comment = comment
        self.metadata = metadata
        self.data_type = data_type
        self.config_id = config_id

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=0, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=0,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, config_id: Optional[str] = None)
189    def __init__(
190        self,
191        *,
192        name: str,
193        value: Union[int, float, str, bool],
194        comment: Optional[str] = None,
195        metadata: Optional[Dict[str, Any]] = None,
196        data_type: Optional[ExperimentScoreType] = None,
197        config_id: Optional[str] = None,
198    ):
199        """Initialize an Evaluation with the provided data.
200
201        Args:
202            name: Unique identifier for the evaluation metric.
203            value: The evaluation score or result.
204            comment: Optional human-readable explanation of the result.
205            metadata: Optional structured metadata about the evaluation process.
206            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
207            config_id: Optional Langfuse score config ID.
208
209        Note:
210            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
211        """
212        self.name = name
213        self.value = value
214        self.comment = comment
215        self.metadata = metadata
216        self.data_type = data_type
217        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
class ExperimentItemResult:
class ExperimentItemResult:
    """Outcome of running the experiment task on one item.

    An instance bundles everything recorded for a single processed item: the
    item itself, the task's output, the evaluations computed for it, and the
    identifiers that tie it back to Langfuse. Instances must be created with
    keyword arguments.

    Attributes:
        item: The experiment item that was processed. Either a dictionary with
            'input', 'expected_output', and 'metadata' keys, or a DatasetItem
            from a Langfuse dataset.
        output: Whatever the task function returned for this item; the type
            depends entirely on the task function.
        evaluations: Evaluation results for this item. Each one carries a name,
            a value, an optional comment, and optional metadata.
        trace_id: Optional Langfuse trace ID of this item's execution, used to
            link the result to its detailed trace in the Langfuse UI.
        dataset_run_id: Optional dataset run ID when the item belongs to a
            Langfuse dataset. None for local experiments.

    Examples:
        Reading the data of each item result:
        ```python
        result = langfuse.run_experiment(...)
        for item_result in result.item_results:
            print(f"Input: {item_result.item}")
            print(f"Output: {item_result.output}")
            print(f"Trace: {item_result.trace_id}")

            # Access evaluations
            for evaluation in item_result.evaluations:
                print(f"{evaluation.name}: {evaluation.value}")
        ```

        Handling both kinds of item:
        ```python
        # Local experiment item (dict)
        if isinstance(item_result.item, dict):
            input_data = item_result.item["input"]
            expected = item_result.item.get("expected_output")

        # Langfuse dataset item (object with attributes)
        else:
            input_data = item_result.item.input
            expected = item_result.item.expected_output
        ```

    Note:
        All arguments are keyword-only, which keeps call sites readable and
        protects them from argument reordering.
    """

    def __init__(
        self,
        *,
        item: ExperimentItem,
        output: Any,
        evaluations: List[Evaluation],
        trace_id: Optional[str],
        dataset_run_id: Optional[str],
    ):
        """Store the data recorded for one processed experiment item.

        Args:
            item: The original experiment item that was processed.
            output: The actual output produced by the task function for this item.
            evaluations: List of evaluation results for this item.
            trace_id: Optional Langfuse trace ID for this item's execution.
            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.

        Note:
            Every argument is keyword-only; passing one positionally raises a TypeError.
        """
        # What went in and what came out.
        self.item, self.output = item, output
        # How the output was judged.
        self.evaluations = evaluations
        # Identifiers linking this result back to Langfuse.
        self.trace_id, self.dataset_run_id = trace_id, dataset_run_id

Result structure for individual experiment items.

This class represents the complete result of processing a single item during an experiment run, including the original input, task output, evaluations, and tracing information. Users must use keyword arguments when instantiating this class.

Attributes:
  • item: The original experiment item that was processed. Can be either a dictionary with 'input', 'expected_output', and 'metadata' keys, or a DatasetItem from Langfuse datasets.
  • output: The actual output produced by the task function for this item. Can be any type depending on what your task function returns.
  • evaluations: List of evaluation results for this item. Each evaluation contains a name, value, optional comment, and optional metadata.
  • trace_id: Optional Langfuse trace ID for this item's execution. Used to link the experiment result with the detailed trace in Langfuse UI.
  • dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset. None for local experiments.
Examples:

Accessing item result data:

result = langfuse.run_experiment(...)
for item_result in result.item_results:
    print(f"Input: {item_result.item}")
    print(f"Output: {item_result.output}")
    print(f"Trace: {item_result.trace_id}")

    # Access evaluations
    for evaluation in item_result.evaluations:
        print(f"{evaluation.name}: {evaluation.value}")

Working with different item types:

# Local experiment item (dict)
if isinstance(item_result.item, dict):
    input_data = item_result.item["input"]
    expected = item_result.item.get("expected_output")

# Langfuse dataset item (object with attributes)
else:
    input_data = item_result.item.input
    expected = item_result.item.expected_output
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

ExperimentItemResult( *, item: Union[LocalExperimentItem, langfuse.api.DatasetItem], output: Any, evaluations: List[Evaluation], trace_id: Optional[str], dataset_run_id: Optional[str])
272    def __init__(
273        self,
274        *,
275        item: ExperimentItem,
276        output: Any,
277        evaluations: List[Evaluation],
278        trace_id: Optional[str],
279        dataset_run_id: Optional[str],
280    ):
281        """Initialize an ExperimentItemResult with the provided data.
282
283        Args:
284            item: The original experiment item that was processed.
285            output: The actual output produced by the task function for this item.
286            evaluations: List of evaluation results for this item.
287            trace_id: Optional Langfuse trace ID for this item's execution.
288            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
289
290        Note:
291            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
292        """
293        self.item = item
294        self.output = output
295        self.evaluations = evaluations
296        self.trace_id = trace_id
297        self.dataset_run_id = dataset_run_id

Initialize an ExperimentItemResult with the provided data.

Arguments:
  • item: The original experiment item that was processed.
  • output: The actual output produced by the task function for this item.
  • evaluations: List of evaluation results for this item.
  • trace_id: Optional Langfuse trace ID for this item's execution.
  • dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

item
output
evaluations
trace_id
dataset_run_id
class ExperimentResult:
class ExperimentResult:
    """Complete result structure for experiment execution.

    This class encapsulates the complete results of running an experiment on a dataset,
    including individual item results, aggregate run-level evaluations, and metadata
    about the experiment execution.

    Attributes:
        name: The name of the experiment as specified during execution.
        run_name: The name of the current experiment run.
        description: Optional description of the experiment's purpose or methodology.
        item_results: List of results from processing each individual dataset item,
            containing the original item, task output, evaluations, and trace information.
        run_evaluations: List of aggregate evaluation results computed across all items,
            such as average scores, statistical summaries, or cross-item analyses.
        experiment_id: ID of the experiment run propagated across all items. For
            Langfuse datasets, this matches the dataset run ID. For local experiments,
            this is a stable SDK-generated identifier for the run.
        dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
        dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.

    Examples:
        Basic usage with local dataset:
        ```python
        result = langfuse.run_experiment(
            name="Capital Cities Test",
            data=local_data,
            task=generate_capital,
            evaluators=[accuracy_check]
        )

        print(f"Processed {len(result.item_results)} items")
        print(result.format())  # Human-readable summary

        # Access individual results
        for item_result in result.item_results:
            print(f"Input: {item_result.item}")
            print(f"Output: {item_result.output}")
            print(f"Scores: {item_result.evaluations}")
        ```

        Usage with Langfuse datasets:
        ```python
        dataset = langfuse.get_dataset("qa-eval-set")
        result = dataset.run_experiment(
            name="GPT-4 QA Evaluation",
            task=answer_question,
            evaluators=[relevance_check, accuracy_check]
        )

        # View in Langfuse UI
        if result.dataset_run_url:
            print(f"View detailed results: {result.dataset_run_url}")
        ```

        Formatted output:
        ```python
        # Get summary view
        summary = result.format()
        print(summary)

        # Get detailed view with individual items
        detailed = result.format(include_item_results=True)
        with open("experiment_report.txt", "w") as f:
            f.write(detailed)
        ```
    """

    def __init__(
        self,
        *,
        name: str,
        run_name: str,
        description: Optional[str],
        item_results: List[ExperimentItemResult],
        run_evaluations: List[Evaluation],
        experiment_id: str,
        dataset_run_id: Optional[str] = None,
        dataset_run_url: Optional[str] = None,
    ):
        """Initialize an ExperimentResult with the provided data.

        Args:
            name: The name of the experiment.
            run_name: The current experiment run name.
            description: Optional description of the experiment.
            item_results: List of results from processing individual dataset items.
            run_evaluations: List of aggregate evaluation results for the entire run.
            experiment_id: ID of the experiment run.
            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
            dataset_run_url: Optional URL to view results in Langfuse UI.
        """
        self.name = name
        self.run_name = run_name
        self.description = description
        self.item_results = item_results
        self.run_evaluations = run_evaluations
        self.experiment_id = experiment_id
        self.dataset_run_id = dataset_run_id
        self.dataset_run_url = dataset_run_url

    @staticmethod
    def _format_score(value: Any) -> str:
        """Render a score: numbers with three decimals, anything else as-is."""
        # bool is a subclass of int, so True/False are rendered as 1.000/0.000.
        if isinstance(value, (int, float)):
            return f"{value:.3f}"
        return f"{value}"

    def format(self, *, include_item_results: bool = False) -> str:
        r"""Format the experiment result for human-readable display.

        Builds a multi-line string suitable for console output, logging, or
        reporting. The per-item section (or a one-line placeholder for it) comes
        first, followed by the experiment overview and aggregate statistics.

        Metric names are listed in the order in which they are first encountered
        while walking the item results, so the output is identical from run to run.

        Args:
            include_item_results: Whether to include detailed results for each individual
                item in the formatted output. When False (default), a short notice with
                the number of hidden items is emitted instead. When True, the input,
                expected output, actual output, scores, and trace ID of every processed
                item are included, making the output significantly longer.

        Returns:
            "No experiment results to display." when there are no item results.
            Otherwise a formatted multi-line string containing, in order:
            - Individual item details, or a notice that they are hidden
            - Experiment name, run name, and description (if provided)
            - Total number of item results
            - List of all evaluation metric names found on the items
            - Average score per metric, for metrics that have numeric values
            - Run-level evaluation results with comments (if any)
            - Dataset run URL for viewing in Langfuse UI (if set)

        Examples:
            Basic usage showing aggregate results only:
            ```python
            result = langfuse.run_experiment(
                name="Capital Cities",
                data=dataset,
                task=generate_capital,
                evaluators=[accuracy_evaluator]
            )

            print(result.format())
            # Output:
            # Individual Results: Hidden (100 items)
            # 💡 Set include_item_results=True to view them
            #
            # ──────────────────────────────────────────────────
            # 🧪 Experiment: Capital Cities
            # 📋 Run name: <run name>
            # 100 items
            # Evaluations:
            # accuracy
            #
            # Average Scores:
            # accuracy: 0.850
            ```

            Detailed output including all individual item results:
            ```python
            detailed_report = result.format(include_item_results=True)
            print(detailed_report)
            # Output includes each item:
            # 1. Item 1:
            #    Input:    What is the capital of France?
            #    Expected: Paris
            #    Actual:   The capital of France is Paris.
            #    Scores:
            #      • accuracy: 1.000
            #        💭 Correct answer found
            # [... continues for all items ...]
            ```

            Saving formatted results to file for reporting:
            ```python
            with open("experiment_report.txt", "w") as f:
                f.write(result.format(include_item_results=True))

            # Or create summary report
            summary = result.format()  # Aggregate view only
            print(f"Experiment Summary:\n{summary}")
            ```

            Integration with logging systems:
            ```python
            import logging
            logger = logging.getLogger("experiments")

            # Log summary after experiment
            logger.info(f"Experiment completed:\n{result.format()}")

            # Log detailed results for failed experiments
            if any(evaluation.value < threshold for evaluation in result.run_evaluations):
                logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
            ```
        """
        if not self.item_results:
            return "No experiment results to display."

        # Collect fragments and join once at the end instead of growing a string.
        parts: List[str] = []

        # Individual results section
        if include_item_results:
            for position, result in enumerate(self.item_results, start=1):
                parts.append(f"\n{position}. Item {position}:\n")

                # Local items are dicts; dataset items expose attributes.
                item = result.item
                if isinstance(item, dict):
                    item_input = item.get("input")
                    expected_output = item.get("expected_output")
                else:
                    item_input = getattr(item, "input", None)
                    expected_output = getattr(item, "expected_output", None)

                if item_input is not None:
                    parts.append(f"   Input:    {_format_value(item_input)}\n")
                if expected_output is not None:
                    parts.append(f"   Expected: {_format_value(expected_output)}\n")
                parts.append(f"   Actual:   {_format_value(result.output)}\n")

                # Display evaluation scores
                if result.evaluations:
                    parts.append("   Scores:\n")
                    for evaluation in result.evaluations:
                        parts.append(
                            f"     • {evaluation.name}: "
                            f"{self._format_score(evaluation.value)}"
                        )
                        if evaluation.comment:
                            parts.append(f"\n       💭 {evaluation.comment}")
                        parts.append("\n")

                # Display trace link if available
                if result.trace_id:
                    parts.append(f"\n   Trace ID: {result.trace_id}\n")
        else:
            parts.append(
                f"Individual Results: Hidden ({len(self.item_results)} items)\n"
            )
            parts.append("💡 Set include_item_results=True to view them\n")

        # Experiment overview section
        parts.append(f"\n{'─' * 50}\n")
        parts.append(f"🧪 Experiment: {self.name}")
        parts.append(f"\n📋 Run name: {self.run_name}")
        if self.description:
            parts.append(f" - {self.description}")

        parts.append(f"\n{len(self.item_results)} items")

        # Group numeric values by metric name in a single pass. A dict keeps the
        # names in first-seen order, which makes the report deterministic (a set
        # would iterate in a hash-dependent order). Metrics without any numeric
        # value still get an (empty) entry so they are listed under "Evaluations".
        numeric_scores: Dict[Any, List[Union[int, float]]] = {}
        for result in self.item_results:
            for evaluation in result.evaluations:
                bucket = numeric_scores.setdefault(evaluation.name, [])
                if isinstance(evaluation.value, (int, float)):
                    bucket.append(evaluation.value)

        if numeric_scores:
            parts.append("\nEvaluations:")
            parts.extend(f"\n{eval_name}" for eval_name in numeric_scores)
            parts.append("\n")

            # Only metrics that produced at least one numeric value get an average.
            parts.append("\nAverage Scores:")
            for eval_name, scores in numeric_scores.items():
                if scores:
                    parts.append(f"\n{eval_name}: {sum(scores) / len(scores):.3f}")
            parts.append("\n")

        # Display run-level evaluations
        if self.run_evaluations:
            parts.append("\nRun Evaluations:")
            for run_eval in self.run_evaluations:
                parts.append(
                    f"\n{run_eval.name}: {self._format_score(run_eval.value)}"
                )
                if run_eval.comment:
                    parts.append(f"\n    💭 {run_eval.comment}")
            parts.append("\n")

        # Add dataset run URL if available
        if self.dataset_run_url:
            parts.append(f"\n🔗 Dataset Run:\n   {self.dataset_run_url}")

        return "".join(parts)

Complete result structure for experiment execution.

This class encapsulates the complete results of running an experiment on a dataset, including individual item results, aggregate run-level evaluations, and metadata about the experiment execution.

Attributes:
  • name: The name of the experiment as specified during execution.
  • run_name: The name of the current experiment run.
  • description: Optional description of the experiment's purpose or methodology.
  • item_results: List of results from processing each individual dataset item, containing the original item, task output, evaluations, and trace information.
  • run_evaluations: List of aggregate evaluation results computed across all items, such as average scores, statistical summaries, or cross-item analyses.
  • experiment_id: ID of the experiment run propagated across all items. For Langfuse datasets, this matches the dataset run ID. For local experiments, this is a stable SDK-generated identifier for the run.
  • dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
  • dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
Examples:

Basic usage with local dataset:

result = langfuse.run_experiment(
    name="Capital Cities Test",
    data=local_data,
    task=generate_capital,
    evaluators=[accuracy_check]
)

print(f"Processed {len(result.item_results)} items")
print(result.format())  # Human-readable summary

# Access individual results
for item_result in result.item_results:
    print(f"Input: {item_result.item}")
    print(f"Output: {item_result.output}")
    print(f"Scores: {item_result.evaluations}")

Usage with Langfuse datasets:

dataset = langfuse.get_dataset("qa-eval-set")
result = dataset.run_experiment(
    name="GPT-4 QA Evaluation",
    task=answer_question,
    evaluators=[relevance_check, accuracy_check]
)

# View in Langfuse UI
if result.dataset_run_url:
    print(f"View detailed results: {result.dataset_run_url}")

Formatted output:

# Get summary view
summary = result.format()
print(summary)

# Get detailed view with individual items
detailed = result.format(include_item_results=True)
with open("experiment_report.txt", "w") as f:
    f.write(detailed)
ExperimentResult( *, name: str, run_name: str, description: Optional[str], item_results: List[ExperimentItemResult], run_evaluations: List[Evaluation], experiment_id: str, dataset_run_id: Optional[str] = None, dataset_run_url: Optional[str] = None)
368    def __init__(
369        self,
370        *,
371        name: str,
372        run_name: str,
373        description: Optional[str],
374        item_results: List[ExperimentItemResult],
375        run_evaluations: List[Evaluation],
376        experiment_id: str,
377        dataset_run_id: Optional[str] = None,
378        dataset_run_url: Optional[str] = None,
379    ):
380        """Initialize an ExperimentResult with the provided data.
381
382        Args:
383            name: The name of the experiment.
384            run_name: The current experiment run name.
385            description: Optional description of the experiment.
386            item_results: List of results from processing individual dataset items.
387            run_evaluations: List of aggregate evaluation results for the entire run.
388            experiment_id: ID of the experiment run.
389            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
390            dataset_run_url: Optional URL to view results in Langfuse UI.
391        """
392        self.name = name
393        self.run_name = run_name
394        self.description = description
395        self.item_results = item_results
396        self.run_evaluations = run_evaluations
397        self.experiment_id = experiment_id
398        self.dataset_run_id = dataset_run_id
399        self.dataset_run_url = dataset_run_url

Initialize an ExperimentResult with the provided data.

Arguments:
  • name: The name of the experiment.
  • run_name: The current experiment run name.
  • description: Optional description of the experiment.
  • item_results: List of results from processing individual dataset items.
  • run_evaluations: List of aggregate evaluation results for the entire run.
  • experiment_id: ID of the experiment run.
  • dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
  • dataset_run_url: Optional URL to view results in Langfuse UI.
name
run_name
description
item_results
run_evaluations
experiment_id
dataset_run_id
dataset_run_url
def format(self, *, include_item_results: bool = False) -> str:
401    def format(self, *, include_item_results: bool = False) -> str:
402        r"""Format the experiment result for human-readable display.
403
404        Converts the experiment result into a nicely formatted string suitable for
405        console output, logging, or reporting. The output includes experiment overview,
406        aggregate statistics, and optionally individual item details.
407
408        This method provides a comprehensive view of experiment performance including:
409        - Experiment metadata (name, description, item count)
410        - List of evaluation metrics used across items
411        - Average scores computed across all processed items
412        - Run-level evaluation results (aggregate metrics)
413        - Links to view detailed results in Langfuse UI (when available)
414        - Individual item details (when requested)
415
416        Args:
417            include_item_results: Whether to include detailed results for each individual
418                item in the formatted output. When False (default), only shows aggregate
419                statistics and summary information. When True, includes input/output/scores
420                for every processed item, making the output significantly longer but more
421                detailed for debugging and analysis purposes.
422
423        Returns:
424            A formatted multi-line string containing:
425            - Experiment name and description (if provided)
426            - Total number of items successfully processed
427            - List of all evaluation metrics that were applied
428            - Average scores across all items for each numeric metric
429            - Run-level evaluation results with comments
430            - Dataset run URL for viewing in Langfuse UI (if applicable)
431            - Individual item details including inputs, outputs, and scores (if requested)
432
433        Examples:
434            Basic usage showing aggregate results only:
435            ```python
436            result = langfuse.run_experiment(
437                name="Capital Cities",
438                data=dataset,
439                task=generate_capital,
440                evaluators=[accuracy_evaluator]
441            )
442
443            print(result.format())
444            # Output:
445            # ──────────────────────────────────────────────────
446            # 📊 Capital Cities
447            # 100 items
448            # Evaluations:
449            #   • accuracy
450            # Average Scores:
451            #   • accuracy: 0.850
452            ```
453
454            Detailed output including all individual item results:
455            ```python
456            detailed_report = result.format(include_item_results=True)
457            print(detailed_report)
458            # Output includes each item:
459            # 1. Item 1:
460            #    Input:    What is the capital of France?
461            #    Expected: Paris
462            #    Actual:   The capital of France is Paris.
463            #    Scores:
464            #      • accuracy: 1.000
465            #        💭 Correct answer found
466            # [... continues for all items ...]
467            ```
468
469            Saving formatted results to file for reporting:
470            ```python
471            with open("experiment_report.txt", "w") as f:
472                f.write(result.format(include_item_results=True))
473
474            # Or create summary report
475            summary = result.format()  # Aggregate view only
476            print(f"Experiment Summary:\n{summary}")
477            ```
478
479            Integration with logging systems:
480            ```python
481            import logging
482            logger = logging.getLogger("experiments")
483
484            # Log summary after experiment
485            logger.info(f"Experiment completed:\n{result.format()}")
486
487            # Log detailed results for failed experiments
488            if any(eval['value'] < threshold for eval in result.run_evaluations):
489                logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
490            ```
491        """
492        if not self.item_results:
493            return "No experiment results to display."
494
495        output = ""
496
497        # Individual results section
498        if include_item_results:
499            for i, result in enumerate(self.item_results):
500                output += f"\n{i + 1}. Item {i + 1}:\n"
501
502                # Extract and display input
503                item_input = None
504                if isinstance(result.item, dict):
505                    item_input = result.item.get("input")
506                elif hasattr(result.item, "input"):
507                    item_input = result.item.input
508
509                if item_input is not None:
510                    output += f"   Input:    {_format_value(item_input)}\n"
511
512                # Extract and display expected output
513                expected_output = None
514                if isinstance(result.item, dict):
515                    expected_output = result.item.get("expected_output")
516                elif hasattr(result.item, "expected_output"):
517                    expected_output = result.item.expected_output
518
519                if expected_output is not None:
520                    output += f"   Expected: {_format_value(expected_output)}\n"
521                output += f"   Actual:   {_format_value(result.output)}\n"
522
523                # Display evaluation scores
524                if result.evaluations:
525                    output += "   Scores:\n"
526                    for evaluation in result.evaluations:
527                        score = evaluation.value
528                        if isinstance(score, (int, float)):
529                            score = f"{score:.3f}"
530                        output += f"     • {evaluation.name}: {score}"
531                        if evaluation.comment:
532                            output += f"\n       💭 {evaluation.comment}"
533                        output += "\n"
534
535                # Display trace link if available
536                if result.trace_id:
537                    output += f"\n   Trace ID: {result.trace_id}\n"
538        else:
539            output += f"Individual Results: Hidden ({len(self.item_results)} items)\n"
540            output += "💡 Set include_item_results=True to view them\n"
541
542        # Experiment overview section
543        output += f"\n{'─' * 50}\n"
544        output += f"🧪 Experiment: {self.name}"
545        output += f"\n📋 Run name: {self.run_name}"
546        if self.description:
547            output += f" - {self.description}"
548
549        output += f"\n{len(self.item_results)} items"
550
551        # Collect unique evaluation names across all items
552        evaluation_names = set()
553        for result in self.item_results:
554            for evaluation in result.evaluations:
555                evaluation_names.add(evaluation.name)
556
557        if evaluation_names:
558            output += "\nEvaluations:"
559            for eval_name in evaluation_names:
560                output += f"\n{eval_name}"
561            output += "\n"
562
563        # Calculate and display average scores
564        if evaluation_names:
565            output += "\nAverage Scores:"
566            for eval_name in evaluation_names:
567                scores = []
568                for result in self.item_results:
569                    for evaluation in result.evaluations:
570                        if evaluation.name == eval_name and isinstance(
571                            evaluation.value, (int, float)
572                        ):
573                            scores.append(evaluation.value)
574
575                if scores:
576                    avg = sum(scores) / len(scores)
577                    output += f"\n{eval_name}: {avg:.3f}"
578            output += "\n"
579
580        # Display run-level evaluations
581        if self.run_evaluations:
582            output += "\nRun Evaluations:"
583            for run_eval in self.run_evaluations:
584                score = run_eval.value
585                if isinstance(score, (int, float)):
586                    score = f"{score:.3f}"
587                output += f"\n{run_eval.name}: {score}"
588                if run_eval.comment:
589                    output += f"\n    💭 {run_eval.comment}"
590            output += "\n"
591
592        # Add dataset run URL if available
593        if self.dataset_run_url:
594            output += f"\n🔗 Dataset Run:\n   {self.dataset_run_url}"
595
596        return output

Format the experiment result for human-readable display.

Converts the experiment result into a nicely formatted string suitable for console output, logging, or reporting. The output includes experiment overview, aggregate statistics, and optionally individual item details.

This method provides a comprehensive view of experiment performance including:

  • Experiment metadata (name, description, item count)
  • List of evaluation metrics used across items
  • Average scores computed across all processed items
  • Run-level evaluation results (aggregate metrics)
  • Links to view detailed results in Langfuse UI (when available)
  • Individual item details (when requested)
Arguments:
  • include_item_results: Whether to include detailed results for each individual item in the formatted output. When False (default), only shows aggregate statistics and summary information. When True, includes input/output/scores for every processed item, making the output significantly longer but more detailed for debugging and analysis purposes.
Returns:

A formatted multi-line string containing:

  • Experiment name and description (if provided)
  • Total number of items successfully processed
  • List of all evaluation metrics that were applied
  • Average scores across all items for each numeric metric
  • Run-level evaluation results with comments
  • Dataset run URL for viewing in Langfuse UI (if applicable)
  • Individual item details including inputs, outputs, and scores (if requested)
Examples:

Basic usage showing aggregate results only:

result = langfuse.run_experiment(
    name="Capital Cities",
    data=dataset,
    task=generate_capital,
    evaluators=[accuracy_evaluator]
)

print(result.format())
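# Output:
# Individual Results: Hidden (100 items)
# 💡 Set include_item_results=True to view them
#
# ──────────────────────────────────────────────────
# 🧪 Experiment: Capital Cities
# 📋 Run name: <run name>
# 100 items
# Evaluations:
# accuracy
#
# Average Scores:
# accuracy: 0.850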

Detailed output including all individual item results:

detailed_report = result.format(include_item_results=True)
print(detailed_report)
# Output includes each item:
# 1. Item 1:
#    Input:    What is the capital of France?
#    Expected: Paris
#    Actual:   The capital of France is Paris.
#    Scores:
#      • accuracy: 1.000
#        💭 Correct answer found
# [... continues for all items ...]

Saving formatted results to file for reporting:

with open("experiment_report.txt", "w") as f:
    f.write(result.format(include_item_results=True))

# Or create summary report
summary = result.format()  # Aggregate view only
print(f"Experiment Summary:\n{summary}")

Integration with logging systems:

import logging
logger = logging.getLogger("experiments")

# Log summary after experiment
logger.info(f"Experiment completed:\n{result.format()}")

# Log detailed results for failed experiments
if any(
    isinstance(evaluation.value, (int, float)) and evaluation.value < threshold
    for evaluation in result.run_evaluations
):
    logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
class TaskFunction(typing.Protocol):
class TaskFunction(Protocol):
    """Structural type for the callable that processes one experiment item.

    A task function is invoked once per item of the experiment data. It
    receives the item and produces an output, which is subsequently handed to
    the evaluators.

    A conforming task function:
    - accepts ``item`` as a keyword argument,
    - may return a value of any type (it is passed on to the evaluators),
    - may be synchronous or asynchronous,
    - should deal with its own errors gracefully (exceptions will be logged).
    """

    def __call__(
        self,
        *,
        item: ExperimentItem,
        **kwargs: Dict[str, Any],
    ) -> Union[Any, Awaitable[Any]]:
        """Run the task for a single experiment item.

        Implement here whatever behaviour the experiment is meant to measure,
        for instance text generation, classification or summarization.

        Args:
            item: The experiment item to process. Either:
                - a dict with keys such as 'input', 'expected_output', 'metadata'
                - a Langfuse DatasetItem object exposing .input and
                  .expected_output attributes
            **kwargs: Additional keyword arguments that may be passed by the framework

        Returns:
            Any: The result of processing the item, returned either directly or
            as an awaitable (for async tasks). The result is:
            - stored in the experiment results
            - passed to all item-level evaluators for assessment
            - traced automatically in Langfuse for observability

        Examples:
            Simple synchronous task:
            ```python
            def my_task(*, item, **kwargs):
                prompt = f"Summarize: {item['input']}"
                return my_llm_client.generate(prompt)
            ```

            Async task with error handling:
            ```python
            async def my_async_task(*, item, **kwargs):
                try:
                    response = await openai_client.chat.completions.create(
                        model="gpt-4",
                        messages=[{"role": "user", "content": item["input"]}]
                    )
                    return response.choices[0].message.content
                except Exception as e:
                    # Log error and return fallback
                    print(f"Task failed for item {item}: {e}")
                    return "Error: Could not process item"
            ```

            Task that supports both item representations:
            ```python
            def classification_task(*, item, **kwargs):
                # Works with both dict items and DatasetItem objects
                text = item["input"] if isinstance(item, dict) else item.input
                return classify_text(text)
            ```
        """
        ...

Protocol defining the interface for experiment task functions.

Task functions are the core processing functions that operate on each item in an experiment dataset. They receive an experiment item as input and produce some output that will be evaluated.

Task functions:

  • Must accept 'item' as a keyword argument
  • May return any type of output (it will be passed to evaluators)
  • Can be either synchronous or asynchronous
  • Should handle their own errors gracefully (exceptions will be logged)
TaskFunction(*args, **kwargs)
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` that protocols and their subclasses start with.

    Instantiating a class flagged with ``_is_protocol`` raises ``TypeError``.
    For any other class whose ``__init__`` is still this placeholder, the first
    ``__init__`` found along the MRO that is not the placeholder is installed
    on the class (``object.__init__`` if none is found) and then invoked with
    the given arguments, so later instantiations bypass this function.
    """
    klass = type(self)

    if klass._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # The class already has its own `__init__`; resolving one again is
    # unnecessary and can lead to RecursionError (see bpo-45121).
    if klass.__init__ is not _no_init_or_replace_init:
        return

    # Walk the MRO lazily and take the first `__init__` that is not this
    # placeholder; bases that do not define `__init__` in their own namespace
    # are treated as if they carried the placeholder.
    inherited_inits = (
        base.__dict__.get('__init__', _no_init_or_replace_init)
        for base in klass.__mro__
    )
    klass.__init__ = next(
        (init for init in inherited_inits if init is not _no_init_or_replace_init),
        object.__init__,  # should not happen
    )

    klass.__init__(self, *args, **kwargs)
class EvaluatorFunction(typing.Protocol):
class EvaluatorFunction(Protocol):
    """Structural type for evaluators that score a single task output.

    An item-level evaluator judges the quality, correctness, or another
    property of one task output. It is given the item's input, the produced
    output, the expected output, and the item's metadata, and returns one or
    more evaluation results.

    A conforming evaluator should:
    - accept input, output, expected_output, and metadata as keyword arguments
    - return Evaluation result(s) carrying 'name', 'value', 'comment', 'metadata'
    - be deterministic where possible, so results are reproducible
    - cope with edge cases (missing expected output, malformed data, etc.)

    It may be synchronous or asynchronous.
    """

    def __call__(
        self,
        *,
        input: Any,
        output: Any,
        expected_output: Any,
        metadata: Optional[Dict[str, Any]],
        **kwargs: Dict[str, Any],
    ) -> Union[
        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
    ]:
        r"""Score one task output.

        Put the concrete evaluation logic here: accuracy checks, similarity
        measures, toxicity detection, fluency assessment, and so on.

        Args:
            input: The original input that was passed to the task function,
                typically the item['input'] or item.input value.
            output: The value the task function returned for this input.
            expected_output: The expected/ground truth output to compare
                against. May be None when the dataset has none; evaluators
                should handle that case.
            metadata: Optional metadata of the experiment item that can give
                extra context for the evaluation (categories, difficulty, etc.)
            **kwargs: Additional keyword arguments that may be passed by the framework

        Returns:
            Evaluation results in one of these formats:
            - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."}
            - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}]
            - Awaitable returning either of the above (for async evaluators)

            Each Evaluation dict should contain:
            - name (str): Unique identifier for this evaluation metric
            - value (int|float|str|bool): The evaluation score or result
            - comment (str, optional): Human-readable explanation of the result
            - metadata (dict, optional): Additional structured data about the evaluation

        Examples:
            Simple accuracy evaluator:
            ```python
            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
                if expected_output is None:
                    return {"name": "accuracy", "value": 0, "comment": "No expected output"}

                is_correct = output.strip().lower() == expected_output.strip().lower()
                return {
                    "name": "accuracy",
                    "value": 1.0 if is_correct else 0.0,
                    "comment": "Exact match" if is_correct else "No match"
                }
            ```

            Multi-metric evaluator:
            ```python
            def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
                sentiment_score = analyze_sentiment(output)
                return [
                    {
                        "name": "output_length",
                        "value": len(output),
                        "comment": f"Output contains {len(output)} characters"
                    },
                    {
                        "name": "sentiment",
                        "value": sentiment_score,
                        "comment": f"Sentiment score: {sentiment_score:.2f}"
                    },
                ]
            ```

            Async evaluator using external API:
            ```python
            async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs):
                prompt = "Rate the quality of this response on a scale of 1-10:\n"
                prompt += f"Question: {input}\nResponse: {output}"

                response = await openai_client.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}]
                )

                try:
                    score = float(response.choices[0].message.content.strip())
                    return {
                        "name": "llm_judge_quality",
                        "value": score,
                        "comment": f"LLM judge rated this {score}/10"
                    }
                except ValueError:
                    return {
                        "name": "llm_judge_quality",
                        "value": 0,
                        "comment": "Could not parse LLM judge score"
                    }
            ```

            Context-aware evaluator:
            ```python
            def context_evaluator(*, input, output, metadata=None, **kwargs):
                # Use metadata for context-specific evaluation
                difficulty = metadata.get("difficulty", "medium") if metadata else "medium"

                # Adjust expectations based on difficulty
                min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty]

                meets_requirement = len(output) >= min_length
                return {
                    "name": f"meets_{difficulty}_requirement",
                    "value": meets_requirement,
                    "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement"
                }
            ```
        """
        ...

Protocol defining the interface for item-level evaluator functions.

Item-level evaluators assess the quality, correctness, or other properties of individual task outputs. They receive the input, output, expected output, and metadata for each item and return evaluation metrics.

Evaluators should:

  • Accept input, output, expected_output, and metadata as keyword arguments
  • Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields
  • Be deterministic when possible for reproducible results
  • Handle edge cases gracefully (missing expected output, malformed data, etc.)
  • Can be either synchronous or asynchronous
EvaluatorFunction(*args, **kwargs)
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` that protocols and their subclasses start with.

    Instantiating a class flagged with ``_is_protocol`` raises ``TypeError``.
    For any other class whose ``__init__`` is still this placeholder, the first
    ``__init__`` found along the MRO that is not the placeholder is installed
    on the class (``object.__init__`` if none is found) and then invoked with
    the given arguments, so later instantiations bypass this function.
    """
    klass = type(self)

    if klass._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # The class already has its own `__init__`; resolving one again is
    # unnecessary and can lead to RecursionError (see bpo-45121).
    if klass.__init__ is not _no_init_or_replace_init:
        return

    # Walk the MRO lazily and take the first `__init__` that is not this
    # placeholder; bases that do not define `__init__` in their own namespace
    # are treated as if they carried the placeholder.
    inherited_inits = (
        base.__dict__.get('__init__', _no_init_or_replace_init)
        for base in klass.__mro__
    )
    klass.__init__ = next(
        (init for init in inherited_inits if init is not _no_init_or_replace_init),
        object.__init__,  # should not happen
    )

    klass.__init__(self, *args, **kwargs)
class RunEvaluatorFunction(typing.Protocol):
class RunEvaluatorFunction(Protocol):
    """Structural type for evaluators that score an entire experiment run.

    A run-level evaluator looks at the run as a whole rather than at a single
    output. It is given the results of all processed items and derives
    aggregate metrics from them, e.g. averages, distributions, or correlations.

    A conforming run evaluator should:
    - accept item_results as a keyword argument containing all item results
    - return Evaluation result(s) holding aggregate metrics
    - tolerate runs in which some items failed processing
    - compute statistics that are meaningful across the dataset

    It may be synchronous or asynchronous.
    """

    def __call__(
        self,
        *,
        item_results: List[ExperimentItemResult],
        **kwargs: Dict[str, Any],
    ) -> Union[
        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
    ]:
        r"""Compute aggregate metrics for the whole experiment run.

        Put the aggregate logic here: averages, distributions, correlations,
        cross-item pattern detection, or other statistical analysis of the
        experiment results.

        Args:
            item_results: Results of all successfully processed experiment
                items. Each item result contains:
                - item: The original experiment item
                - output: The task function's output for this item
                - evaluations: List of item-level evaluation results
                - trace_id: Langfuse trace ID for this execution
                - dataset_run_id: Dataset run ID (if using Langfuse datasets)

                Note: Only successfully processed items are included. Failed
                items are excluded but logged separately.
            **kwargs: Additional keyword arguments that may be passed by the framework

        Returns:
            Evaluation results in one of these formats:
            - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."}
            - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}]
            - Awaitable returning either of the above (for async evaluators)

            Each Evaluation dict should contain:
            - name (str): Unique identifier for this run-level metric
            - value (int|float|str|bool): The aggregate evaluation result
            - comment (str, optional): Human-readable explanation of the metric
            - metadata (dict, optional): Additional structured data about the evaluation

        Examples:
            Average accuracy calculator:
            ```python
            def average_accuracy(*, item_results, **kwargs):
                if not item_results:
                    return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"}

                accuracy_values = [
                    evaluation.value
                    for result in item_results
                    for evaluation in result.evaluations
                    if evaluation.name == "accuracy"
                ]

                if not accuracy_values:
                    return {"name": "avg_accuracy", "value": 0, "comment": "No accuracy evaluations found"}

                avg = sum(accuracy_values) / len(accuracy_values)
                return {
                    "name": "avg_accuracy",
                    "value": avg,
                    "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}"
                }
            ```

            Multiple aggregate metrics:
            ```python
            def statistical_summary(*, item_results, **kwargs):
                if not item_results:
                    return []

                # Calculate output length statistics
                lengths = [len(str(result.output)) for result in item_results]
                results = [
                    {"name": "avg_output_length", "value": sum(lengths) / len(lengths)},
                    {"name": "min_output_length", "value": min(lengths)},
                    {"name": "max_output_length", "value": max(lengths)},
                ]

                # Success rate
                total_items = len(item_results)  # Only successful items are included
                results.append({
                    "name": "processing_success_rate",
                    "value": 1.0,  # All items in item_results succeeded
                    "comment": f"Successfully processed {total_items} items"
                })

                return results
            ```

            Async run evaluator with external analysis:
            ```python
            async def llm_batch_analysis(*, item_results, **kwargs):
                # Prepare batch analysis prompt
                outputs = [result.output for result in item_results]
                prompt = f"Analyze these {len(outputs)} outputs for common themes:\n"
                prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs))

                response = await openai_client.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}]
                )

                return {
                    "name": "thematic_analysis",
                    "value": response.choices[0].message.content,
                    "comment": f"LLM analysis of {len(outputs)} outputs"
                }
            ```

            Performance distribution analysis:
            ```python
            import statistics

            def performance_distribution(*, item_results, **kwargs):
                # Extract all numeric evaluation scores, overall and per metric
                all_scores = []
                score_by_metric = {}

                for result in item_results:
                    for evaluation in result.evaluations:
                        value = evaluation.value
                        if isinstance(value, (int, float)):
                            all_scores.append(value)
                            score_by_metric.setdefault(evaluation.name, []).append(value)

                results = []

                # Overall score distribution
                if all_scores:
                    results.append({
                        "name": "score_std_dev",
                        "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0,
                        "comment": "Standard deviation across all numeric scores"
                    })

                # Per-metric statistics
                for metric, scores in score_by_metric.items():
                    if len(scores) > 1:
                        results.append({
                            "name": f"{metric}_variance",
                            "value": statistics.variance(scores),
                            "comment": f"Variance in {metric} across {len(scores)} items"
                        })

                return results
            ```
        """
        ...

Protocol defining the interface for run-level evaluator functions.

Run-level evaluators assess aggregate properties of the entire experiment run, computing metrics that span across all items rather than individual outputs. They receive the complete results from all processed items and can compute statistics like averages, distributions, correlations, or other aggregate metrics.

Run evaluators should:

  • Accept item_results as a keyword argument containing all item results
  • Return Evaluation dict(s) with aggregate metrics
  • Handle cases where some items may have failed processing
  • Compute meaningful statistics across the dataset
  • Can be either synchronous or asynchronous
RunEvaluatorFunction(*args, **kwargs)
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` that protocols and their subclasses start with.

    Instantiating a class flagged with ``_is_protocol`` raises ``TypeError``.
    For any other class whose ``__init__`` is still this placeholder, the first
    ``__init__`` found along the MRO that is not the placeholder is installed
    on the class (``object.__init__`` if none is found) and then invoked with
    the given arguments, so later instantiations bypass this function.
    """
    klass = type(self)

    if klass._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # The class already has its own `__init__`; resolving one again is
    # unnecessary and can lead to RecursionError (see bpo-45121).
    if klass.__init__ is not _no_init_or_replace_init:
        return

    # Walk the MRO lazily and take the first `__init__` that is not this
    # placeholder; bases that do not define `__init__` in their own namespace
    # are treated as if they carried the placeholder.
    inherited_inits = (
        base.__dict__.get('__init__', _no_init_or_replace_init)
        for base in klass.__mro__
    )
    klass.__init__ = next(
        (init for init in inherited_inits if init is not _no_init_or_replace_init),
        object.__init__,  # should not happen
    )

    klass.__init__(self, *args, **kwargs)
def create_evaluator_from_autoevals( autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]]) -> EvaluatorFunction:
def create_evaluator_from_autoevals(
    autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]]
) -> EvaluatorFunction:
    """Adapt an autoevals evaluator to the Langfuse evaluator interface.

    Args:
        autoevals_evaluator: Callable autoevals evaluator. It is invoked with
            ``input``, ``output`` and ``expected`` keyword arguments.
        **kwargs: Extra keyword arguments forwarded unchanged to every call
            of ``autoevals_evaluator``.

    Returns:
        An evaluator function that runs ``autoevals_evaluator`` and converts
        its result into an ``Evaluation``.
    """

    def langfuse_evaluator(
        *,
        input: Any,
        output: Any,
        expected_output: Any,
        metadata: Optional[Dict[str, Any]],
        **langfuse_kwargs: Dict[str, Any],
    ) -> Evaluation:
        # autoevals calls the reference value `expected`. The item `metadata`
        # and any extra Langfuse keyword arguments are accepted but are not
        # passed on to the autoevals evaluator.
        result = autoevals_evaluator(
            input=input, output=output, expected=expected_output, **kwargs
        )
        result_metadata = result.metadata
        # The comment, if any, is taken from the result's metadata.
        comment = (result_metadata or {}).get("comment")

        return Evaluation(
            name=result.name,
            value=result.score,
            comment=comment,
            metadata=result_metadata,
        )

    return langfuse_evaluator

Create a Langfuse evaluator from an autoevals evaluator.

Arguments:
  • autoevals_evaluator: An autoevals evaluator instance
  • **kwargs: Additional arguments passed to the evaluator
Returns:

A Langfuse-compatible evaluator function

class RunnerContext:
class RunnerContext:
    """Wrap :meth:`Langfuse.run_experiment` with defaults injected by CI.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action), which builds a
    ``RunnerContext`` and hands it to the user's ``experiment(context)``
    function. The defaults stored here (dataset items, dataset version,
    metadata tags) are used whenever the matching argument is omitted on
    :meth:`run_experiment`; an explicitly passed argument takes precedence.
    """

    def __init__(
        self,
        *,
        client: "Langfuse",
        data: Optional[ExperimentData] = None,
        dataset_version: Optional[datetime] = None,
        metadata: Optional[Dict[str, str]] = None,
    ):
        """Store the client and the defaults applied by ``run_experiment``.

        Typically called by the ``langfuse/experiment-action`` GitHub Action,
        not by end users directly. Every field except ``client`` is optional;
        a field left as ``None`` simply provides no default.

        Args:
            client: Initialized Langfuse SDK client used to execute the
                experiment. The action creates this from the
                ``langfuse_public_key`` / ``langfuse_secret_key`` /
                ``langfuse_base_url`` inputs.
            data: Default items to run the experiment on, either
                ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
                Injected by the action when ``dataset_name`` is configured.
                If ``None``, ``data=`` must be passed to
                :meth:`run_experiment`.
            dataset_version: Optional pinned dataset version. Injected by the
                action when ``dataset_version`` is configured.
            metadata: Default metadata for the experiment. The action injects
                GitHub-sourced tags (SHA, PR link, workflow run link, branch,
                GH user, etc.). Merged with the ``metadata`` passed to
                :meth:`run_experiment`; call-site keys win on collision.
        """
        self.client = client
        self.data = data
        self.dataset_version = dataset_version
        self.metadata = metadata

    def run_experiment(
        self,
        *,
        name: str,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
        data: Optional[ExperimentData] = None,
        task: TaskFunction,
        evaluators: List[EvaluatorFunction] = [],
        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
        run_evaluators: List[RunEvaluatorFunction] = [],
        max_concurrency: int = 50,
        metadata: Optional[Dict[str, str]] = None,
        _dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Run an experiment through the wrapped client, filling in defaults.

        ``data`` and ``_dataset_version`` fall back to the values stored on
        this context when they are ``None``. ``metadata`` is merged with the
        context metadata, with keys passed here winning on collision. All
        other arguments are forwarded unchanged to the client's
        ``run_experiment``.

        Returns:
            Whatever the client's ``run_experiment`` returns.

        Raises:
            ValueError: If ``data`` is ``None`` both here and on the context.
        """
        effective_data = self.data if data is None else data
        if effective_data is None:
            raise ValueError(
                "`data` must be provided either on the RunnerContext or the run_experiment call"
            )

        effective_version = _dataset_version
        if effective_version is None:
            effective_version = self.dataset_version

        # Stay `None` only when neither side supplied metadata; otherwise
        # layer the call-site keys on top of the context defaults.
        merged_metadata: Optional[Dict[str, str]] = None
        if self.metadata is not None or metadata is not None:
            merged_metadata = dict(self.metadata or {})
            merged_metadata.update(metadata or {})

        return self.client.run_experiment(
            name=name,
            run_name=run_name,
            description=description,
            data=effective_data,
            task=task,
            evaluators=evaluators,
            composite_evaluator=composite_evaluator,
            run_evaluators=run_evaluators,
            max_concurrency=max_concurrency,
            metadata=merged_metadata,
            _dataset_version=effective_version,
        )

Wraps Langfuse.run_experiment() with CI-injected defaults.

Intended for use with the langfuse/experiment-action GitHub Action (https://github.com/langfuse/experiment-action). The action builds a RunnerContext before invoking the user's experiment(context) function. Defaults set here (dataset, metadata tags) are applied when the user omits them on the run_experiment() call; users can override any default by passing the corresponding argument explicitly.

RunnerContext( *, client: langfuse.Langfuse, data: Union[List[LocalExperimentItem], List[langfuse.api.DatasetItem], NoneType] = None, dataset_version: Optional[datetime.datetime] = None, metadata: Optional[Dict[str, str]] = None)
1073    def __init__(
1074        self,
1075        *,
1076        client: "Langfuse",
1077        data: Optional[ExperimentData] = None,
1078        dataset_version: Optional[datetime] = None,
1079        metadata: Optional[Dict[str, str]] = None,
1080    ):
1081        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.
1082
1083        Typically called by the ``langfuse/experiment-action`` GitHub Action,
1084        not by end users directly. Every field except ``client`` is optional:
1085        fields left as ``None`` simply mean the corresponding argument must be
1086        supplied on the :meth:`run_experiment` call.
1087
1088        Args:
1089            client: Initialized Langfuse SDK client used to execute the
1090                experiment. The action creates this from the
1091                ``langfuse_public_key`` / ``langfuse_secret_key`` /
1092                ``langfuse_base_url`` inputs.
1093            data: Default dataset items to run the experiment on. Accepts
1094                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
1095                Injected by the action when ``dataset_name`` is configured.
1096                If ``None``, the user must pass ``data=`` to
1097                :meth:`run_experiment`.
1098            dataset_version: Optional pinned dataset version. Injected by the
1099                action when ``dataset_version`` is configured.
1100            metadata: Default metadata attached to every experiment trace and
1101                the dataset run. The action injects GitHub-sourced tags (SHA,
1102                PR link, workflow run link, branch, GH user, etc.). Merged
1103                with any ``metadata`` passed to :meth:`run_experiment`, with
1104                user-supplied keys winning on collision.
1105        """
1106        self.client = client
1107        self.data = data
1108        self.dataset_version = dataset_version
1109        self.metadata = metadata

Build a RunnerContext populated with defaults for run_experiment.

Typically called by the langfuse/experiment-action GitHub Action, not by end users directly. Every field except client is optional: a field left as None simply provides no default. In that case data must be supplied on the run_experiment() call, whereas dataset_version and metadata may remain unset.

Arguments:
  • client: Initialized Langfuse SDK client used to execute the experiment. The action creates this from the langfuse_public_key / langfuse_secret_key / langfuse_base_url inputs.
  • data: Default dataset items to run the experiment on. Accepts either List[LocalExperimentItem] or List[DatasetItem]. Injected by the action when dataset_name is configured. If None, the user must pass data= to run_experiment().
  • dataset_version: Optional pinned dataset version. Injected by the action when dataset_version is configured.
  • metadata: Default metadata attached to every experiment trace and the dataset run. The action injects GitHub-sourced tags (SHA, PR link, workflow run link, branch, GH user, etc.). Merged with any metadata passed to run_experiment(), with user-supplied keys winning on collision.
client
data
dataset_version
metadata
def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[LocalExperimentItem], List[langfuse.api.DatasetItem], NoneType] = None, task: TaskFunction, evaluators: List[EvaluatorFunction] = [], composite_evaluator: Optional[langfuse.CompositeEvaluatorFunction] = None, run_evaluators: List[RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None, _dataset_version: Optional[datetime.datetime] = None) -> ExperimentResult:
1111    def run_experiment(
1112        self,
1113        *,
1114        name: str,
1115        run_name: Optional[str] = None,
1116        description: Optional[str] = None,
1117        data: Optional[ExperimentData] = None,
1118        task: TaskFunction,
1119        evaluators: List[EvaluatorFunction] = [],
1120        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
1121        run_evaluators: List[RunEvaluatorFunction] = [],
1122        max_concurrency: int = 50,
1123        metadata: Optional[Dict[str, str]] = None,
1124        _dataset_version: Optional[datetime] = None,
1125    ) -> ExperimentResult:
1126        resolved_data = data if data is not None else self.data
1127        if resolved_data is None:
1128            raise ValueError(
1129                "`data` must be provided either on the RunnerContext or the run_experiment call"
1130            )
1131
1132        resolved_dataset_version = (
1133            _dataset_version if _dataset_version is not None else self.dataset_version
1134        )
1135
1136        merged_metadata: Optional[Dict[str, str]]
1137        if self.metadata is None and metadata is None:
1138            merged_metadata = None
1139        else:
1140            merged_metadata = {**(self.metadata or {}), **(metadata or {})}
1141
1142        return self.client.run_experiment(
1143            name=name,
1144            run_name=run_name,
1145            description=description,
1146            data=resolved_data,
1147            task=task,
1148            evaluators=evaluators,
1149            composite_evaluator=composite_evaluator,
1150            run_evaluators=run_evaluators,
1151            max_concurrency=max_concurrency,
1152            metadata=merged_metadata,
1153            _dataset_version=resolved_dataset_version,
1154        )
class RegressionError(builtins.Exception):
class RegressionError(Exception):
    """Raised by a user's ``experiment`` function to signal a CI gate failure.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action catches this
    exception and, when ``should_fail_on_error`` is enabled, fails the
    workflow run and renders a callout in the PR comment using
    ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.

    Callers choose one of three forms:

    - ``RegressionError(result=r)`` — minimal, generic message.
    - ``RegressionError(result=r, message="...")`` — free-form message.
    - ``RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)`` —
      structured; ``metric`` and ``value`` must be provided together so the
      action can render a targeted callout without ``None`` placeholders.
      ``threshold`` is optional and is only mentioned in the message when
      it is given.
    """

    @overload
    def __init__(self, *, result: ExperimentResult) -> None: ...
    @overload
    def __init__(self, *, result: ExperimentResult, message: str) -> None: ...
    @overload
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: str,
        value: float,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ) -> None: ...
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: Optional[str] = None,
        value: Optional[float] = None,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ):
        """Store the regression details and build the exception message.

        Args:
            result: Experiment result the regression was detected on.
            metric: Name of the regressed metric, if any.
            value: Observed value of ``metric``.
            threshold: Threshold that ``value`` was compared against, if any.
            message: Free-form message. When given, it is used verbatim and
                takes precedence over the structured message.
        """
        self.result = result
        self.metric = metric
        self.value = value
        self.threshold = threshold
        if message is not None:
            formatted = message
        elif metric is not None and value is not None:
            formatted = f"Regression on `{metric}`: {value}"
            # Mention the threshold only when one was supplied, so the
            # message never contains a literal "None" placeholder.
            if threshold is not None:
                formatted += f" (threshold {threshold})"
        else:
            formatted = "Experiment regression detected"
        super().__init__(formatted)

Raised by a user's experiment function to signal a CI gate failure.

Intended for use with the langfuse/experiment-action GitHub Action (https://github.com/langfuse/experiment-action). The action catches this exception and, when should_fail_on_error is enabled, fails the workflow run and renders a callout in the PR comment using metric/value/threshold if supplied, otherwise str(exc).

Callers choose one of three forms:

  • RegressionError(result=r) — minimal, generic message.
  • RegressionError(result=r, message="...") — free-form message.
  • RegressionError(result=r, metric="acc", value=0.7, threshold=0.9) — structured; metric and value must be provided together so the action can render a targeted callout without None placeholders.
RegressionError( *, result: ExperimentResult, metric: Optional[str] = None, value: Optional[float] = None, threshold: Optional[float] = None, message: Optional[str] = None)
1189    def __init__(
1190        self,
1191        *,
1192        result: ExperimentResult,
1193        metric: Optional[str] = None,
1194        value: Optional[float] = None,
1195        threshold: Optional[float] = None,
1196        message: Optional[str] = None,
1197    ):
1198        self.result = result
1199        self.metric = metric
1200        self.value = value
1201        self.threshold = threshold
1202        if message is not None:
1203            formatted = message
1204        elif metric is not None and value is not None:
1205            formatted = f"Regression on `{metric}`: {value} (threshold {threshold})"
1206        else:
1207            formatted = "Experiment regression detected"
1208        super().__init__(formatted)
result
metric
value
threshold