langfuse.experiment

Langfuse experiment functionality for running and evaluating tasks on datasets.

This module provides the core experiment functionality for the Langfuse Python SDK, allowing users to run experiments on datasets with automatic tracing, evaluation, and result formatting.

   1"""Langfuse experiment functionality for running and evaluating tasks on datasets.
   2
   3This module provides the core experiment functionality for the Langfuse Python SDK,
   4allowing users to run experiments on datasets with automatic tracing, evaluation,
   5and result formatting.
   6"""
   7
   8import asyncio
   9from typing import (
  10    Any,
  11    Awaitable,
  12    Dict,
  13    List,
  14    Optional,
  15    Protocol,
  16    TypedDict,
  17    Union,
  18)
  19
  20from langfuse.api import DatasetItem
  21from langfuse.logger import langfuse_logger as logger
  22from langfuse.types import ExperimentScoreType
  23
  24
  25class LocalExperimentItem(TypedDict, total=False):
  26    """Structure for local experiment data items (not from Langfuse datasets).
  27
  28    This TypedDict defines the structure for experiment items when using local data
  29    rather than Langfuse-hosted datasets. All fields are optional to provide
  30    flexibility in data structure.
  31
  32    Attributes:
  33        input: The input data to pass to the task function. Can be any type that
  34            your task function can process (string, dict, list, etc.). This is
  35            typically the prompt, question, or data that your task will operate on.
  36        expected_output: Optional expected/ground truth output for evaluation purposes.
  37            Used by evaluators to assess correctness or quality. Can be None if
  38            no ground truth is available.
  39        metadata: Optional metadata dictionary containing additional context about
  40            this specific item. Can include information like difficulty level,
  41            category, source, or any other relevant attributes that evaluators
  42            might use for context-aware evaluation.
  43
  44    Examples:
  45        Simple text processing item:
  46        ```python
  47        item: LocalExperimentItem = {
  48            "input": "Summarize this article: ...",
  49            "expected_output": "Expected summary...",
  50            "metadata": {"difficulty": "medium", "category": "news"}
  51        }
  52        ```
  53
  54        Classification item:
  55        ```python
  56        item: LocalExperimentItem = {
  57            "input": {"text": "This movie is great!", "context": "movie review"},
  58            "expected_output": "positive",
  59            "metadata": {"dataset_source": "imdb", "confidence": 0.95}
  60        }
  61        ```
  62
  63        Minimal item with only input:
  64        ```python
  65        item: LocalExperimentItem = {
  66            "input": "What is the capital of France?"
  67        }
  68        ```
  69    """
  70
  71    input: Any
  72    expected_output: Any
  73    metadata: Optional[Dict[str, Any]]
  74
  75
# NOTE: consumers must branch on the concrete kind — LocalExperimentItem
# values are read by key (item["input"]) while DatasetItem values are read
# by attribute (item.input); ExperimentResult.format shows this pattern.
ExperimentItem = Union[LocalExperimentItem, DatasetItem]
"""Type alias for items that can be processed in experiments.

Can be either:
- LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys
- DatasetItem: Items from Langfuse datasets with .input, .expected_output, .metadata attributes
"""

ExperimentData = Union[List[LocalExperimentItem], List[DatasetItem]]
"""Type alias for experiment datasets.

Represents the collection of items to process in an experiment. Can be either:
- List[LocalExperimentItem]: Local data items as dictionaries
- List[DatasetItem]: Items from a Langfuse dataset (typically from dataset.items)
"""
  91
  92
  93class Evaluation:
  94    """Represents an evaluation result for an experiment item or an entire experiment run.
  95
  96    This class provides a strongly-typed way to create evaluation results in evaluator functions.
  97    Users must use keyword arguments when instantiating this class.
  98
  99    Attributes:
 100        name: Unique identifier for the evaluation metric. Should be descriptive
 101            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
 102            Used for aggregation and comparison across experiment runs.
 103        value: The evaluation score or result. Can be:
 104            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
 105            - String: For categorical results like "positive", "negative", "neutral"
 106            - Boolean: For binary assessments like "passes_safety_check"
 107        comment: Optional human-readable explanation of the evaluation result.
 108            Useful for providing context, explaining scoring rationale, or noting
 109            special conditions. Displayed in Langfuse UI for interpretability.
 110        metadata: Optional structured metadata about the evaluation process.
 111            Can include confidence scores, intermediate calculations, model versions,
 112            or any other relevant technical details.
 113        data_type: Optional score data type. Required if value is not NUMERIC.
 114            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
 115        config_id: Optional Langfuse score config ID.
 116
 117    Examples:
 118        Basic accuracy evaluation:
 119        ```python
 120        from langfuse import Evaluation
 121
 122        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
 123            if not expected_output:
 124                return Evaluation(name="accuracy", value=0, comment="No expected output")
 125
 126            is_correct = output.strip().lower() == expected_output.strip().lower()
 127            return Evaluation(
 128                name="accuracy",
 129                value=1.0 if is_correct else 0.0,
 130                comment="Correct answer" if is_correct else "Incorrect answer"
 131            )
 132        ```
 133
 134        Multi-metric evaluator:
 135        ```python
 136        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
 137            return [
 138                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
 139                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
 140                Evaluation(
 141                    name="quality",
 142                    value=0.85,
 143                    comment="High quality response",
 144                    metadata={"confidence": 0.92, "model": "gpt-4"}
 145                )
 146            ]
 147        ```
 148
 149        Categorical evaluation:
 150        ```python
 151        def sentiment_evaluator(*, input, output, **kwargs):
 152            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
 153            return Evaluation(
 154                name="sentiment",
 155                value=sentiment,
 156                comment=f"Response expresses {sentiment} sentiment",
 157                data_type="CATEGORICAL"
 158            )
 159        ```
 160
 161        Failed evaluation with error handling:
 162        ```python
 163        def external_api_evaluator(*, input, output, **kwargs):
 164            try:
 165                score = external_api.evaluate(output)
 166                return Evaluation(name="external_score", value=score)
 167            except Exception as e:
 168                return Evaluation(
 169                    name="external_score",
 170                    value=0,
 171                    comment=f"API unavailable: {e}",
 172                    metadata={"error": str(e), "retry_count": 3}
 173                )
 174        ```
 175
 176    Note:
 177        All arguments must be passed as keywords. Positional arguments are not allowed
 178        to ensure code clarity and prevent errors from argument reordering.
 179    """
 180
 181    def __init__(
 182        self,
 183        *,
 184        name: str,
 185        value: Union[int, float, str, bool],
 186        comment: Optional[str] = None,
 187        metadata: Optional[Dict[str, Any]] = None,
 188        data_type: Optional[ExperimentScoreType] = None,
 189        config_id: Optional[str] = None,
 190    ):
 191        """Initialize an Evaluation with the provided data.
 192
 193        Args:
 194            name: Unique identifier for the evaluation metric.
 195            value: The evaluation score or result.
 196            comment: Optional human-readable explanation of the result.
 197            metadata: Optional structured metadata about the evaluation process.
 198            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
 199            config_id: Optional Langfuse score config ID.
 200
 201        Note:
 202            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
 203        """
 204        self.name = name
 205        self.value = value
 206        self.comment = comment
 207        self.metadata = metadata
 208        self.data_type = data_type
 209        self.config_id = config_id
 210
 211
 212class ExperimentItemResult:
 213    """Result structure for individual experiment items.
 214
 215    This class represents the complete result of processing a single item
 216    during an experiment run, including the original input, task output,
 217    evaluations, and tracing information. Users must use keyword arguments when instantiating this class.
 218
 219    Attributes:
 220        item: The original experiment item that was processed. Can be either
 221            a dictionary with 'input', 'expected_output', and 'metadata' keys,
 222            or a DatasetItem from Langfuse datasets.
 223        output: The actual output produced by the task function for this item.
 224            Can be any type depending on what your task function returns.
 225        evaluations: List of evaluation results for this item. Each evaluation
 226            contains a name, value, optional comment, and optional metadata.
 227        trace_id: Optional Langfuse trace ID for this item's execution. Used
 228            to link the experiment result with the detailed trace in Langfuse UI.
 229        dataset_run_id: Optional dataset run ID if this item was part of a
 230            Langfuse dataset. None for local experiments.
 231
 232    Examples:
 233        Accessing item result data:
 234        ```python
 235        result = langfuse.run_experiment(...)
 236        for item_result in result.item_results:
 237            print(f"Input: {item_result.item}")
 238            print(f"Output: {item_result.output}")
 239            print(f"Trace: {item_result.trace_id}")
 240
 241            # Access evaluations
 242            for evaluation in item_result.evaluations:
 243                print(f"{evaluation.name}: {evaluation.value}")
 244        ```
 245
 246        Working with different item types:
 247        ```python
 248        # Local experiment item (dict)
 249        if isinstance(item_result.item, dict):
 250            input_data = item_result.item["input"]
 251            expected = item_result.item.get("expected_output")
 252
 253        # Langfuse dataset item (object with attributes)
 254        else:
 255            input_data = item_result.item.input
 256            expected = item_result.item.expected_output
 257        ```
 258
 259    Note:
 260        All arguments must be passed as keywords. Positional arguments are not allowed
 261        to ensure code clarity and prevent errors from argument reordering.
 262    """
 263
 264    def __init__(
 265        self,
 266        *,
 267        item: ExperimentItem,
 268        output: Any,
 269        evaluations: List[Evaluation],
 270        trace_id: Optional[str],
 271        dataset_run_id: Optional[str],
 272    ):
 273        """Initialize an ExperimentItemResult with the provided data.
 274
 275        Args:
 276            item: The original experiment item that was processed.
 277            output: The actual output produced by the task function for this item.
 278            evaluations: List of evaluation results for this item.
 279            trace_id: Optional Langfuse trace ID for this item's execution.
 280            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
 281
 282        Note:
 283            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
 284        """
 285        self.item = item
 286        self.output = output
 287        self.evaluations = evaluations
 288        self.trace_id = trace_id
 289        self.dataset_run_id = dataset_run_id
 290
 291
 292class ExperimentResult:
 293    """Complete result structure for experiment execution.
 294
 295    This class encapsulates the complete results of running an experiment on a dataset,
 296    including individual item results, aggregate run-level evaluations, and metadata
 297    about the experiment execution.
 298
 299    Attributes:
 300        name: The name of the experiment as specified during execution.
 301        run_name: The name of the current experiment run.
 302        description: Optional description of the experiment's purpose or methodology.
 303        item_results: List of results from processing each individual dataset item,
 304            containing the original item, task output, evaluations, and trace information.
 305        run_evaluations: List of aggregate evaluation results computed across all items,
 306            such as average scores, statistical summaries, or cross-item analyses.
 307        experiment_id: ID of the experiment run propagated across all items. For
 308            Langfuse datasets, this matches the dataset run ID. For local experiments,
 309            this is a stable SDK-generated identifier for the run.
 310        dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
 311        dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
 312
 313    Examples:
 314        Basic usage with local dataset:
 315        ```python
 316        result = langfuse.run_experiment(
 317            name="Capital Cities Test",
 318            data=local_data,
 319            task=generate_capital,
 320            evaluators=[accuracy_check]
 321        )
 322
 323        print(f"Processed {len(result.item_results)} items")
 324        print(result.format())  # Human-readable summary
 325
 326        # Access individual results
 327        for item_result in result.item_results:
 328            print(f"Input: {item_result.item}")
 329            print(f"Output: {item_result.output}")
 330            print(f"Scores: {item_result.evaluations}")
 331        ```
 332
 333        Usage with Langfuse datasets:
 334        ```python
 335        dataset = langfuse.get_dataset("qa-eval-set")
 336        result = dataset.run_experiment(
 337            name="GPT-4 QA Evaluation",
 338            task=answer_question,
 339            evaluators=[relevance_check, accuracy_check]
 340        )
 341
 342        # View in Langfuse UI
 343        if result.dataset_run_url:
 344            print(f"View detailed results: {result.dataset_run_url}")
 345        ```
 346
 347        Formatted output:
 348        ```python
 349        # Get summary view
 350        summary = result.format()
 351        print(summary)
 352
 353        # Get detailed view with individual items
 354        detailed = result.format(include_item_results=True)
 355        with open("experiment_report.txt", "w") as f:
 356            f.write(detailed)
 357        ```
 358    """
 359
 360    def __init__(
 361        self,
 362        *,
 363        name: str,
 364        run_name: str,
 365        description: Optional[str],
 366        item_results: List[ExperimentItemResult],
 367        run_evaluations: List[Evaluation],
 368        experiment_id: str,
 369        dataset_run_id: Optional[str] = None,
 370        dataset_run_url: Optional[str] = None,
 371    ):
 372        """Initialize an ExperimentResult with the provided data.
 373
 374        Args:
 375            name: The name of the experiment.
 376            run_name: The current experiment run name.
 377            description: Optional description of the experiment.
 378            item_results: List of results from processing individual dataset items.
 379            run_evaluations: List of aggregate evaluation results for the entire run.
 380            experiment_id: ID of the experiment run.
 381            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
 382            dataset_run_url: Optional URL to view results in Langfuse UI.
 383        """
 384        self.name = name
 385        self.run_name = run_name
 386        self.description = description
 387        self.item_results = item_results
 388        self.run_evaluations = run_evaluations
 389        self.experiment_id = experiment_id
 390        self.dataset_run_id = dataset_run_id
 391        self.dataset_run_url = dataset_run_url
 392
 393    def format(self, *, include_item_results: bool = False) -> str:
 394        r"""Format the experiment result for human-readable display.
 395
 396        Converts the experiment result into a nicely formatted string suitable for
 397        console output, logging, or reporting. The output includes experiment overview,
 398        aggregate statistics, and optionally individual item details.
 399
 400        This method provides a comprehensive view of experiment performance including:
 401        - Experiment metadata (name, description, item count)
 402        - List of evaluation metrics used across items
 403        - Average scores computed across all processed items
 404        - Run-level evaluation results (aggregate metrics)
 405        - Links to view detailed results in Langfuse UI (when available)
 406        - Individual item details (when requested)
 407
 408        Args:
 409            include_item_results: Whether to include detailed results for each individual
 410                item in the formatted output. When False (default), only shows aggregate
 411                statistics and summary information. When True, includes input/output/scores
 412                for every processed item, making the output significantly longer but more
 413                detailed for debugging and analysis purposes.
 414
 415        Returns:
 416            A formatted multi-line string containing:
 417            - Experiment name and description (if provided)
 418            - Total number of items successfully processed
 419            - List of all evaluation metrics that were applied
 420            - Average scores across all items for each numeric metric
 421            - Run-level evaluation results with comments
 422            - Dataset run URL for viewing in Langfuse UI (if applicable)
 423            - Individual item details including inputs, outputs, and scores (if requested)
 424
 425        Examples:
 426            Basic usage showing aggregate results only:
 427            ```python
 428            result = langfuse.run_experiment(
 429                name="Capital Cities",
 430                data=dataset,
 431                task=generate_capital,
 432                evaluators=[accuracy_evaluator]
 433            )
 434
 435            print(result.format())
 436            # Output:
 437            # ──────────────────────────────────────────────────
 438            # 📊 Capital Cities
 439            # 100 items
 440            # Evaluations:
 441            #   • accuracy
 442            # Average Scores:
 443            #   • accuracy: 0.850
 444            ```
 445
 446            Detailed output including all individual item results:
 447            ```python
 448            detailed_report = result.format(include_item_results=True)
 449            print(detailed_report)
 450            # Output includes each item:
 451            # 1. Item 1:
 452            #    Input:    What is the capital of France?
 453            #    Expected: Paris
 454            #    Actual:   The capital of France is Paris.
 455            #    Scores:
 456            #      • accuracy: 1.000
 457            #        💭 Correct answer found
 458            # [... continues for all items ...]
 459            ```
 460
 461            Saving formatted results to file for reporting:
 462            ```python
 463            with open("experiment_report.txt", "w") as f:
 464                f.write(result.format(include_item_results=True))
 465
 466            # Or create summary report
 467            summary = result.format()  # Aggregate view only
 468            print(f"Experiment Summary:\n{summary}")
 469            ```
 470
 471            Integration with logging systems:
 472            ```python
 473            import logging
 474            logger = logging.getLogger("experiments")
 475
 476            # Log summary after experiment
 477            logger.info(f"Experiment completed:\n{result.format()}")
 478
 479            # Log detailed results for failed experiments
 480            if any(eval['value'] < threshold for eval in result.run_evaluations):
 481                logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
 482            ```
 483        """
 484        if not self.item_results:
 485            return "No experiment results to display."
 486
 487        output = ""
 488
 489        # Individual results section
 490        if include_item_results:
 491            for i, result in enumerate(self.item_results):
 492                output += f"\n{i + 1}. Item {i + 1}:\n"
 493
 494                # Extract and display input
 495                item_input = None
 496                if isinstance(result.item, dict):
 497                    item_input = result.item.get("input")
 498                elif hasattr(result.item, "input"):
 499                    item_input = result.item.input
 500
 501                if item_input is not None:
 502                    output += f"   Input:    {_format_value(item_input)}\n"
 503
 504                # Extract and display expected output
 505                expected_output = None
 506                if isinstance(result.item, dict):
 507                    expected_output = result.item.get("expected_output")
 508                elif hasattr(result.item, "expected_output"):
 509                    expected_output = result.item.expected_output
 510
 511                if expected_output is not None:
 512                    output += f"   Expected: {_format_value(expected_output)}\n"
 513                output += f"   Actual:   {_format_value(result.output)}\n"
 514
 515                # Display evaluation scores
 516                if result.evaluations:
 517                    output += "   Scores:\n"
 518                    for evaluation in result.evaluations:
 519                        score = evaluation.value
 520                        if isinstance(score, (int, float)):
 521                            score = f"{score:.3f}"
 522                        output += f"     • {evaluation.name}: {score}"
 523                        if evaluation.comment:
 524                            output += f"\n       💭 {evaluation.comment}"
 525                        output += "\n"
 526
 527                # Display trace link if available
 528                if result.trace_id:
 529                    output += f"\n   Trace ID: {result.trace_id}\n"
 530        else:
 531            output += f"Individual Results: Hidden ({len(self.item_results)} items)\n"
 532            output += "💡 Set include_item_results=True to view them\n"
 533
 534        # Experiment overview section
 535        output += f"\n{'─' * 50}\n"
 536        output += f"🧪 Experiment: {self.name}"
 537        output += f"\n📋 Run name: {self.run_name}"
 538        if self.description:
 539            output += f" - {self.description}"
 540
 541        output += f"\n{len(self.item_results)} items"
 542
 543        # Collect unique evaluation names across all items
 544        evaluation_names = set()
 545        for result in self.item_results:
 546            for evaluation in result.evaluations:
 547                evaluation_names.add(evaluation.name)
 548
 549        if evaluation_names:
 550            output += "\nEvaluations:"
 551            for eval_name in evaluation_names:
 552                output += f"\n  • {eval_name}"
 553            output += "\n"
 554
 555        # Calculate and display average scores
 556        if evaluation_names:
 557            output += "\nAverage Scores:"
 558            for eval_name in evaluation_names:
 559                scores = []
 560                for result in self.item_results:
 561                    for evaluation in result.evaluations:
 562                        if evaluation.name == eval_name and isinstance(
 563                            evaluation.value, (int, float)
 564                        ):
 565                            scores.append(evaluation.value)
 566
 567                if scores:
 568                    avg = sum(scores) / len(scores)
 569                    output += f"\n  • {eval_name}: {avg:.3f}"
 570            output += "\n"
 571
 572        # Display run-level evaluations
 573        if self.run_evaluations:
 574            output += "\nRun Evaluations:"
 575            for run_eval in self.run_evaluations:
 576                score = run_eval.value
 577                if isinstance(score, (int, float)):
 578                    score = f"{score:.3f}"
 579                output += f"\n  • {run_eval.name}: {score}"
 580                if run_eval.comment:
 581                    output += f"\n    💭 {run_eval.comment}"
 582            output += "\n"
 583
 584        # Add dataset run URL if available
 585        if self.dataset_run_url:
 586            output += f"\n🔗 Dataset Run:\n   {self.dataset_run_url}"
 587
 588        return output
 589
 590
 591class TaskFunction(Protocol):
 592    """Protocol defining the interface for experiment task functions.
 593
 594    Task functions are the core processing functions that operate on each item
 595    in an experiment dataset. They receive an experiment item as input and
 596    produce some output that will be evaluated.
 597
 598    Task functions must:
 599    - Accept 'item' as a keyword argument
 600    - Return any type of output (will be passed to evaluators)
 601    - Can be either synchronous or asynchronous
 602    - Should handle their own errors gracefully (exceptions will be logged)
 603    """
 604
 605    def __call__(
 606        self,
 607        *,
 608        item: ExperimentItem,
 609        **kwargs: Dict[str, Any],
 610    ) -> Union[Any, Awaitable[Any]]:
 611        """Execute the task on an experiment item.
 612
 613        This method defines the core processing logic for each item in your experiment.
 614        The implementation should focus on the specific task you want to evaluate,
 615        such as text generation, classification, summarization, etc.
 616
 617        Args:
 618            item: The experiment item to process. Can be either:
 619                - Dict with keys like 'input', 'expected_output', 'metadata'
 620                - Langfuse DatasetItem object with .input, .expected_output attributes
 621            **kwargs: Additional keyword arguments that may be passed by the framework
 622
 623        Returns:
 624            Any: The output of processing the item. This output will be:
 625            - Stored in the experiment results
 626            - Passed to all item-level evaluators for assessment
 627            - Traced automatically in Langfuse for observability
 628
 629            Can return either a direct value or an awaitable (async) result.
 630
 631        Examples:
 632            Simple synchronous task:
 633            ```python
 634            def my_task(*, item, **kwargs):
 635                prompt = f"Summarize: {item['input']}"
 636                return my_llm_client.generate(prompt)
 637            ```
 638
 639            Async task with error handling:
 640            ```python
 641            async def my_async_task(*, item, **kwargs):
 642                try:
 643                    response = await openai_client.chat.completions.create(
 644                        model="gpt-4",
 645                        messages=[{"role": "user", "content": item["input"]}]
 646                    )
 647                    return response.choices[0].message.content
 648                except Exception as e:
 649                    # Log error and return fallback
 650                    print(f"Task failed for item {item}: {e}")
 651                    return "Error: Could not process item"
 652            ```
 653
 654            Task using dataset item attributes:
 655            ```python
 656            def classification_task(*, item, **kwargs):
 657                # Works with both dict items and DatasetItem objects
 658                text = item["input"] if isinstance(item, dict) else item.input
 659                return classify_text(text)
 660            ```
 661        """
 662        ...
 663
 664
 665class EvaluatorFunction(Protocol):
 666    """Protocol defining the interface for item-level evaluator functions.
 667
 668    Item-level evaluators assess the quality, correctness, or other properties
 669    of individual task outputs. They receive the input, output, expected output,
 670    and metadata for each item and return evaluation metrics.
 671
 672    Evaluators should:
 673    - Accept input, output, expected_output, and metadata as keyword arguments
 674    - Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields
 675    - Be deterministic when possible for reproducible results
 676    - Handle edge cases gracefully (missing expected output, malformed data, etc.)
 677    - Can be either synchronous or asynchronous
 678    """
 679
    def __call__(
        self,
        *,
        input: Any,
        output: Any,
        expected_output: Any,
        metadata: Optional[Dict[str, Any]],
        # NOTE: ``**kwargs: Any`` types each extra keyword VALUE as Any (PEP 484);
        # the previous ``Dict[str, Any]`` incorrectly required every value to be a dict.
        **kwargs: Any,
    ) -> Union[
        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
    ]:
        r"""Evaluate a task output for quality, correctness, or other metrics.

        This method should implement specific evaluation logic such as accuracy checking,
        similarity measurement, toxicity detection, fluency assessment, etc.

        Args:
            input: The original input that was passed to the task function.
                This is typically the item['input'] or item.input value.
            output: The output produced by the task function for this input.
                This is the direct return value from your task function.
            expected_output: The expected/ground truth output for comparison.
                May be None if not available in the dataset. Evaluators should
                handle this case appropriately.
            metadata: Optional metadata from the experiment item that might
                contain additional context for evaluation (categories, difficulty, etc.)
            **kwargs: Additional keyword arguments that may be passed by the framework

        Returns:
            Evaluation results in one of these formats:
            - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."}
            - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}]
            - Awaitable returning either of the above (for async evaluators)

            Each Evaluation dict should contain:
            - name (str): Unique identifier for this evaluation metric
            - value (int|float|str|bool): The evaluation score or result
            - comment (str, optional): Human-readable explanation of the result
            - metadata (dict, optional): Additional structured data about the evaluation

        Examples:
            Simple accuracy evaluator:
            ```python
            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
                if expected_output is None:
                    return {"name": "accuracy", "value": 0, "comment": "No expected output"}

                is_correct = output.strip().lower() == expected_output.strip().lower()
                return {
                    "name": "accuracy",
                    "value": 1.0 if is_correct else 0.0,
                    "comment": "Exact match" if is_correct else "No match"
                }
            ```

            Multi-metric evaluator:
            ```python
            def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
                results = []

                # Length check
                results.append({
                    "name": "output_length",
                    "value": len(output),
                    "comment": f"Output contains {len(output)} characters"
                })

                # Sentiment analysis
                sentiment_score = analyze_sentiment(output)
                results.append({
                    "name": "sentiment",
                    "value": sentiment_score,
                    "comment": f"Sentiment score: {sentiment_score:.2f}"
                })

                return results
            ```

            Async evaluator using external API:
            ```python
            async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs):
                prompt = f"Rate the quality of this response on a scale of 1-10:\n"
                prompt += f"Question: {input}\nResponse: {output}"

                response = await openai_client.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}]
                )

                try:
                    score = float(response.choices[0].message.content.strip())
                    return {
                        "name": "llm_judge_quality",
                        "value": score,
                        "comment": f"LLM judge rated this {score}/10"
                    }
                except ValueError:
                    return {
                        "name": "llm_judge_quality",
                        "value": 0,
                        "comment": "Could not parse LLM judge score"
                    }
            ```

            Context-aware evaluator:
            ```python
            def context_evaluator(*, input, output, metadata=None, **kwargs):
                # Use metadata for context-specific evaluation
                difficulty = metadata.get("difficulty", "medium") if metadata else "medium"

                # Adjust expectations based on difficulty
                min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty]

                meets_requirement = len(output) >= min_length
                return {
                    "name": f"meets_{difficulty}_requirement",
                    "value": meets_requirement,
                    "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement"
                }
            ```
        """
        ...
 802
 803
class RunEvaluatorFunction(Protocol):
    """Protocol defining the interface for run-level evaluator functions.

    Run-level evaluators assess aggregate properties of the entire experiment run,
    computing metrics that span across all items rather than individual outputs.
    They receive the complete results from all processed items and can compute
    statistics like averages, distributions, correlations, or other aggregate metrics.

    Run evaluators should:
    - Accept item_results as a keyword argument containing all item results
    - Return Evaluation dict(s) with aggregate metrics
    - Handle cases where some items may have failed processing
    - Compute meaningful statistics across the dataset
    - Can be either synchronous or asynchronous
    """

    def __call__(
        self,
        *,
        item_results: List[ExperimentItemResult],
        # NOTE: ``**kwargs: Any`` types each extra keyword VALUE as Any (PEP 484);
        # the previous ``Dict[str, Any]`` incorrectly required every value to be a dict.
        **kwargs: Any,
    ) -> Union[
        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
    ]:
        r"""Evaluate the entire experiment run with aggregate metrics.

        This method should implement aggregate evaluation logic such as computing
        averages, calculating distributions, finding correlations, detecting patterns
        across items, or performing statistical analysis on the experiment results.

        Args:
            item_results: List of results from all successfully processed experiment items.
                Each item result contains:
                - item: The original experiment item
                - output: The task function's output for this item
                - evaluations: List of item-level evaluation results
                - trace_id: Langfuse trace ID for this execution
                - dataset_run_id: Dataset run ID (if using Langfuse datasets)

                Note: This list only includes items that were successfully processed.
                Failed items are excluded but logged separately.
            **kwargs: Additional keyword arguments that may be passed by the framework

        Returns:
            Evaluation results in one of these formats:
            - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."}
            - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}]
            - Awaitable returning either of the above (for async evaluators)

            Each Evaluation dict should contain:
            - name (str): Unique identifier for this run-level metric
            - value (int|float|str|bool): The aggregate evaluation result
            - comment (str, optional): Human-readable explanation of the metric
            - metadata (dict, optional): Additional structured data about the evaluation

        Examples:
            Average accuracy calculator:
            ```python
            def average_accuracy(*, item_results, **kwargs):
                if not item_results:
                    return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"}

                accuracy_values = []
                for result in item_results:
                    for evaluation in result.evaluations:
                        if evaluation.name == "accuracy":
                            accuracy_values.append(evaluation.value)

                if not accuracy_values:
                    return {"name": "avg_accuracy", "value": 0, "comment": "No accuracy evaluations found"}

                avg = sum(accuracy_values) / len(accuracy_values)
                return {
                    "name": "avg_accuracy",
                    "value": avg,
                    "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}"
                }
            ```

            Multiple aggregate metrics:
            ```python
            def statistical_summary(*, item_results, **kwargs):
                if not item_results:
                    return []

                results = []

                # Calculate output length statistics
                lengths = [len(str(result.output)) for result in item_results]
                results.extend([
                    {"name": "avg_output_length", "value": sum(lengths) / len(lengths)},
                    {"name": "min_output_length", "value": min(lengths)},
                    {"name": "max_output_length", "value": max(lengths)}
                ])

                # Success rate
                total_items = len(item_results)  # Only successful items are included
                results.append({
                    "name": "processing_success_rate",
                    "value": 1.0,  # All items in item_results succeeded
                    "comment": f"Successfully processed {total_items} items"
                })

                return results
            ```

            Async run evaluator with external analysis:
            ```python
            async def llm_batch_analysis(*, item_results, **kwargs):
                # Prepare batch analysis prompt
                outputs = [result.output for result in item_results]
                prompt = f"Analyze these {len(outputs)} outputs for common themes:\n"
                prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs))

                response = await openai_client.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}]
                )

                return {
                    "name": "thematic_analysis",
                    "value": response.choices[0].message.content,
                    "comment": f"LLM analysis of {len(outputs)} outputs"
                }
            ```

            Performance distribution analysis:
            ```python
            def performance_distribution(*, item_results, **kwargs):
                # Extract all evaluation scores
                all_scores = []
                score_by_metric = {}

                for result in item_results:
                    for evaluation in result.evaluations:
                        metric_name = evaluation.name
                        value = evaluation.value

                        if isinstance(value, (int, float)):
                            all_scores.append(value)
                            if metric_name not in score_by_metric:
                                score_by_metric[metric_name] = []
                            score_by_metric[metric_name].append(value)

                results = []

                # Overall score distribution
                if all_scores:
                    import statistics
                    results.append({
                        "name": "score_std_dev",
                        "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0,
                        "comment": f"Standard deviation across all numeric scores"
                    })

                # Per-metric statistics
                for metric, scores in score_by_metric.items():
                    if len(scores) > 1:
                        results.append({
                            "name": f"{metric}_variance",
                            "value": statistics.variance(scores),
                            "comment": f"Variance in {metric} across {len(scores)} items"
                        })

                return results
            ```
        """
        ...
 972
 973
 974def _format_value(value: Any) -> str:
 975    """Format a value for display."""
 976    if isinstance(value, str):
 977        return value[:50] + "..." if len(value) > 50 else value
 978    return str(value)
 979
 980
 981async def _run_evaluator(
 982    evaluator: Union[EvaluatorFunction, RunEvaluatorFunction], **kwargs: Any
 983) -> List[Evaluation]:
 984    """Run an evaluator function and normalize the result."""
 985    try:
 986        result = evaluator(**kwargs)
 987
 988        # Handle async evaluators
 989        if asyncio.iscoroutine(result):
 990            result = await result
 991
 992        # Normalize to list
 993        if isinstance(result, (dict, Evaluation)):
 994            return [result]  # type: ignore
 995
 996        elif isinstance(result, list):
 997            return result
 998
 999        else:
1000            return []
1001
1002    except Exception as e:
1003        evaluator_name = getattr(evaluator, "__name__", "unknown_evaluator")
1004        logger.error(f"Evaluator {evaluator_name} failed: {e}")
1005        return []
1006
1007
async def _run_task(task: TaskFunction, item: ExperimentItem) -> Any:
    """Execute a task function on a single item, awaiting the result if needed.

    Args:
        task: The task callable; may be synchronous or return an awaitable.
        item: The experiment item, passed as the ``item`` keyword argument.

    Returns:
        The task's (awaited) output.
    """
    import inspect

    result = task(item=item)

    # Tasks may return any Awaitable (coroutine, Task, Future); check with
    # inspect.isawaitable instead of asyncio.iscoroutine, which would leave
    # non-coroutine awaitables unawaited.
    if inspect.isawaitable(result):
        result = await result

    return result
1017
1018
def create_evaluator_from_autoevals(
    autoevals_evaluator: Any, **kwargs: Any
) -> EvaluatorFunction:
    """Create a Langfuse evaluator from an autoevals evaluator.

    Args:
        autoevals_evaluator: An autoevals evaluator instance.
        **kwargs: Additional arguments forwarded to the autoevals evaluator
            on every invocation. (Annotated as ``Any`` per value — the
            previous ``Optional[Dict[str, Any]]`` mistyped each argument.)

    Returns:
        A Langfuse-compatible evaluator function.
    """

    def langfuse_evaluator(
        *,
        input: Any,
        output: Any,
        expected_output: Any,
        metadata: Optional[Dict[str, Any]],
        **langfuse_kwargs: Any,
    ) -> Evaluation:
        """Adapt the autoevals call signature and result to a Langfuse Evaluation."""
        # autoevals uses 'expected' where Langfuse uses 'expected_output'.
        evaluation = autoevals_evaluator(
            input=input, output=output, expected=expected_output, **kwargs
        )

        return Evaluation(
            name=evaluation.name,
            value=evaluation.score,
            # autoevals stores any human-readable note under metadata["comment"].
            comment=(evaluation.metadata or {}).get("comment"),
            metadata=evaluation.metadata,
        )

    return langfuse_evaluator
class LocalExperimentItem(typing.TypedDict, total=False):
26class LocalExperimentItem(TypedDict, total=False):
27    """Structure for local experiment data items (not from Langfuse datasets).
28
29    This TypedDict defines the structure for experiment items when using local data
30    rather than Langfuse-hosted datasets. All fields are optional to provide
31    flexibility in data structure.
32
33    Attributes:
34        input: The input data to pass to the task function. Can be any type that
35            your task function can process (string, dict, list, etc.). This is
36            typically the prompt, question, or data that your task will operate on.
37        expected_output: Optional expected/ground truth output for evaluation purposes.
38            Used by evaluators to assess correctness or quality. Can be None if
39            no ground truth is available.
40        metadata: Optional metadata dictionary containing additional context about
41            this specific item. Can include information like difficulty level,
42            category, source, or any other relevant attributes that evaluators
43            might use for context-aware evaluation.
44
45    Examples:
46        Simple text processing item:
47        ```python
48        item: LocalExperimentItem = {
49            "input": "Summarize this article: ...",
50            "expected_output": "Expected summary...",
51            "metadata": {"difficulty": "medium", "category": "news"}
52        }
53        ```
54
55        Classification item:
56        ```python
57        item: LocalExperimentItem = {
58            "input": {"text": "This movie is great!", "context": "movie review"},
59            "expected_output": "positive",
60            "metadata": {"dataset_source": "imdb", "confidence": 0.95}
61        }
62        ```
63
64        Minimal item with only input:
65        ```python
66        item: LocalExperimentItem = {
67            "input": "What is the capital of France?"
68        }
69        ```
70    """
71
72    input: Any
73    expected_output: Any
74    metadata: Optional[Dict[str, Any]]

Structure for local experiment data items (not from Langfuse datasets).

This TypedDict defines the structure for experiment items when using local data rather than Langfuse-hosted datasets. All fields are optional to provide flexibility in data structure.

Attributes:
  • input: The input data to pass to the task function. Can be any type that your task function can process (string, dict, list, etc.). This is typically the prompt, question, or data that your task will operate on.
  • expected_output: Optional expected/ground truth output for evaluation purposes. Used by evaluators to assess correctness or quality. Can be None if no ground truth is available.
  • metadata: Optional metadata dictionary containing additional context about this specific item. Can include information like difficulty level, category, source, or any other relevant attributes that evaluators might use for context-aware evaluation.
Examples:

Simple text processing item:

item: LocalExperimentItem = {
    "input": "Summarize this article: ...",
    "expected_output": "Expected summary...",
    "metadata": {"difficulty": "medium", "category": "news"}
}

Classification item:

item: LocalExperimentItem = {
    "input": {"text": "This movie is great!", "context": "movie review"},
    "expected_output": "positive",
    "metadata": {"dataset_source": "imdb", "confidence": 0.95}
}

Minimal item with only input:

item: LocalExperimentItem = {
    "input": "What is the capital of France?"
}
input: Any
expected_output: Any
metadata: Optional[Dict[str, Any]]
ExperimentItem = typing.Union[LocalExperimentItem, langfuse.api.DatasetItem]

Type alias for items that can be processed in experiments.

Can be either:

  • LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys
  • DatasetItem: Items from Langfuse datasets with .input, .expected_output, .metadata attributes
ExperimentData = typing.Union[typing.List[LocalExperimentItem], typing.List[langfuse.api.DatasetItem]]

Type alias for experiment datasets.

Represents the collection of items to process in an experiment. Can be either:

  • List[LocalExperimentItem]: Local data items as dictionaries
  • List[DatasetItem]: Items from a Langfuse dataset (typically from dataset.items)
class Evaluation:
 94class Evaluation:
 95    """Represents an evaluation result for an experiment item or an entire experiment run.
 96
 97    This class provides a strongly-typed way to create evaluation results in evaluator functions.
 98    Users must use keyword arguments when instantiating this class.
 99
100    Attributes:
101        name: Unique identifier for the evaluation metric. Should be descriptive
102            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
103            Used for aggregation and comparison across experiment runs.
104        value: The evaluation score or result. Can be:
105            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
106            - String: For categorical results like "positive", "negative", "neutral"
107            - Boolean: For binary assessments like "passes_safety_check"
108        comment: Optional human-readable explanation of the evaluation result.
109            Useful for providing context, explaining scoring rationale, or noting
110            special conditions. Displayed in Langfuse UI for interpretability.
111        metadata: Optional structured metadata about the evaluation process.
112            Can include confidence scores, intermediate calculations, model versions,
113            or any other relevant technical details.
114        data_type: Optional score data type. Required if value is not NUMERIC.
115            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
116        config_id: Optional Langfuse score config ID.
117
118    Examples:
119        Basic accuracy evaluation:
120        ```python
121        from langfuse import Evaluation
122
123        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
124            if not expected_output:
125                return Evaluation(name="accuracy", value=0, comment="No expected output")
126
127            is_correct = output.strip().lower() == expected_output.strip().lower()
128            return Evaluation(
129                name="accuracy",
130                value=1.0 if is_correct else 0.0,
131                comment="Correct answer" if is_correct else "Incorrect answer"
132            )
133        ```
134
135        Multi-metric evaluator:
136        ```python
137        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
138            return [
139                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
140                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
141                Evaluation(
142                    name="quality",
143                    value=0.85,
144                    comment="High quality response",
145                    metadata={"confidence": 0.92, "model": "gpt-4"}
146                )
147            ]
148        ```
149
150        Categorical evaluation:
151        ```python
152        def sentiment_evaluator(*, input, output, **kwargs):
153            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
154            return Evaluation(
155                name="sentiment",
156                value=sentiment,
157                comment=f"Response expresses {sentiment} sentiment",
158                data_type="CATEGORICAL"
159            )
160        ```
161
162        Failed evaluation with error handling:
163        ```python
164        def external_api_evaluator(*, input, output, **kwargs):
165            try:
166                score = external_api.evaluate(output)
167                return Evaluation(name="external_score", value=score)
168            except Exception as e:
169                return Evaluation(
170                    name="external_score",
171                    value=0,
172                    comment=f"API unavailable: {e}",
173                    metadata={"error": str(e), "retry_count": 3}
174                )
175        ```
176
177    Note:
178        All arguments must be passed as keywords. Positional arguments are not allowed
179        to ensure code clarity and prevent errors from argument reordering.
180    """
181
182    def __init__(
183        self,
184        *,
185        name: str,
186        value: Union[int, float, str, bool],
187        comment: Optional[str] = None,
188        metadata: Optional[Dict[str, Any]] = None,
189        data_type: Optional[ExperimentScoreType] = None,
190        config_id: Optional[str] = None,
191    ):
192        """Initialize an Evaluation with the provided data.
193
194        Args:
195            name: Unique identifier for the evaluation metric.
196            value: The evaluation score or result.
197            comment: Optional human-readable explanation of the result.
198            metadata: Optional structured metadata about the evaluation process.
199            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
200            config_id: Optional Langfuse score config ID.
201
202        Note:
203            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
204        """
205        self.name = name
206        self.value = value
207        self.comment = comment
208        self.metadata = metadata
209        self.data_type = data_type
210        self.config_id = config_id

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=0, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=0,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, config_id: Optional[str] = None)
182    def __init__(
183        self,
184        *,
185        name: str,
186        value: Union[int, float, str, bool],
187        comment: Optional[str] = None,
188        metadata: Optional[Dict[str, Any]] = None,
189        data_type: Optional[ExperimentScoreType] = None,
190        config_id: Optional[str] = None,
191    ):
192        """Initialize an Evaluation with the provided data.
193
194        Args:
195            name: Unique identifier for the evaluation metric.
196            value: The evaluation score or result.
197            comment: Optional human-readable explanation of the result.
198            metadata: Optional structured metadata about the evaluation process.
199            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
200            config_id: Optional Langfuse score config ID.
201
202        Note:
203            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
204        """
205        self.name = name
206        self.value = value
207        self.comment = comment
208        self.metadata = metadata
209        self.data_type = data_type
210        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
class ExperimentItemResult:
213class ExperimentItemResult:
214    """Result structure for individual experiment items.
215
216    This class represents the complete result of processing a single item
217    during an experiment run, including the original input, task output,
218    evaluations, and tracing information. Users must use keyword arguments when instantiating this class.
219
220    Attributes:
221        item: The original experiment item that was processed. Can be either
222            a dictionary with 'input', 'expected_output', and 'metadata' keys,
223            or a DatasetItem from Langfuse datasets.
224        output: The actual output produced by the task function for this item.
225            Can be any type depending on what your task function returns.
226        evaluations: List of evaluation results for this item. Each evaluation
227            contains a name, value, optional comment, and optional metadata.
228        trace_id: Optional Langfuse trace ID for this item's execution. Used
229            to link the experiment result with the detailed trace in Langfuse UI.
230        dataset_run_id: Optional dataset run ID if this item was part of a
231            Langfuse dataset. None for local experiments.
232
233    Examples:
234        Accessing item result data:
235        ```python
236        result = langfuse.run_experiment(...)
237        for item_result in result.item_results:
238            print(f"Input: {item_result.item}")
239            print(f"Output: {item_result.output}")
240            print(f"Trace: {item_result.trace_id}")
241
242            # Access evaluations
243            for evaluation in item_result.evaluations:
244                print(f"{evaluation.name}: {evaluation.value}")
245        ```
246
247        Working with different item types:
248        ```python
249        # Local experiment item (dict)
250        if isinstance(item_result.item, dict):
251            input_data = item_result.item["input"]
252            expected = item_result.item.get("expected_output")
253
254        # Langfuse dataset item (object with attributes)
255        else:
256            input_data = item_result.item.input
257            expected = item_result.item.expected_output
258        ```
259
260    Note:
261        All arguments must be passed as keywords. Positional arguments are not allowed
262        to ensure code clarity and prevent errors from argument reordering.
263    """
264
265    def __init__(
266        self,
267        *,
268        item: ExperimentItem,
269        output: Any,
270        evaluations: List[Evaluation],
271        trace_id: Optional[str],
272        dataset_run_id: Optional[str],
273    ):
274        """Initialize an ExperimentItemResult with the provided data.
275
276        Args:
277            item: The original experiment item that was processed.
278            output: The actual output produced by the task function for this item.
279            evaluations: List of evaluation results for this item.
280            trace_id: Optional Langfuse trace ID for this item's execution.
281            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
282
283        Note:
284            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
285        """
286        self.item = item
287        self.output = output
288        self.evaluations = evaluations
289        self.trace_id = trace_id
290        self.dataset_run_id = dataset_run_id

Result structure for individual experiment items.

This class represents the complete result of processing a single item during an experiment run, including the original input, task output, evaluations, and tracing information. Users must use keyword arguments when instantiating this class.

Attributes:
  • item: The original experiment item that was processed. Can be either a dictionary with 'input', 'expected_output', and 'metadata' keys, or a DatasetItem from Langfuse datasets.
  • output: The actual output produced by the task function for this item. Can be any type depending on what your task function returns.
  • evaluations: List of evaluation results for this item. Each evaluation contains a name, value, optional comment, and optional metadata.
  • trace_id: Optional Langfuse trace ID for this item's execution. Used to link the experiment result with the detailed trace in Langfuse UI.
  • dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset. None for local experiments.
Examples:

Accessing item result data:

result = langfuse.run_experiment(...)
for item_result in result.item_results:
    print(f"Input: {item_result.item}")
    print(f"Output: {item_result.output}")
    print(f"Trace: {item_result.trace_id}")

    # Access evaluations
    for evaluation in item_result.evaluations:
        print(f"{evaluation.name}: {evaluation.value}")

Working with different item types:

# Local experiment item (dict)
if isinstance(item_result.item, dict):
    input_data = item_result.item["input"]
    expected = item_result.item.get("expected_output")

# Langfuse dataset item (object with attributes)
else:
    input_data = item_result.item.input
    expected = item_result.item.expected_output
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

ExperimentItemResult( *, item: Union[LocalExperimentItem, langfuse.api.DatasetItem], output: Any, evaluations: List[Evaluation], trace_id: Optional[str], dataset_run_id: Optional[str])
265    def __init__(
266        self,
267        *,
268        item: ExperimentItem,
269        output: Any,
270        evaluations: List[Evaluation],
271        trace_id: Optional[str],
272        dataset_run_id: Optional[str],
273    ):
274        """Initialize an ExperimentItemResult with the provided data.
275
276        Args:
277            item: The original experiment item that was processed.
278            output: The actual output produced by the task function for this item.
279            evaluations: List of evaluation results for this item.
280            trace_id: Optional Langfuse trace ID for this item's execution.
281            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
282
283        Note:
284            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
285        """
286        self.item = item
287        self.output = output
288        self.evaluations = evaluations
289        self.trace_id = trace_id
290        self.dataset_run_id = dataset_run_id

Initialize an ExperimentItemResult with the provided data.

Arguments:
  • item: The original experiment item that was processed.
  • output: The actual output produced by the task function for this item.
  • evaluations: List of evaluation results for this item.
  • trace_id: Optional Langfuse trace ID for this item's execution.
  • dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

item
output
evaluations
trace_id
dataset_run_id
class ExperimentResult:
293class ExperimentResult:
294    """Complete result structure for experiment execution.
295
296    This class encapsulates the complete results of running an experiment on a dataset,
297    including individual item results, aggregate run-level evaluations, and metadata
298    about the experiment execution.
299
300    Attributes:
301        name: The name of the experiment as specified during execution.
302        run_name: The name of the current experiment run.
303        description: Optional description of the experiment's purpose or methodology.
304        item_results: List of results from processing each individual dataset item,
305            containing the original item, task output, evaluations, and trace information.
306        run_evaluations: List of aggregate evaluation results computed across all items,
307            such as average scores, statistical summaries, or cross-item analyses.
308        experiment_id: ID of the experiment run propagated across all items. For
309            Langfuse datasets, this matches the dataset run ID. For local experiments,
310            this is a stable SDK-generated identifier for the run.
311        dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
312        dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
313
314    Examples:
315        Basic usage with local dataset:
316        ```python
317        result = langfuse.run_experiment(
318            name="Capital Cities Test",
319            data=local_data,
320            task=generate_capital,
321            evaluators=[accuracy_check]
322        )
323
324        print(f"Processed {len(result.item_results)} items")
325        print(result.format())  # Human-readable summary
326
327        # Access individual results
328        for item_result in result.item_results:
329            print(f"Input: {item_result.item}")
330            print(f"Output: {item_result.output}")
331            print(f"Scores: {item_result.evaluations}")
332        ```
333
334        Usage with Langfuse datasets:
335        ```python
336        dataset = langfuse.get_dataset("qa-eval-set")
337        result = dataset.run_experiment(
338            name="GPT-4 QA Evaluation",
339            task=answer_question,
340            evaluators=[relevance_check, accuracy_check]
341        )
342
343        # View in Langfuse UI
344        if result.dataset_run_url:
345            print(f"View detailed results: {result.dataset_run_url}")
346        ```
347
348        Formatted output:
349        ```python
350        # Get summary view
351        summary = result.format()
352        print(summary)
353
354        # Get detailed view with individual items
355        detailed = result.format(include_item_results=True)
356        with open("experiment_report.txt", "w") as f:
357            f.write(detailed)
358        ```
359    """
360
361    def __init__(
362        self,
363        *,
364        name: str,
365        run_name: str,
366        description: Optional[str],
367        item_results: List[ExperimentItemResult],
368        run_evaluations: List[Evaluation],
369        experiment_id: str,
370        dataset_run_id: Optional[str] = None,
371        dataset_run_url: Optional[str] = None,
372    ):
373        """Initialize an ExperimentResult with the provided data.
374
375        Args:
376            name: The name of the experiment.
377            run_name: The current experiment run name.
378            description: Optional description of the experiment.
379            item_results: List of results from processing individual dataset items.
380            run_evaluations: List of aggregate evaluation results for the entire run.
381            experiment_id: ID of the experiment run.
382            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
383            dataset_run_url: Optional URL to view results in Langfuse UI.
384        """
385        self.name = name
386        self.run_name = run_name
387        self.description = description
388        self.item_results = item_results
389        self.run_evaluations = run_evaluations
390        self.experiment_id = experiment_id
391        self.dataset_run_id = dataset_run_id
392        self.dataset_run_url = dataset_run_url
393
394    def format(self, *, include_item_results: bool = False) -> str:
395        r"""Format the experiment result for human-readable display.
396
397        Converts the experiment result into a nicely formatted string suitable for
398        console output, logging, or reporting. The output includes experiment overview,
399        aggregate statistics, and optionally individual item details.
400
401        This method provides a comprehensive view of experiment performance including:
402        - Experiment metadata (name, description, item count)
403        - List of evaluation metrics used across items
404        - Average scores computed across all processed items
405        - Run-level evaluation results (aggregate metrics)
406        - Links to view detailed results in Langfuse UI (when available)
407        - Individual item details (when requested)
408
409        Args:
410            include_item_results: Whether to include detailed results for each individual
411                item in the formatted output. When False (default), only shows aggregate
412                statistics and summary information. When True, includes input/output/scores
413                for every processed item, making the output significantly longer but more
414                detailed for debugging and analysis purposes.
415
416        Returns:
417            A formatted multi-line string containing:
418            - Experiment name and description (if provided)
419            - Total number of items successfully processed
420            - List of all evaluation metrics that were applied
421            - Average scores across all items for each numeric metric
422            - Run-level evaluation results with comments
423            - Dataset run URL for viewing in Langfuse UI (if applicable)
424            - Individual item details including inputs, outputs, and scores (if requested)
425
426        Examples:
427            Basic usage showing aggregate results only:
428            ```python
429            result = langfuse.run_experiment(
430                name="Capital Cities",
431                data=dataset,
432                task=generate_capital,
433                evaluators=[accuracy_evaluator]
434            )
435
436            print(result.format())
437            # Output:
438            # ──────────────────────────────────────────────────
439            # 🧪 Experiment: Capital Cities
440            # 100 items
441            # Evaluations:
442            #   • accuracy
443            # Average Scores:
444            #   • accuracy: 0.850
445            ```
446
447            Detailed output including all individual item results:
448            ```python
449            detailed_report = result.format(include_item_results=True)
450            print(detailed_report)
451            # Output includes each item:
452            # 1. Item 1:
453            #    Input:    What is the capital of France?
454            #    Expected: Paris
455            #    Actual:   The capital of France is Paris.
456            #    Scores:
457            #      • accuracy: 1.000
458            #        💭 Correct answer found
459            # [... continues for all items ...]
460            ```
461
462            Saving formatted results to file for reporting:
463            ```python
464            with open("experiment_report.txt", "w") as f:
465                f.write(result.format(include_item_results=True))
466
467            # Or create summary report
468            summary = result.format()  # Aggregate view only
469            print(f"Experiment Summary:\n{summary}")
470            ```
471
472            Integration with logging systems:
473            ```python
474            import logging
475            logger = logging.getLogger("experiments")
476
477            # Log summary after experiment
478            logger.info(f"Experiment completed:\n{result.format()}")
479
480            # Log detailed results for failed experiments
481            if any(evaluation.value < threshold for evaluation in result.run_evaluations):
482                logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
483            ```
484        """
485        if not self.item_results:
486            return "No experiment results to display."
487
488        output = ""
489
490        # Individual results section
491        if include_item_results:
492            for i, result in enumerate(self.item_results):
493                output += f"\n{i + 1}. Item {i + 1}:\n"
494
495                # Extract and display input
496                item_input = None
497                if isinstance(result.item, dict):
498                    item_input = result.item.get("input")
499                elif hasattr(result.item, "input"):
500                    item_input = result.item.input
501
502                if item_input is not None:
503                    output += f"   Input:    {_format_value(item_input)}\n"
504
505                # Extract and display expected output
506                expected_output = None
507                if isinstance(result.item, dict):
508                    expected_output = result.item.get("expected_output")
509                elif hasattr(result.item, "expected_output"):
510                    expected_output = result.item.expected_output
511
512                if expected_output is not None:
513                    output += f"   Expected: {_format_value(expected_output)}\n"
514                output += f"   Actual:   {_format_value(result.output)}\n"
515
516                # Display evaluation scores
517                if result.evaluations:
518                    output += "   Scores:\n"
519                    for evaluation in result.evaluations:
520                        score = evaluation.value
521                        if isinstance(score, (int, float)):
522                            score = f"{score:.3f}"
523                        output += f"     • {evaluation.name}: {score}"
524                        if evaluation.comment:
525                            output += f"\n       💭 {evaluation.comment}"
526                        output += "\n"
527
528                # Display trace link if available
529                if result.trace_id:
530                    output += f"\n   Trace ID: {result.trace_id}\n"
531        else:
532            output += f"Individual Results: Hidden ({len(self.item_results)} items)\n"
533            output += "💡 Set include_item_results=True to view them\n"
534
535        # Experiment overview section
536        output += f"\n{'─' * 50}\n"
537        output += f"🧪 Experiment: {self.name}"
538        output += f"\n📋 Run name: {self.run_name}"
539        if self.description:
540            output += f" - {self.description}"
541
542        output += f"\n{len(self.item_results)} items"
543
544        # Collect unique evaluation names across all items
545        evaluation_names = set()
546        for result in self.item_results:
547            for evaluation in result.evaluations:
548                evaluation_names.add(evaluation.name)
549
550        if evaluation_names:
551            output += "\nEvaluations:"
552            for eval_name in evaluation_names:
553                output += f"\n  • {eval_name}"
554            output += "\n"
555
556        # Calculate and display average scores
557        if evaluation_names:
558            output += "\nAverage Scores:"
559            for eval_name in evaluation_names:
560                scores = []
561                for result in self.item_results:
562                    for evaluation in result.evaluations:
563                        if evaluation.name == eval_name and isinstance(
564                            evaluation.value, (int, float)
565                        ):
566                            scores.append(evaluation.value)
567
568                if scores:
569                    avg = sum(scores) / len(scores)
570                    output += f"\n  • {eval_name}: {avg:.3f}"
571            output += "\n"
572
573        # Display run-level evaluations
574        if self.run_evaluations:
575            output += "\nRun Evaluations:"
576            for run_eval in self.run_evaluations:
577                score = run_eval.value
578                if isinstance(score, (int, float)):
579                    score = f"{score:.3f}"
580                output += f"\n  • {run_eval.name}: {score}"
581                if run_eval.comment:
582                    output += f"\n    💭 {run_eval.comment}"
583            output += "\n"
584
585        # Add dataset run URL if available
586        if self.dataset_run_url:
587            output += f"\n🔗 Dataset Run:\n   {self.dataset_run_url}"
588
589        return output

Complete result structure for experiment execution.

This class encapsulates the complete results of running an experiment on a dataset, including individual item results, aggregate run-level evaluations, and metadata about the experiment execution.

Attributes:
  • name: The name of the experiment as specified during execution.
  • run_name: The name of the current experiment run.
  • description: Optional description of the experiment's purpose or methodology.
  • item_results: List of results from processing each individual dataset item, containing the original item, task output, evaluations, and trace information.
  • run_evaluations: List of aggregate evaluation results computed across all items, such as average scores, statistical summaries, or cross-item analyses.
  • experiment_id: ID of the experiment run propagated across all items. For Langfuse datasets, this matches the dataset run ID. For local experiments, this is a stable SDK-generated identifier for the run.
  • dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
  • dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
Examples:

Basic usage with local dataset:

result = langfuse.run_experiment(
    name="Capital Cities Test",
    data=local_data,
    task=generate_capital,
    evaluators=[accuracy_check]
)

print(f"Processed {len(result.item_results)} items")
print(result.format())  # Human-readable summary

# Access individual results
for item_result in result.item_results:
    print(f"Input: {item_result.item}")
    print(f"Output: {item_result.output}")
    print(f"Scores: {item_result.evaluations}")

Usage with Langfuse datasets:

dataset = langfuse.get_dataset("qa-eval-set")
result = dataset.run_experiment(
    name="GPT-4 QA Evaluation",
    task=answer_question,
    evaluators=[relevance_check, accuracy_check]
)

# View in Langfuse UI
if result.dataset_run_url:
    print(f"View detailed results: {result.dataset_run_url}")

Formatted output:

# Get summary view
summary = result.format()
print(summary)

# Get detailed view with individual items
detailed = result.format(include_item_results=True)
with open("experiment_report.txt", "w") as f:
    f.write(detailed)
ExperimentResult( *, name: str, run_name: str, description: Optional[str], item_results: List[ExperimentItemResult], run_evaluations: List[Evaluation], experiment_id: str, dataset_run_id: Optional[str] = None, dataset_run_url: Optional[str] = None)
361    def __init__(
362        self,
363        *,
364        name: str,
365        run_name: str,
366        description: Optional[str],
367        item_results: List[ExperimentItemResult],
368        run_evaluations: List[Evaluation],
369        experiment_id: str,
370        dataset_run_id: Optional[str] = None,
371        dataset_run_url: Optional[str] = None,
372    ):
373        """Initialize an ExperimentResult with the provided data.
374
375        Args:
376            name: The name of the experiment.
377            run_name: The current experiment run name.
378            description: Optional description of the experiment.
379            item_results: List of results from processing individual dataset items.
380            run_evaluations: List of aggregate evaluation results for the entire run.
381            experiment_id: ID of the experiment run.
382            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
383            dataset_run_url: Optional URL to view results in Langfuse UI.
384        """
385        self.name = name
386        self.run_name = run_name
387        self.description = description
388        self.item_results = item_results
389        self.run_evaluations = run_evaluations
390        self.experiment_id = experiment_id
391        self.dataset_run_id = dataset_run_id
392        self.dataset_run_url = dataset_run_url

Initialize an ExperimentResult with the provided data.

Arguments:
  • name: The name of the experiment.
  • run_name: The current experiment run name.
  • description: Optional description of the experiment.
  • item_results: List of results from processing individual dataset items.
  • run_evaluations: List of aggregate evaluation results for the entire run.
  • experiment_id: ID of the experiment run.
  • dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
  • dataset_run_url: Optional URL to view results in Langfuse UI.
name
run_name
description
item_results
run_evaluations
experiment_id
dataset_run_id
dataset_run_url
def format(self, *, include_item_results: bool = False) -> str:
394    def format(self, *, include_item_results: bool = False) -> str:
395        r"""Format the experiment result for human-readable display.
396
397        Converts the experiment result into a nicely formatted string suitable for
398        console output, logging, or reporting. The output includes experiment overview,
399        aggregate statistics, and optionally individual item details.
400
401        This method provides a comprehensive view of experiment performance including:
402        - Experiment metadata (name, description, item count)
403        - List of evaluation metrics used across items
404        - Average scores computed across all processed items
405        - Run-level evaluation results (aggregate metrics)
406        - Links to view detailed results in Langfuse UI (when available)
407        - Individual item details (when requested)
408
409        Args:
410            include_item_results: Whether to include detailed results for each individual
411                item in the formatted output. When False (default), only shows aggregate
412                statistics and summary information. When True, includes input/output/scores
413                for every processed item, making the output significantly longer but more
414                detailed for debugging and analysis purposes.
415
416        Returns:
417            A formatted multi-line string containing:
418            - Experiment name and description (if provided)
419            - Total number of items successfully processed
420            - List of all evaluation metrics that were applied
421            - Average scores across all items for each numeric metric
422            - Run-level evaluation results with comments
423            - Dataset run URL for viewing in Langfuse UI (if applicable)
424            - Individual item details including inputs, outputs, and scores (if requested)
425
426        Examples:
427            Basic usage showing aggregate results only:
428            ```python
429            result = langfuse.run_experiment(
430                name="Capital Cities",
431                data=dataset,
432                task=generate_capital,
433                evaluators=[accuracy_evaluator]
434            )
435
436            print(result.format())
437            # Output:
438            # ──────────────────────────────────────────────────
439            # 🧪 Experiment: Capital Cities
440            # 100 items
441            # Evaluations:
442            #   • accuracy
443            # Average Scores:
444            #   • accuracy: 0.850
445            ```
446
447            Detailed output including all individual item results:
448            ```python
449            detailed_report = result.format(include_item_results=True)
450            print(detailed_report)
451            # Output includes each item:
452            # 1. Item 1:
453            #    Input:    What is the capital of France?
454            #    Expected: Paris
455            #    Actual:   The capital of France is Paris.
456            #    Scores:
457            #      • accuracy: 1.000
458            #        💭 Correct answer found
459            # [... continues for all items ...]
460            ```
461
462            Saving formatted results to file for reporting:
463            ```python
464            with open("experiment_report.txt", "w") as f:
465                f.write(result.format(include_item_results=True))
466
467            # Or create summary report
468            summary = result.format()  # Aggregate view only
469            print(f"Experiment Summary:\n{summary}")
470            ```
471
472            Integration with logging systems:
473            ```python
474            import logging
475            logger = logging.getLogger("experiments")
476
477            # Log summary after experiment
478            logger.info(f"Experiment completed:\n{result.format()}")
479
480            # Log detailed results for failed experiments
481            if any(evaluation.value < threshold for evaluation in result.run_evaluations):
482                logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
483            ```
484        """
485        if not self.item_results:
486            return "No experiment results to display."
487
488        output = ""
489
490        # Individual results section
491        if include_item_results:
492            for i, result in enumerate(self.item_results):
493                output += f"\n{i + 1}. Item {i + 1}:\n"
494
495                # Extract and display input
496                item_input = None
497                if isinstance(result.item, dict):
498                    item_input = result.item.get("input")
499                elif hasattr(result.item, "input"):
500                    item_input = result.item.input
501
502                if item_input is not None:
503                    output += f"   Input:    {_format_value(item_input)}\n"
504
505                # Extract and display expected output
506                expected_output = None
507                if isinstance(result.item, dict):
508                    expected_output = result.item.get("expected_output")
509                elif hasattr(result.item, "expected_output"):
510                    expected_output = result.item.expected_output
511
512                if expected_output is not None:
513                    output += f"   Expected: {_format_value(expected_output)}\n"
514                output += f"   Actual:   {_format_value(result.output)}\n"
515
516                # Display evaluation scores
517                if result.evaluations:
518                    output += "   Scores:\n"
519                    for evaluation in result.evaluations:
520                        score = evaluation.value
521                        if isinstance(score, (int, float)):
522                            score = f"{score:.3f}"
523                        output += f"     • {evaluation.name}: {score}"
524                        if evaluation.comment:
525                            output += f"\n       💭 {evaluation.comment}"
526                        output += "\n"
527
528                # Display trace link if available
529                if result.trace_id:
530                    output += f"\n   Trace ID: {result.trace_id}\n"
531        else:
532            output += f"Individual Results: Hidden ({len(self.item_results)} items)\n"
533            output += "💡 Set include_item_results=True to view them\n"
534
535        # Experiment overview section
536        output += f"\n{'─' * 50}\n"
537        output += f"🧪 Experiment: {self.name}"
538        output += f"\n📋 Run name: {self.run_name}"
539        if self.description:
540            output += f" - {self.description}"
541
542        output += f"\n{len(self.item_results)} items"
543
544        # Collect unique evaluation names across all items
545        evaluation_names = set()
546        for result in self.item_results:
547            for evaluation in result.evaluations:
548                evaluation_names.add(evaluation.name)
549
550        if evaluation_names:
551            output += "\nEvaluations:"
552            for eval_name in evaluation_names:
553                output += f"\n  • {eval_name}"
554            output += "\n"
555
556        # Calculate and display average scores
557        if evaluation_names:
558            output += "\nAverage Scores:"
559            for eval_name in evaluation_names:
560                scores = []
561                for result in self.item_results:
562                    for evaluation in result.evaluations:
563                        if evaluation.name == eval_name and isinstance(
564                            evaluation.value, (int, float)
565                        ):
566                            scores.append(evaluation.value)
567
568                if scores:
569                    avg = sum(scores) / len(scores)
570                    output += f"\n  • {eval_name}: {avg:.3f}"
571            output += "\n"
572
573        # Display run-level evaluations
574        if self.run_evaluations:
575            output += "\nRun Evaluations:"
576            for run_eval in self.run_evaluations:
577                score = run_eval.value
578                if isinstance(score, (int, float)):
579                    score = f"{score:.3f}"
580                output += f"\n  • {run_eval.name}: {score}"
581                if run_eval.comment:
582                    output += f"\n    💭 {run_eval.comment}"
583            output += "\n"
584
585        # Add dataset run URL if available
586        if self.dataset_run_url:
587            output += f"\n🔗 Dataset Run:\n   {self.dataset_run_url}"
588
589        return output

Format the experiment result for human-readable display.

Converts the experiment result into a nicely formatted string suitable for console output, logging, or reporting. The output includes experiment overview, aggregate statistics, and optionally individual item details.

This method provides a comprehensive view of experiment performance including:

  • Experiment metadata (name, description, item count)
  • List of evaluation metrics used across items
  • Average scores computed across all processed items
  • Run-level evaluation results (aggregate metrics)
  • Links to view detailed results in Langfuse UI (when available)
  • Individual item details (when requested)
Arguments:
  • include_item_results: Whether to include detailed results for each individual item in the formatted output. When False (default), only shows aggregate statistics and summary information. When True, includes input/output/scores for every processed item, making the output significantly longer but more detailed for debugging and analysis purposes.
Returns:

A formatted multi-line string containing:

  • Experiment name and description (if provided)
  • Total number of items successfully processed
  • List of all evaluation metrics that were applied
  • Average scores across all items for each numeric metric
  • Run-level evaluation results with comments
  • Dataset run URL for viewing in Langfuse UI (if applicable)
  • Individual item details including inputs, outputs, and scores (if requested)
Examples:

Basic usage showing aggregate results only:

result = langfuse.run_experiment(
    name="Capital Cities",
    data=dataset,
    task=generate_capital,
    evaluators=[accuracy_evaluator]
)

print(result.format())
# Output:
# ──────────────────────────────────────────────────
# 🧪 Experiment: Capital Cities
# 100 items
# Evaluations:
#   • accuracy
# Average Scores:
#   • accuracy: 0.850

Detailed output including all individual item results:

detailed_report = result.format(include_item_results=True)
print(detailed_report)
# Output includes each item:
# 1. Item 1:
#    Input:    What is the capital of France?
#    Expected: Paris
#    Actual:   The capital of France is Paris.
#    Scores:
#      • accuracy: 1.000
#        💭 Correct answer found
# [... continues for all items ...]

Saving formatted results to file for reporting:

with open("experiment_report.txt", "w") as f:
    f.write(result.format(include_item_results=True))

# Or create summary report
summary = result.format()  # Aggregate view only
print(f"Experiment Summary:\n{summary}")

Integration with logging systems:

import logging
logger = logging.getLogger("experiments")

# Log summary after experiment
logger.info(f"Experiment completed:\n{result.format()}")

# Log detailed results for failed experiments
if any(e.value < threshold for e in result.run_evaluations):
    logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
class TaskFunction(typing.Protocol):
class TaskFunction(Protocol):
    """Protocol defining the interface for experiment task functions.

    Task functions are the core processing functions that operate on each item
    in an experiment dataset. They receive an experiment item as input and
    produce some output that will be evaluated.

    Task functions must:
    - Accept 'item' as a keyword argument
    - Return any type of output (will be passed to evaluators)
    - Can be either synchronous or asynchronous
    - Should handle their own errors gracefully (exceptions will be logged)
    """

    def __call__(
        self,
        *,
        item: ExperimentItem,
        # Per PEP 484 the annotation on **kwargs describes the type of each
        # extra keyword VALUE, so `Any` is correct here (not Dict[str, Any],
        # which would require every extra argument to itself be a dict).
        **kwargs: Any,
    ) -> Union[Any, Awaitable[Any]]:
        """Execute the task on an experiment item.

        This method defines the core processing logic for each item in your
        experiment, such as text generation, classification, summarization, etc.

        Args:
            item: The experiment item to process. Can be either:
                - Dict with keys like 'input', 'expected_output', 'metadata'
                - Langfuse DatasetItem object with .input, .expected_output attributes
            **kwargs: Additional keyword arguments that may be passed by the framework.

        Returns:
            Any: The output of processing the item. This output will be:
            - Stored in the experiment results
            - Passed to all item-level evaluators for assessment
            - Traced automatically in Langfuse for observability

            Can return either a direct value or an awaitable (async) result.

        Examples:
            Simple synchronous task:
            ```python
            def my_task(*, item, **kwargs):
                prompt = f"Summarize: {item['input']}"
                return my_llm_client.generate(prompt)
            ```

            Async task with error handling:
            ```python
            async def my_async_task(*, item, **kwargs):
                try:
                    response = await openai_client.chat.completions.create(
                        model="gpt-4",
                        messages=[{"role": "user", "content": item["input"]}]
                    )
                    return response.choices[0].message.content
                except Exception as e:
                    print(f"Task failed for item {item}: {e}")
                    return "Error: Could not process item"
            ```

            Task using dataset item attributes:
            ```python
            def classification_task(*, item, **kwargs):
                # Works with both dict items and DatasetItem objects
                text = item["input"] if isinstance(item, dict) else item.input
                return classify_text(text)
            ```
        """
        ...

Protocol defining the interface for experiment task functions.

Task functions are the core processing functions that operate on each item in an experiment dataset. They receive an experiment item as input and produce some output that will be evaluated.

Task functions must:

  • Accept 'item' as a keyword argument
  • Return any type of output (will be passed to evaluators)
  • Can be either synchronous or asynchronous
  • Should handle their own errors gracefully (exceptions will be logged)
TaskFunction(*args, **kwargs)
def _no_init_or_replace_init(self, *args, **kwargs):
    # NOTE(review): this is CPython's typing-module internal `__init__`
    # replacement for Protocol subclasses, rendered into this documentation
    # page by the doc generator; it is not defined in langfuse.experiment.
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    cls.__init__(self, *args, **kwargs)
class EvaluatorFunction(typing.Protocol):
class EvaluatorFunction(Protocol):
    """Protocol defining the interface for item-level evaluator functions.

    Item-level evaluators assess the quality, correctness, or other properties
    of individual task outputs. They receive the input, output, expected output,
    and metadata for each item and return evaluation metrics.

    Evaluators should:
    - Accept input, output, expected_output, and metadata as keyword arguments
    - Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields
    - Be deterministic when possible for reproducible results
    - Handle edge cases gracefully (missing expected output, malformed data, etc.)
    - Can be either synchronous or asynchronous
    """

    def __call__(
        self,
        *,
        input: Any,
        output: Any,
        expected_output: Any,
        metadata: Optional[Dict[str, Any]],
        # Per PEP 484 the annotation on **kwargs is the per-value type, so
        # `Any` is correct (Dict[str, Any] would type each extra argument
        # as itself being a dict).
        **kwargs: Any,
    ) -> Union[
        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
    ]:
        r"""Evaluate a task output for quality, correctness, or other metrics.

        This method should implement specific evaluation logic such as accuracy
        checking, similarity measurement, toxicity detection, fluency assessment, etc.

        Args:
            input: The original input that was passed to the task function.
            output: The output produced by the task function for this input.
            expected_output: The expected/ground truth output for comparison.
                May be None if not available in the dataset; evaluators should
                handle this case appropriately.
            metadata: Optional metadata from the experiment item that might
                contain additional context for evaluation (categories, difficulty, etc.)
            **kwargs: Additional keyword arguments that may be passed by the framework.

        Returns:
            Evaluation results in one of these formats:
            - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."}
            - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}]
            - Awaitable returning either of the above (for async evaluators)

            Each Evaluation dict should contain:
            - name (str): Unique identifier for this evaluation metric
            - value (int|float|str|bool): The evaluation score or result
            - comment (str, optional): Human-readable explanation of the result
            - metadata (dict, optional): Additional structured data about the evaluation

        Examples:
            Simple accuracy evaluator:
            ```python
            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
                if expected_output is None:
                    return {"name": "accuracy", "value": 0, "comment": "No expected output"}

                is_correct = output.strip().lower() == expected_output.strip().lower()
                return {
                    "name": "accuracy",
                    "value": 1.0 if is_correct else 0.0,
                    "comment": "Exact match" if is_correct else "No match"
                }
            ```

            Multi-metric evaluator:
            ```python
            def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
                results = []
                results.append({
                    "name": "output_length",
                    "value": len(output),
                    "comment": f"Output contains {len(output)} characters"
                })
                sentiment_score = analyze_sentiment(output)
                results.append({
                    "name": "sentiment",
                    "value": sentiment_score,
                    "comment": f"Sentiment score: {sentiment_score:.2f}"
                })
                return results
            ```

            Async evaluator using external API:
            ```python
            async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs):
                prompt = f"Rate the quality of this response on a scale of 1-10:\n"
                prompt += f"Question: {input}\nResponse: {output}"

                response = await openai_client.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}]
                )

                try:
                    score = float(response.choices[0].message.content.strip())
                    return {
                        "name": "llm_judge_quality",
                        "value": score,
                        "comment": f"LLM judge rated this {score}/10"
                    }
                except ValueError:
                    return {
                        "name": "llm_judge_quality",
                        "value": 0,
                        "comment": "Could not parse LLM judge score"
                    }
            ```

            Context-aware evaluator:
            ```python
            def context_evaluator(*, input, output, metadata=None, **kwargs):
                difficulty = metadata.get("difficulty", "medium") if metadata else "medium"
                min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty]
                meets_requirement = len(output) >= min_length
                return {
                    "name": f"meets_{difficulty}_requirement",
                    "value": meets_requirement,
                    "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement"
                }
            ```
        """
        ...

Protocol defining the interface for item-level evaluator functions.

Item-level evaluators assess the quality, correctness, or other properties of individual task outputs. They receive the input, output, expected output, and metadata for each item and return evaluation metrics.

Evaluators should:

  • Accept input, output, expected_output, and metadata as keyword arguments
  • Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields
  • Be deterministic when possible for reproducible results
  • Handle edge cases gracefully (missing expected output, malformed data, etc.)
  • Can be either synchronous or asynchronous
EvaluatorFunction(*args, **kwargs)
def _no_init_or_replace_init(self, *args, **kwargs):
    # NOTE(review): this is CPython's typing-module internal `__init__`
    # replacement for Protocol subclasses, rendered into this documentation
    # page by the doc generator; it is not defined in langfuse.experiment.
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    cls.__init__(self, *args, **kwargs)
class RunEvaluatorFunction(typing.Protocol):
class RunEvaluatorFunction(Protocol):
    """Protocol defining the interface for run-level evaluator functions.

    Run-level evaluators assess aggregate properties of the entire experiment run,
    computing metrics that span across all items rather than individual outputs.
    They receive the complete results from all processed items and can compute
    statistics like averages, distributions, correlations, or other aggregate metrics.

    Run evaluators should:
    - Accept item_results as a keyword argument containing all item results
    - Return Evaluation dict(s) with aggregate metrics
    - Handle cases where some items may have failed processing
    - Compute meaningful statistics across the dataset
    - Can be either synchronous or asynchronous
    """

    def __call__(
        self,
        *,
        item_results: List[ExperimentItemResult],
        # Per PEP 484 the annotation on **kwargs is the per-value type, so
        # `Any` is correct (Dict[str, Any] would type each extra argument
        # as itself being a dict).
        **kwargs: Any,
    ) -> Union[
        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
    ]:
        r"""Evaluate the entire experiment run with aggregate metrics.

        This method should implement aggregate evaluation logic such as computing
        averages, calculating distributions, finding correlations, detecting patterns
        across items, or performing statistical analysis on the experiment results.

        Args:
            item_results: List of results from all successfully processed experiment
                items. Each item result contains:
                - item: The original experiment item
                - output: The task function's output for this item
                - evaluations: List of item-level evaluation results
                - trace_id: Langfuse trace ID for this execution
                - dataset_run_id: Dataset run ID (if using Langfuse datasets)

                Note: This list only includes items that were successfully processed.
                Failed items are excluded but logged separately.
            **kwargs: Additional keyword arguments that may be passed by the framework.

        Returns:
            Evaluation results in one of these formats:
            - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."}
            - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}]
            - Awaitable returning either of the above (for async evaluators)

            Each Evaluation dict should contain:
            - name (str): Unique identifier for this run-level metric
            - value (int|float|str|bool): The aggregate evaluation result
            - comment (str, optional): Human-readable explanation of the metric
            - metadata (dict, optional): Additional structured data about the evaluation

        Examples:
            Average accuracy calculator:
            ```python
            def average_accuracy(*, item_results, **kwargs):
                if not item_results:
                    return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"}

                accuracy_values = []
                for result in item_results:
                    for evaluation in result.evaluations:
                        if evaluation.name == "accuracy":
                            accuracy_values.append(evaluation.value)

                if not accuracy_values:
                    return {"name": "avg_accuracy", "value": 0, "comment": "No accuracy evaluations found"}

                avg = sum(accuracy_values) / len(accuracy_values)
                return {
                    "name": "avg_accuracy",
                    "value": avg,
                    "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}"
                }
            ```

            Multiple aggregate metrics:
            ```python
            def statistical_summary(*, item_results, **kwargs):
                if not item_results:
                    return []

                results = []
                lengths = [len(str(result.output)) for result in item_results]
                results.extend([
                    {"name": "avg_output_length", "value": sum(lengths) / len(lengths)},
                    {"name": "min_output_length", "value": min(lengths)},
                    {"name": "max_output_length", "value": max(lengths)}
                ])

                total_items = len(item_results)  # Only successful items are included
                results.append({
                    "name": "processing_success_rate",
                    "value": 1.0,  # All items in item_results succeeded
                    "comment": f"Successfully processed {total_items} items"
                })

                return results
            ```

            Async run evaluator with external analysis:
            ```python
            async def llm_batch_analysis(*, item_results, **kwargs):
                outputs = [result.output for result in item_results]
                prompt = f"Analyze these {len(outputs)} outputs for common themes:\n"
                prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs))

                response = await openai_client.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}]
                )

                return {
                    "name": "thematic_analysis",
                    "value": response.choices[0].message.content,
                    "comment": f"LLM analysis of {len(outputs)} outputs"
                }
            ```

            Performance distribution analysis:
            ```python
            def performance_distribution(*, item_results, **kwargs):
                all_scores = []
                score_by_metric = {}

                for result in item_results:
                    for evaluation in result.evaluations:
                        metric_name = evaluation.name
                        value = evaluation.value

                        if isinstance(value, (int, float)):
                            all_scores.append(value)
                            if metric_name not in score_by_metric:
                                score_by_metric[metric_name] = []
                            score_by_metric[metric_name].append(value)

                results = []

                if all_scores:
                    import statistics
                    results.append({
                        "name": "score_std_dev",
                        "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0,
                        "comment": f"Standard deviation across all numeric scores"
                    })

                for metric, scores in score_by_metric.items():
                    if len(scores) > 1:
                        results.append({
                            "name": f"{metric}_variance",
                            "value": statistics.variance(scores),
                            "comment": f"Variance in {metric} across {len(scores)} items"
                        })

                return results
            ```
        """
        ...

Protocol defining the interface for run-level evaluator functions.

Run-level evaluators assess aggregate properties of the entire experiment run, computing metrics that span across all items rather than individual outputs. They receive the complete results from all processed items and can compute statistics like averages, distributions, correlations, or other aggregate metrics.

Run evaluators should:

  • Accept item_results as a keyword argument containing all item results
  • Return Evaluation dict(s) with aggregate metrics
  • Handle cases where some items may have failed processing
  • Compute meaningful statistics across the dataset
  • Can be either synchronous or asynchronous
RunEvaluatorFunction(*args, **kwargs)
def _no_init_or_replace_init(self, *args, **kwargs):
    # NOTE(review): this is CPython's typing-module internal `__init__`
    # replacement for Protocol subclasses, rendered into this documentation
    # page by the doc generator; it is not defined in langfuse.experiment.
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    cls.__init__(self, *args, **kwargs)
def create_evaluator_from_autoevals( autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]]) -> EvaluatorFunction:
def create_evaluator_from_autoevals(
    autoevals_evaluator: Any, **kwargs: Any
) -> EvaluatorFunction:
    """Create a Langfuse evaluator from an autoevals evaluator.

    Wraps an autoevals scorer so it conforms to the EvaluatorFunction protocol:
    autoevals takes the ground truth under the keyword ``expected`` where
    Langfuse uses ``expected_output``, and returns an object with ``.name``,
    ``.score`` and ``.metadata`` attributes that are mapped onto a Langfuse
    ``Evaluation``.

    Note: per PEP 484 the annotation on ``**kwargs`` describes the type of
    each extra keyword value, hence ``Any`` (not ``Optional[Dict[str, Any]]``).

    Args:
        autoevals_evaluator: An autoevals evaluator instance (callable with
            ``input``, ``output`` and ``expected`` keyword arguments).
        **kwargs: Additional arguments forwarded to every invocation of the
            autoevals evaluator.

    Returns:
        A Langfuse-compatible evaluator function.
    """

    def langfuse_evaluator(
        *,
        input: Any,
        output: Any,
        expected_output: Any,
        metadata: Optional[Dict[str, Any]],
        **langfuse_kwargs: Any,
    ) -> Evaluation:
        # autoevals expects the ground truth under the `expected` keyword.
        evaluation = autoevals_evaluator(
            input=input, output=output, expected=expected_output, **kwargs
        )

        return Evaluation(
            name=evaluation.name,
            value=evaluation.score,
            # autoevals carries any human-readable explanation inside its
            # metadata dict; guard against metadata being None.
            comment=(evaluation.metadata or {}).get("comment"),
            metadata=evaluation.metadata,
        )

    return langfuse_evaluator

Create a Langfuse evaluator from an autoevals evaluator.

Arguments:
  • autoevals_evaluator: An autoevals evaluator instance
  • **kwargs: Additional arguments passed to the evaluator
Returns:

A Langfuse-compatible evaluator function