langfuse.experiment

Langfuse experiment functionality for running and evaluating tasks on datasets.

This module provides the core experiment functionality for the Langfuse Python SDK, allowing users to run experiments on datasets with automatic tracing, evaluation, and result formatting.

   1"""Langfuse experiment functionality for running and evaluating tasks on datasets.
   2
   3This module provides the core experiment functionality for the Langfuse Python SDK,
   4allowing users to run experiments on datasets with automatic tracing, evaluation,
   5and result formatting.
   6"""
   7
   8import asyncio
   9import logging
  10from typing import (
  11    Any,
  12    Awaitable,
  13    Dict,
  14    List,
  15    Optional,
  16    Protocol,
  17    TypedDict,
  18    Union,
  19)
  20
  21from langfuse.api import DatasetItem, ScoreDataType
  22
  23
  24class LocalExperimentItem(TypedDict, total=False):
  25    """Structure for local experiment data items (not from Langfuse datasets).
  26
  27    This TypedDict defines the structure for experiment items when using local data
  28    rather than Langfuse-hosted datasets. All fields are optional to provide
  29    flexibility in data structure.
  30
  31    Attributes:
  32        input: The input data to pass to the task function. Can be any type that
  33            your task function can process (string, dict, list, etc.). This is
  34            typically the prompt, question, or data that your task will operate on.
  35        expected_output: Optional expected/ground truth output for evaluation purposes.
  36            Used by evaluators to assess correctness or quality. Can be None if
  37            no ground truth is available.
  38        metadata: Optional metadata dictionary containing additional context about
  39            this specific item. Can include information like difficulty level,
  40            category, source, or any other relevant attributes that evaluators
  41            might use for context-aware evaluation.
  42
  43    Examples:
  44        Simple text processing item:
  45        ```python
  46        item: LocalExperimentItem = {
  47            "input": "Summarize this article: ...",
  48            "expected_output": "Expected summary...",
  49            "metadata": {"difficulty": "medium", "category": "news"}
  50        }
  51        ```
  52
  53        Classification item:
  54        ```python
  55        item: LocalExperimentItem = {
  56            "input": {"text": "This movie is great!", "context": "movie review"},
  57            "expected_output": "positive",
  58            "metadata": {"dataset_source": "imdb", "confidence": 0.95}
  59        }
  60        ```
  61
  62        Minimal item with only input:
  63        ```python
  64        item: LocalExperimentItem = {
  65            "input": "What is the capital of France?"
  66        }
  67        ```
  68    """
  69
  70    input: Any
  71    expected_output: Any
  72    metadata: Optional[Dict[str, Any]]
  73
  74
ExperimentItem = Union[LocalExperimentItem, DatasetItem]
"""Type alias for a single item processed in an experiment.

Either:
- LocalExperimentItem: dict-like item with optional 'input', 'expected_output',
  and 'metadata' keys (local, in-memory data)
- DatasetItem: item from a Langfuse dataset, exposing .input, .expected_output,
  and .metadata attributes
"""

ExperimentData = Union[List[LocalExperimentItem], List[DatasetItem]]
"""Type alias for the collection of items an experiment runs over.

Either:
- List[LocalExperimentItem]: local data items as dictionaries
- List[DatasetItem]: items from a Langfuse dataset (typically dataset.items)
"""
  90
  91
  92class Evaluation:
  93    """Represents an evaluation result for an experiment item or an entire experiment run.
  94
  95    This class provides a strongly-typed way to create evaluation results in evaluator functions.
  96    Users must use keyword arguments when instantiating this class.
  97
  98    Attributes:
  99        name: Unique identifier for the evaluation metric. Should be descriptive
 100            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
 101            Used for aggregation and comparison across experiment runs.
 102        value: The evaluation score or result. Can be:
 103            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
 104            - String: For categorical results like "positive", "negative", "neutral"
 105            - Boolean: For binary assessments like "passes_safety_check"
 106        comment: Optional human-readable explanation of the evaluation result.
 107            Useful for providing context, explaining scoring rationale, or noting
 108            special conditions. Displayed in Langfuse UI for interpretability.
 109        metadata: Optional structured metadata about the evaluation process.
 110            Can include confidence scores, intermediate calculations, model versions,
 111            or any other relevant technical details.
 112        data_type: Optional score data type. Required if value is not NUMERIC.
 113            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
 114        config_id: Optional Langfuse score config ID.
 115
 116    Examples:
 117        Basic accuracy evaluation:
 118        ```python
 119        from langfuse import Evaluation
 120
 121        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
 122            if not expected_output:
 123                return Evaluation(name="accuracy", value=0, comment="No expected output")
 124
 125            is_correct = output.strip().lower() == expected_output.strip().lower()
 126            return Evaluation(
 127                name="accuracy",
 128                value=1.0 if is_correct else 0.0,
 129                comment="Correct answer" if is_correct else "Incorrect answer"
 130            )
 131        ```
 132
 133        Multi-metric evaluator:
 134        ```python
 135        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
 136            return [
 137                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
 138                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
 139                Evaluation(
 140                    name="quality",
 141                    value=0.85,
 142                    comment="High quality response",
 143                    metadata={"confidence": 0.92, "model": "gpt-4"}
 144                )
 145            ]
 146        ```
 147
 148        Categorical evaluation:
 149        ```python
 150        def sentiment_evaluator(*, input, output, **kwargs):
 151            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
 152            return Evaluation(
 153                name="sentiment",
 154                value=sentiment,
 155                comment=f"Response expresses {sentiment} sentiment",
 156                data_type="CATEGORICAL"
 157            )
 158        ```
 159
 160        Failed evaluation with error handling:
 161        ```python
 162        def external_api_evaluator(*, input, output, **kwargs):
 163            try:
 164                score = external_api.evaluate(output)
 165                return Evaluation(name="external_score", value=score)
 166            except Exception as e:
 167                return Evaluation(
 168                    name="external_score",
 169                    value=0,
 170                    comment=f"API unavailable: {e}",
 171                    metadata={"error": str(e), "retry_count": 3}
 172                )
 173        ```
 174
 175    Note:
 176        All arguments must be passed as keywords. Positional arguments are not allowed
 177        to ensure code clarity and prevent errors from argument reordering.
 178    """
 179
 180    def __init__(
 181        self,
 182        *,
 183        name: str,
 184        value: Union[int, float, str, bool],
 185        comment: Optional[str] = None,
 186        metadata: Optional[Dict[str, Any]] = None,
 187        data_type: Optional[ScoreDataType] = None,
 188        config_id: Optional[str] = None,
 189    ):
 190        """Initialize an Evaluation with the provided data.
 191
 192        Args:
 193            name: Unique identifier for the evaluation metric.
 194            value: The evaluation score or result.
 195            comment: Optional human-readable explanation of the result.
 196            metadata: Optional structured metadata about the evaluation process.
 197            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
 198            config_id: Optional Langfuse score config ID.
 199
 200        Note:
 201            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
 202        """
 203        self.name = name
 204        self.value = value
 205        self.comment = comment
 206        self.metadata = metadata
 207        self.data_type = data_type
 208        self.config_id = config_id
 209
 210
 211class ExperimentItemResult:
 212    """Result structure for individual experiment items.
 213
 214    This class represents the complete result of processing a single item
 215    during an experiment run, including the original input, task output,
 216    evaluations, and tracing information. Users must use keyword arguments when instantiating this class.
 217
 218    Attributes:
 219        item: The original experiment item that was processed. Can be either
 220            a dictionary with 'input', 'expected_output', and 'metadata' keys,
 221            or a DatasetItem from Langfuse datasets.
 222        output: The actual output produced by the task function for this item.
 223            Can be any type depending on what your task function returns.
 224        evaluations: List of evaluation results for this item. Each evaluation
 225            contains a name, value, optional comment, and optional metadata.
 226        trace_id: Optional Langfuse trace ID for this item's execution. Used
 227            to link the experiment result with the detailed trace in Langfuse UI.
 228        dataset_run_id: Optional dataset run ID if this item was part of a
 229            Langfuse dataset. None for local experiments.
 230
 231    Examples:
 232        Accessing item result data:
 233        ```python
 234        result = langfuse.run_experiment(...)
 235        for item_result in result.item_results:
 236            print(f"Input: {item_result.item}")
 237            print(f"Output: {item_result.output}")
 238            print(f"Trace: {item_result.trace_id}")
 239
 240            # Access evaluations
 241            for evaluation in item_result.evaluations:
 242                print(f"{evaluation.name}: {evaluation.value}")
 243        ```
 244
 245        Working with different item types:
 246        ```python
 247        # Local experiment item (dict)
 248        if isinstance(item_result.item, dict):
 249            input_data = item_result.item["input"]
 250            expected = item_result.item.get("expected_output")
 251
 252        # Langfuse dataset item (object with attributes)
 253        else:
 254            input_data = item_result.item.input
 255            expected = item_result.item.expected_output
 256        ```
 257
 258    Note:
 259        All arguments must be passed as keywords. Positional arguments are not allowed
 260        to ensure code clarity and prevent errors from argument reordering.
 261    """
 262
 263    def __init__(
 264        self,
 265        *,
 266        item: ExperimentItem,
 267        output: Any,
 268        evaluations: List[Evaluation],
 269        trace_id: Optional[str],
 270        dataset_run_id: Optional[str],
 271    ):
 272        """Initialize an ExperimentItemResult with the provided data.
 273
 274        Args:
 275            item: The original experiment item that was processed.
 276            output: The actual output produced by the task function for this item.
 277            evaluations: List of evaluation results for this item.
 278            trace_id: Optional Langfuse trace ID for this item's execution.
 279            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
 280
 281        Note:
 282            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
 283        """
 284        self.item = item
 285        self.output = output
 286        self.evaluations = evaluations
 287        self.trace_id = trace_id
 288        self.dataset_run_id = dataset_run_id
 289
 290
 291class ExperimentResult:
 292    """Complete result structure for experiment execution.
 293
 294    This class encapsulates the complete results of running an experiment on a dataset,
 295    including individual item results, aggregate run-level evaluations, and metadata
 296    about the experiment execution.
 297
 298    Attributes:
 299        name: The name of the experiment as specified during execution.
 300        run_name: The name of the current experiment run.
 301        description: Optional description of the experiment's purpose or methodology.
 302        item_results: List of results from processing each individual dataset item,
 303            containing the original item, task output, evaluations, and trace information.
 304        run_evaluations: List of aggregate evaluation results computed across all items,
 305            such as average scores, statistical summaries, or cross-item analyses.
 306        dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
 307        dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
 308
 309    Examples:
 310        Basic usage with local dataset:
 311        ```python
 312        result = langfuse.run_experiment(
 313            name="Capital Cities Test",
 314            data=local_data,
 315            task=generate_capital,
 316            evaluators=[accuracy_check]
 317        )
 318
 319        print(f"Processed {len(result.item_results)} items")
 320        print(result.format())  # Human-readable summary
 321
 322        # Access individual results
 323        for item_result in result.item_results:
 324            print(f"Input: {item_result.item}")
 325            print(f"Output: {item_result.output}")
 326            print(f"Scores: {item_result.evaluations}")
 327        ```
 328
 329        Usage with Langfuse datasets:
 330        ```python
 331        dataset = langfuse.get_dataset("qa-eval-set")
 332        result = dataset.run_experiment(
 333            name="GPT-4 QA Evaluation",
 334            task=answer_question,
 335            evaluators=[relevance_check, accuracy_check]
 336        )
 337
 338        # View in Langfuse UI
 339        if result.dataset_run_url:
 340            print(f"View detailed results: {result.dataset_run_url}")
 341        ```
 342
 343        Formatted output:
 344        ```python
 345        # Get summary view
 346        summary = result.format()
 347        print(summary)
 348
 349        # Get detailed view with individual items
 350        detailed = result.format(include_item_results=True)
 351        with open("experiment_report.txt", "w") as f:
 352            f.write(detailed)
 353        ```
 354    """
 355
 356    def __init__(
 357        self,
 358        *,
 359        name: str,
 360        run_name: str,
 361        description: Optional[str],
 362        item_results: List[ExperimentItemResult],
 363        run_evaluations: List[Evaluation],
 364        dataset_run_id: Optional[str] = None,
 365        dataset_run_url: Optional[str] = None,
 366    ):
 367        """Initialize an ExperimentResult with the provided data.
 368
 369        Args:
 370            name: The name of the experiment.
 371            run_name: The current experiment run name.
 372            description: Optional description of the experiment.
 373            item_results: List of results from processing individual dataset items.
 374            run_evaluations: List of aggregate evaluation results for the entire run.
 375            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
 376            dataset_run_url: Optional URL to view results in Langfuse UI.
 377        """
 378        self.name = name
 379        self.run_name = run_name
 380        self.description = description
 381        self.item_results = item_results
 382        self.run_evaluations = run_evaluations
 383        self.dataset_run_id = dataset_run_id
 384        self.dataset_run_url = dataset_run_url
 385
 386    def format(self, *, include_item_results: bool = False) -> str:
 387        r"""Format the experiment result for human-readable display.
 388
 389        Converts the experiment result into a nicely formatted string suitable for
 390        console output, logging, or reporting. The output includes experiment overview,
 391        aggregate statistics, and optionally individual item details.
 392
 393        This method provides a comprehensive view of experiment performance including:
 394        - Experiment metadata (name, description, item count)
 395        - List of evaluation metrics used across items
 396        - Average scores computed across all processed items
 397        - Run-level evaluation results (aggregate metrics)
 398        - Links to view detailed results in Langfuse UI (when available)
 399        - Individual item details (when requested)
 400
 401        Args:
 402            include_item_results: Whether to include detailed results for each individual
 403                item in the formatted output. When False (default), only shows aggregate
 404                statistics and summary information. When True, includes input/output/scores
 405                for every processed item, making the output significantly longer but more
 406                detailed for debugging and analysis purposes.
 407
 408        Returns:
 409            A formatted multi-line string containing:
 410            - Experiment name and description (if provided)
 411            - Total number of items successfully processed
 412            - List of all evaluation metrics that were applied
 413            - Average scores across all items for each numeric metric
 414            - Run-level evaluation results with comments
 415            - Dataset run URL for viewing in Langfuse UI (if applicable)
 416            - Individual item details including inputs, outputs, and scores (if requested)
 417
 418        Examples:
 419            Basic usage showing aggregate results only:
 420            ```python
 421            result = langfuse.run_experiment(
 422                name="Capital Cities",
 423                data=dataset,
 424                task=generate_capital,
 425                evaluators=[accuracy_evaluator]
 426            )
 427
 428            print(result.format())
 429            # Output:
 430            # ──────────────────────────────────────────────────
 431            # 📊 Capital Cities
 432            # 100 items
 433            # Evaluations:
 434            #   • accuracy
 435            # Average Scores:
 436            #   • accuracy: 0.850
 437            ```
 438
 439            Detailed output including all individual item results:
 440            ```python
 441            detailed_report = result.format(include_item_results=True)
 442            print(detailed_report)
 443            # Output includes each item:
 444            # 1. Item 1:
 445            #    Input:    What is the capital of France?
 446            #    Expected: Paris
 447            #    Actual:   The capital of France is Paris.
 448            #    Scores:
 449            #      • accuracy: 1.000
 450            #        💭 Correct answer found
 451            # [... continues for all items ...]
 452            ```
 453
 454            Saving formatted results to file for reporting:
 455            ```python
 456            with open("experiment_report.txt", "w") as f:
 457                f.write(result.format(include_item_results=True))
 458
 459            # Or create summary report
 460            summary = result.format()  # Aggregate view only
 461            print(f"Experiment Summary:\n{summary}")
 462            ```
 463
 464            Integration with logging systems:
 465            ```python
 466            import logging
 467            logger = logging.getLogger("experiments")
 468
 469            # Log summary after experiment
 470            logger.info(f"Experiment completed:\n{result.format()}")
 471
 472            # Log detailed results for failed experiments
 473            if any(eval['value'] < threshold for eval in result.run_evaluations):
 474                logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
 475            ```
 476        """
 477        if not self.item_results:
 478            return "No experiment results to display."
 479
 480        output = ""
 481
 482        # Individual results section
 483        if include_item_results:
 484            for i, result in enumerate(self.item_results):
 485                output += f"\n{i + 1}. Item {i + 1}:\n"
 486
 487                # Extract and display input
 488                item_input = None
 489                if isinstance(result.item, dict):
 490                    item_input = result.item.get("input")
 491                elif hasattr(result.item, "input"):
 492                    item_input = result.item.input
 493
 494                if item_input is not None:
 495                    output += f"   Input:    {_format_value(item_input)}\n"
 496
 497                # Extract and display expected output
 498                expected_output = None
 499                if isinstance(result.item, dict):
 500                    expected_output = result.item.get("expected_output")
 501                elif hasattr(result.item, "expected_output"):
 502                    expected_output = result.item.expected_output
 503
 504                if expected_output is not None:
 505                    output += f"   Expected: {_format_value(expected_output)}\n"
 506                output += f"   Actual:   {_format_value(result.output)}\n"
 507
 508                # Display evaluation scores
 509                if result.evaluations:
 510                    output += "   Scores:\n"
 511                    for evaluation in result.evaluations:
 512                        score = evaluation.value
 513                        if isinstance(score, (int, float)):
 514                            score = f"{score:.3f}"
 515                        output += f"     • {evaluation.name}: {score}"
 516                        if evaluation.comment:
 517                            output += f"\n       💭 {evaluation.comment}"
 518                        output += "\n"
 519
 520                # Display trace link if available
 521                if result.trace_id:
 522                    output += f"\n   Trace ID: {result.trace_id}\n"
 523        else:
 524            output += f"Individual Results: Hidden ({len(self.item_results)} items)\n"
 525            output += "💡 Set include_item_results=True to view them\n"
 526
 527        # Experiment overview section
 528        output += f"\n{'─' * 50}\n"
 529        output += f"🧪 Experiment: {self.name}"
 530        output += f"\n📋 Run name: {self.run_name}"
 531        if self.description:
 532            output += f" - {self.description}"
 533
 534        output += f"\n{len(self.item_results)} items"
 535
 536        # Collect unique evaluation names across all items
 537        evaluation_names = set()
 538        for result in self.item_results:
 539            for evaluation in result.evaluations:
 540                evaluation_names.add(evaluation.name)
 541
 542        if evaluation_names:
 543            output += "\nEvaluations:"
 544            for eval_name in evaluation_names:
 545                output += f"\n  • {eval_name}"
 546            output += "\n"
 547
 548        # Calculate and display average scores
 549        if evaluation_names:
 550            output += "\nAverage Scores:"
 551            for eval_name in evaluation_names:
 552                scores = []
 553                for result in self.item_results:
 554                    for evaluation in result.evaluations:
 555                        if evaluation.name == eval_name and isinstance(
 556                            evaluation.value, (int, float)
 557                        ):
 558                            scores.append(evaluation.value)
 559
 560                if scores:
 561                    avg = sum(scores) / len(scores)
 562                    output += f"\n  • {eval_name}: {avg:.3f}"
 563            output += "\n"
 564
 565        # Display run-level evaluations
 566        if self.run_evaluations:
 567            output += "\nRun Evaluations:"
 568            for run_eval in self.run_evaluations:
 569                score = run_eval.value
 570                if isinstance(score, (int, float)):
 571                    score = f"{score:.3f}"
 572                output += f"\n  • {run_eval.name}: {score}"
 573                if run_eval.comment:
 574                    output += f"\n    💭 {run_eval.comment}"
 575            output += "\n"
 576
 577        # Add dataset run URL if available
 578        if self.dataset_run_url:
 579            output += f"\n🔗 Dataset Run:\n   {self.dataset_run_url}"
 580
 581        return output
 582
 583
 584class TaskFunction(Protocol):
 585    """Protocol defining the interface for experiment task functions.
 586
 587    Task functions are the core processing functions that operate on each item
 588    in an experiment dataset. They receive an experiment item as input and
 589    produce some output that will be evaluated.
 590
 591    Task functions must:
 592    - Accept 'item' as a keyword argument
 593    - Return any type of output (will be passed to evaluators)
 594    - Can be either synchronous or asynchronous
 595    - Should handle their own errors gracefully (exceptions will be logged)
 596    """
 597
 598    def __call__(
 599        self,
 600        *,
 601        item: ExperimentItem,
 602        **kwargs: Dict[str, Any],
 603    ) -> Union[Any, Awaitable[Any]]:
 604        """Execute the task on an experiment item.
 605
 606        This method defines the core processing logic for each item in your experiment.
 607        The implementation should focus on the specific task you want to evaluate,
 608        such as text generation, classification, summarization, etc.
 609
 610        Args:
 611            item: The experiment item to process. Can be either:
 612                - Dict with keys like 'input', 'expected_output', 'metadata'
 613                - Langfuse DatasetItem object with .input, .expected_output attributes
 614            **kwargs: Additional keyword arguments that may be passed by the framework
 615
 616        Returns:
 617            Any: The output of processing the item. This output will be:
 618            - Stored in the experiment results
 619            - Passed to all item-level evaluators for assessment
 620            - Traced automatically in Langfuse for observability
 621
 622            Can return either a direct value or an awaitable (async) result.
 623
 624        Examples:
 625            Simple synchronous task:
 626            ```python
 627            def my_task(*, item, **kwargs):
 628                prompt = f"Summarize: {item['input']}"
 629                return my_llm_client.generate(prompt)
 630            ```
 631
 632            Async task with error handling:
 633            ```python
 634            async def my_async_task(*, item, **kwargs):
 635                try:
 636                    response = await openai_client.chat.completions.create(
 637                        model="gpt-4",
 638                        messages=[{"role": "user", "content": item["input"]}]
 639                    )
 640                    return response.choices[0].message.content
 641                except Exception as e:
 642                    # Log error and return fallback
 643                    print(f"Task failed for item {item}: {e}")
 644                    return "Error: Could not process item"
 645            ```
 646
 647            Task using dataset item attributes:
 648            ```python
 649            def classification_task(*, item, **kwargs):
 650                # Works with both dict items and DatasetItem objects
 651                text = item["input"] if isinstance(item, dict) else item.input
 652                return classify_text(text)
 653            ```
 654        """
 655        ...
 656
 657
 658class EvaluatorFunction(Protocol):
 659    """Protocol defining the interface for item-level evaluator functions.
 660
 661    Item-level evaluators assess the quality, correctness, or other properties
 662    of individual task outputs. They receive the input, output, expected output,
 663    and metadata for each item and return evaluation metrics.
 664
 665    Evaluators should:
 666    - Accept input, output, expected_output, and metadata as keyword arguments
 667    - Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields
 668    - Be deterministic when possible for reproducible results
 669    - Handle edge cases gracefully (missing expected output, malformed data, etc.)
 670    - Can be either synchronous or asynchronous
 671    """
 672
 673    def __call__(
 674        self,
 675        *,
 676        input: Any,
 677        output: Any,
 678        expected_output: Any,
 679        metadata: Optional[Dict[str, Any]],
 680        **kwargs: Dict[str, Any],
 681    ) -> Union[
 682        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
 683    ]:
 684        r"""Evaluate a task output for quality, correctness, or other metrics.
 685
 686        This method should implement specific evaluation logic such as accuracy checking,
 687        similarity measurement, toxicity detection, fluency assessment, etc.
 688
 689        Args:
 690            input: The original input that was passed to the task function.
 691                This is typically the item['input'] or item.input value.
 692            output: The output produced by the task function for this input.
 693                This is the direct return value from your task function.
 694            expected_output: The expected/ground truth output for comparison.
 695                May be None if not available in the dataset. Evaluators should
 696                handle this case appropriately.
 697            metadata: Optional metadata from the experiment item that might
 698                contain additional context for evaluation (categories, difficulty, etc.)
 699            **kwargs: Additional keyword arguments that may be passed by the framework
 700
 701        Returns:
 702            Evaluation results in one of these formats:
 703            - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."}
 704            - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}]
 705            - Awaitable returning either of the above (for async evaluators)
 706
 707            Each Evaluation dict should contain:
 708            - name (str): Unique identifier for this evaluation metric
 709            - value (int|float|str|bool): The evaluation score or result
 710            - comment (str, optional): Human-readable explanation of the result
 711            - metadata (dict, optional): Additional structured data about the evaluation
 712
 713        Examples:
 714            Simple accuracy evaluator:
 715            ```python
 716            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
 717                if expected_output is None:
 718                    return {"name": "accuracy", "value": 0, "comment": "No expected output"}
 719
 720                is_correct = output.strip().lower() == expected_output.strip().lower()
 721                return {
 722                    "name": "accuracy",
 723                    "value": 1.0 if is_correct else 0.0,
 724                    "comment": "Exact match" if is_correct else "No match"
 725                }
 726            ```
 727
 728            Multi-metric evaluator:
 729            ```python
 730            def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
 731                results = []
 732
 733                # Length check
 734                results.append({
 735                    "name": "output_length",
 736                    "value": len(output),
 737                    "comment": f"Output contains {len(output)} characters"
 738                })
 739
 740                # Sentiment analysis
 741                sentiment_score = analyze_sentiment(output)
 742                results.append({
 743                    "name": "sentiment",
 744                    "value": sentiment_score,
 745                    "comment": f"Sentiment score: {sentiment_score:.2f}"
 746                })
 747
 748                return results
 749            ```
 750
 751            Async evaluator using external API:
 752            ```python
 753            async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs):
 754                prompt = f"Rate the quality of this response on a scale of 1-10:\n"
 755                prompt += f"Question: {input}\nResponse: {output}"
 756
 757                response = await openai_client.chat.completions.create(
 758                    model="gpt-4",
 759                    messages=[{"role": "user", "content": prompt}]
 760                )
 761
 762                try:
 763                    score = float(response.choices[0].message.content.strip())
 764                    return {
 765                        "name": "llm_judge_quality",
 766                        "value": score,
 767                        "comment": f"LLM judge rated this {score}/10"
 768                    }
 769                except ValueError:
 770                    return {
 771                        "name": "llm_judge_quality",
 772                        "value": 0,
 773                        "comment": "Could not parse LLM judge score"
 774                    }
 775            ```
 776
 777            Context-aware evaluator:
 778            ```python
 779            def context_evaluator(*, input, output, metadata=None, **kwargs):
 780                # Use metadata for context-specific evaluation
 781                difficulty = metadata.get("difficulty", "medium") if metadata else "medium"
 782
 783                # Adjust expectations based on difficulty
 784                min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty]
 785
 786                meets_requirement = len(output) >= min_length
 787                return {
 788                    "name": f"meets_{difficulty}_requirement",
 789                    "value": meets_requirement,
 790                    "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement"
 791                }
 792            ```
 793        """
 794        ...
 795
 796
class RunEvaluatorFunction(Protocol):
    """Protocol defining the interface for run-level evaluator functions.

    Run-level evaluators assess aggregate properties of the entire experiment run,
    computing metrics that span across all items rather than individual outputs.
    They receive the complete results from all processed items and can compute
    statistics like averages, distributions, correlations, or other aggregate metrics.

    Run evaluators should:
    - Accept item_results as a keyword argument containing all item results
    - Return Evaluation dict(s) with aggregate metrics
    - Handle cases where some items may have failed processing
    - Compute meaningful statistics across the dataset
    - Can be either synchronous or asynchronous
    """

    def __call__(
        self,
        *,
        item_results: List[ExperimentItemResult],
        **kwargs: Dict[str, Any],
    ) -> Union[
        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
    ]:
        r"""Evaluate the entire experiment run with aggregate metrics.

        This method should implement aggregate evaluation logic such as computing
        averages, calculating distributions, finding correlations, detecting patterns
        across items, or performing statistical analysis on the experiment results.

        Args:
            item_results: List of results from all successfully processed experiment items.
                Each item result contains:
                - item: The original experiment item
                - output: The task function's output for this item
                - evaluations: List of item-level evaluation results
                - trace_id: Langfuse trace ID for this execution
                - dataset_run_id: Dataset run ID (if using Langfuse datasets)

                Note: This list only includes items that were successfully processed.
                Failed items are excluded but logged separately.
            **kwargs: Additional keyword arguments that may be passed by the framework

        Returns:
            Evaluation results in one of these formats:
            - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."}
            - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}]
            - Awaitable returning either of the above (for async evaluators)

            Each Evaluation dict should contain:
            - name (str): Unique identifier for this run-level metric
            - value (int|float|str|bool): The aggregate evaluation result
            - comment (str, optional): Human-readable explanation of the metric
            - metadata (dict, optional): Additional structured data about the evaluation

        Examples:
            Average accuracy calculator:
            ```python
            def average_accuracy(*, item_results, **kwargs):
                if not item_results:
                    return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"}

                accuracy_values = []
                for result in item_results:
                    for evaluation in result.evaluations:
                        if evaluation.name == "accuracy":
                            accuracy_values.append(evaluation.value)

                if not accuracy_values:
                    return {"name": "avg_accuracy", "value": 0, "comment": "No accuracy evaluations found"}

                avg = sum(accuracy_values) / len(accuracy_values)
                return {
                    "name": "avg_accuracy",
                    "value": avg,
                    "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}"
                }
            ```

            Multiple aggregate metrics:
            ```python
            def statistical_summary(*, item_results, **kwargs):
                if not item_results:
                    return []

                results = []

                # Calculate output length statistics
                lengths = [len(str(result.output)) for result in item_results]
                results.extend([
                    {"name": "avg_output_length", "value": sum(lengths) / len(lengths)},
                    {"name": "min_output_length", "value": min(lengths)},
                    {"name": "max_output_length", "value": max(lengths)}
                ])

                # Success rate
                total_items = len(item_results)  # Only successful items are included
                results.append({
                    "name": "processing_success_rate",
                    "value": 1.0,  # All items in item_results succeeded
                    "comment": f"Successfully processed {total_items} items"
                })

                return results
            ```

            Async run evaluator with external analysis:
            ```python
            async def llm_batch_analysis(*, item_results, **kwargs):
                # Prepare batch analysis prompt
                outputs = [result.output for result in item_results]
                prompt = f"Analyze these {len(outputs)} outputs for common themes:\n"
                prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs))

                response = await openai_client.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}]
                )

                return {
                    "name": "thematic_analysis",
                    "value": response.choices[0].message.content,
                    "comment": f"LLM analysis of {len(outputs)} outputs"
                }
            ```

            Performance distribution analysis:
            ```python
            import statistics

            def performance_distribution(*, item_results, **kwargs):
                # Extract all evaluation scores
                all_scores = []
                score_by_metric = {}

                for result in item_results:
                    for evaluation in result.evaluations:
                        metric_name = evaluation.name
                        value = evaluation.value

                        if isinstance(value, (int, float)):
                            all_scores.append(value)
                            if metric_name not in score_by_metric:
                                score_by_metric[metric_name] = []
                            score_by_metric[metric_name].append(value)

                results = []

                # Overall score distribution
                if all_scores:
                    results.append({
                        "name": "score_std_dev",
                        "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0,
                        "comment": "Standard deviation across all numeric scores"
                    })

                # Per-metric statistics
                for metric, scores in score_by_metric.items():
                    if len(scores) > 1:
                        results.append({
                            "name": f"{metric}_variance",
                            "value": statistics.variance(scores),
                            "comment": f"Variance in {metric} across {len(scores)} items"
                        })

                return results
            ```
        """
        ...
 965
 966
 967def _format_value(value: Any) -> str:
 968    """Format a value for display."""
 969    if isinstance(value, str):
 970        return value[:50] + "..." if len(value) > 50 else value
 971    return str(value)
 972
 973
 974async def _run_evaluator(
 975    evaluator: Union[EvaluatorFunction, RunEvaluatorFunction], **kwargs: Any
 976) -> List[Evaluation]:
 977    """Run an evaluator function and normalize the result."""
 978    try:
 979        result = evaluator(**kwargs)
 980
 981        # Handle async evaluators
 982        if asyncio.iscoroutine(result):
 983            result = await result
 984
 985        # Normalize to list
 986        if isinstance(result, (dict, Evaluation)):
 987            return [result]  # type: ignore
 988
 989        elif isinstance(result, list):
 990            return result
 991
 992        else:
 993            return []
 994
 995    except Exception as e:
 996        evaluator_name = getattr(evaluator, "__name__", "unknown_evaluator")
 997        logging.getLogger("langfuse").error(f"Evaluator {evaluator_name} failed: {e}")
 998        return []
 999
1000
1001async def _run_task(task: TaskFunction, item: ExperimentItem) -> Any:
1002    """Run a task function and handle sync/async."""
1003    result = task(item=item)
1004
1005    # Handle async tasks
1006    if asyncio.iscoroutine(result):
1007        result = await result
1008
1009    return result
1010
1011
def create_evaluator_from_autoevals(
    autoevals_evaluator: Any, **kwargs: Dict[str, Any]
) -> EvaluatorFunction:
    """Create a Langfuse evaluator from an autoevals evaluator.

    Args:
        autoevals_evaluator: An autoevals evaluator instance
        **kwargs: Additional arguments passed to the evaluator on every call

    Returns:
        A Langfuse-compatible evaluator function
    """

    def langfuse_evaluator(
        *,
        input: Any,
        output: Any,
        expected_output: Any,
        metadata: Optional[Dict[str, Any]],
        **langfuse_kwargs: Dict[str, Any],
    ) -> Evaluation:
        # ``metadata`` and ``langfuse_kwargs`` are accepted to satisfy the
        # EvaluatorFunction interface but are not forwarded to autoevals,
        # which uses ``expected`` for the ground-truth argument.
        evaluation = autoevals_evaluator(
            input=input, output=output, expected=expected_output, **kwargs
        )

        # Translate the autoevals result (name/score/metadata attributes) into
        # a Langfuse Evaluation; a human-readable comment, if present, is
        # expected under the "comment" key of the result metadata.
        return Evaluation(
            name=evaluation.name,
            value=evaluation.score,
            comment=(evaluation.metadata or {}).get("comment"),
            metadata=evaluation.metadata,
        )

    return langfuse_evaluator
class LocalExperimentItem(typing.TypedDict, total=False):
25class LocalExperimentItem(TypedDict, total=False):
26    """Structure for local experiment data items (not from Langfuse datasets).
27
28    This TypedDict defines the structure for experiment items when using local data
29    rather than Langfuse-hosted datasets. All fields are optional to provide
30    flexibility in data structure.
31
32    Attributes:
33        input: The input data to pass to the task function. Can be any type that
34            your task function can process (string, dict, list, etc.). This is
35            typically the prompt, question, or data that your task will operate on.
36        expected_output: Optional expected/ground truth output for evaluation purposes.
37            Used by evaluators to assess correctness or quality. Can be None if
38            no ground truth is available.
39        metadata: Optional metadata dictionary containing additional context about
40            this specific item. Can include information like difficulty level,
41            category, source, or any other relevant attributes that evaluators
42            might use for context-aware evaluation.
43
44    Examples:
45        Simple text processing item:
46        ```python
47        item: LocalExperimentItem = {
48            "input": "Summarize this article: ...",
49            "expected_output": "Expected summary...",
50            "metadata": {"difficulty": "medium", "category": "news"}
51        }
52        ```
53
54        Classification item:
55        ```python
56        item: LocalExperimentItem = {
57            "input": {"text": "This movie is great!", "context": "movie review"},
58            "expected_output": "positive",
59            "metadata": {"dataset_source": "imdb", "confidence": 0.95}
60        }
61        ```
62
63        Minimal item with only input:
64        ```python
65        item: LocalExperimentItem = {
66            "input": "What is the capital of France?"
67        }
68        ```
69    """
70
71    input: Any
72    expected_output: Any
73    metadata: Optional[Dict[str, Any]]

Structure for local experiment data items (not from Langfuse datasets).

This TypedDict defines the structure for experiment items when using local data rather than Langfuse-hosted datasets. All fields are optional to provide flexibility in data structure.

Attributes:
  • input: The input data to pass to the task function. Can be any type that your task function can process (string, dict, list, etc.). This is typically the prompt, question, or data that your task will operate on.
  • expected_output: Optional expected/ground truth output for evaluation purposes. Used by evaluators to assess correctness or quality. Can be None if no ground truth is available.
  • metadata: Optional metadata dictionary containing additional context about this specific item. Can include information like difficulty level, category, source, or any other relevant attributes that evaluators might use for context-aware evaluation.
Examples:

Simple text processing item:

item: LocalExperimentItem = {
    "input": "Summarize this article: ...",
    "expected_output": "Expected summary...",
    "metadata": {"difficulty": "medium", "category": "news"}
}

Classification item:

item: LocalExperimentItem = {
    "input": {"text": "This movie is great!", "context": "movie review"},
    "expected_output": "positive",
    "metadata": {"dataset_source": "imdb", "confidence": 0.95}
}

Minimal item with only input:

item: LocalExperimentItem = {
    "input": "What is the capital of France?"
}
input: Any
expected_output: Any
metadata: Optional[Dict[str, Any]]
ExperimentItem = typing.Union[LocalExperimentItem, langfuse.api.DatasetItem]

Type alias for items that can be processed in experiments.

Can be either:

  • LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys
  • DatasetItem: Items from Langfuse datasets with .input, .expected_output, .metadata attributes
ExperimentData = typing.Union[typing.List[LocalExperimentItem], typing.List[langfuse.api.DatasetItem]]

Type alias for experiment datasets.

Represents the collection of items to process in an experiment. Can be either:

  • List[LocalExperimentItem]: Local data items as dictionaries
  • List[DatasetItem]: Items from a Langfuse dataset (typically from dataset.items)
class Evaluation:
 93class Evaluation:
 94    """Represents an evaluation result for an experiment item or an entire experiment run.
 95
 96    This class provides a strongly-typed way to create evaluation results in evaluator functions.
 97    Users must use keyword arguments when instantiating this class.
 98
 99    Attributes:
100        name: Unique identifier for the evaluation metric. Should be descriptive
101            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
102            Used for aggregation and comparison across experiment runs.
103        value: The evaluation score or result. Can be:
104            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
105            - String: For categorical results like "positive", "negative", "neutral"
106            - Boolean: For binary assessments like "passes_safety_check"
107        comment: Optional human-readable explanation of the evaluation result.
108            Useful for providing context, explaining scoring rationale, or noting
109            special conditions. Displayed in Langfuse UI for interpretability.
110        metadata: Optional structured metadata about the evaluation process.
111            Can include confidence scores, intermediate calculations, model versions,
112            or any other relevant technical details.
113        data_type: Optional score data type. Required if value is not NUMERIC.
114            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
115        config_id: Optional Langfuse score config ID.
116
117    Examples:
118        Basic accuracy evaluation:
119        ```python
120        from langfuse import Evaluation
121
122        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
123            if not expected_output:
124                return Evaluation(name="accuracy", value=0, comment="No expected output")
125
126            is_correct = output.strip().lower() == expected_output.strip().lower()
127            return Evaluation(
128                name="accuracy",
129                value=1.0 if is_correct else 0.0,
130                comment="Correct answer" if is_correct else "Incorrect answer"
131            )
132        ```
133
134        Multi-metric evaluator:
135        ```python
136        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
137            return [
138                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
139                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
140                Evaluation(
141                    name="quality",
142                    value=0.85,
143                    comment="High quality response",
144                    metadata={"confidence": 0.92, "model": "gpt-4"}
145                )
146            ]
147        ```
148
149        Categorical evaluation:
150        ```python
151        def sentiment_evaluator(*, input, output, **kwargs):
152            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
153            return Evaluation(
154                name="sentiment",
155                value=sentiment,
156                comment=f"Response expresses {sentiment} sentiment",
157                data_type="CATEGORICAL"
158            )
159        ```
160
161        Failed evaluation with error handling:
162        ```python
163        def external_api_evaluator(*, input, output, **kwargs):
164            try:
165                score = external_api.evaluate(output)
166                return Evaluation(name="external_score", value=score)
167            except Exception as e:
168                return Evaluation(
169                    name="external_score",
170                    value=0,
171                    comment=f"API unavailable: {e}",
172                    metadata={"error": str(e), "retry_count": 3}
173                )
174        ```
175
176    Note:
177        All arguments must be passed as keywords. Positional arguments are not allowed
178        to ensure code clarity and prevent errors from argument reordering.
179    """
180
181    def __init__(
182        self,
183        *,
184        name: str,
185        value: Union[int, float, str, bool],
186        comment: Optional[str] = None,
187        metadata: Optional[Dict[str, Any]] = None,
188        data_type: Optional[ScoreDataType] = None,
189        config_id: Optional[str] = None,
190    ):
191        """Initialize an Evaluation with the provided data.
192
193        Args:
194            name: Unique identifier for the evaluation metric.
195            value: The evaluation score or result.
196            comment: Optional human-readable explanation of the result.
197            metadata: Optional structured metadata about the evaluation process.
198            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
199            config_id: Optional Langfuse score config ID.
200
201        Note:
202            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
203        """
204        self.name = name
205        self.value = value
206        self.comment = comment
207        self.metadata = metadata
208        self.data_type = data_type
209        self.config_id = config_id

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=0, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=0,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[langfuse.api.ScoreDataType] = None, config_id: Optional[str] = None)
181    def __init__(
182        self,
183        *,
184        name: str,
185        value: Union[int, float, str, bool],
186        comment: Optional[str] = None,
187        metadata: Optional[Dict[str, Any]] = None,
188        data_type: Optional[ScoreDataType] = None,
189        config_id: Optional[str] = None,
190    ):
191        """Initialize an Evaluation with the provided data.
192
193        Args:
194            name: Unique identifier for the evaluation metric.
195            value: The evaluation score or result.
196            comment: Optional human-readable explanation of the result.
197            metadata: Optional structured metadata about the evaluation process.
198            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
199            config_id: Optional Langfuse score config ID.
200
201        Note:
202            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
203        """
204        self.name = name
205        self.value = value
206        self.comment = comment
207        self.metadata = metadata
208        self.data_type = data_type
209        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
class ExperimentItemResult:
212class ExperimentItemResult:
213    """Result structure for individual experiment items.
214
215    This class represents the complete result of processing a single item
216    during an experiment run, including the original input, task output,
217    evaluations, and tracing information. Users must use keyword arguments when instantiating this class.
218
219    Attributes:
220        item: The original experiment item that was processed. Can be either
221            a dictionary with 'input', 'expected_output', and 'metadata' keys,
222            or a DatasetItem from Langfuse datasets.
223        output: The actual output produced by the task function for this item.
224            Can be any type depending on what your task function returns.
225        evaluations: List of evaluation results for this item. Each evaluation
226            contains a name, value, optional comment, and optional metadata.
227        trace_id: Optional Langfuse trace ID for this item's execution. Used
228            to link the experiment result with the detailed trace in Langfuse UI.
229        dataset_run_id: Optional dataset run ID if this item was part of a
230            Langfuse dataset. None for local experiments.
231
232    Examples:
233        Accessing item result data:
234        ```python
235        result = langfuse.run_experiment(...)
236        for item_result in result.item_results:
237            print(f"Input: {item_result.item}")
238            print(f"Output: {item_result.output}")
239            print(f"Trace: {item_result.trace_id}")
240
241            # Access evaluations
242            for evaluation in item_result.evaluations:
243                print(f"{evaluation.name}: {evaluation.value}")
244        ```
245
246        Working with different item types:
247        ```python
248        # Local experiment item (dict)
249        if isinstance(item_result.item, dict):
250            input_data = item_result.item["input"]
251            expected = item_result.item.get("expected_output")
252
253        # Langfuse dataset item (object with attributes)
254        else:
255            input_data = item_result.item.input
256            expected = item_result.item.expected_output
257        ```
258
259    Note:
260        All arguments must be passed as keywords. Positional arguments are not allowed
261        to ensure code clarity and prevent errors from argument reordering.
262    """
263
264    def __init__(
265        self,
266        *,
267        item: ExperimentItem,
268        output: Any,
269        evaluations: List[Evaluation],
270        trace_id: Optional[str],
271        dataset_run_id: Optional[str],
272    ):
273        """Initialize an ExperimentItemResult with the provided data.
274
275        Args:
276            item: The original experiment item that was processed.
277            output: The actual output produced by the task function for this item.
278            evaluations: List of evaluation results for this item.
279            trace_id: Optional Langfuse trace ID for this item's execution.
280            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
281
282        Note:
283            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
284        """
285        self.item = item
286        self.output = output
287        self.evaluations = evaluations
288        self.trace_id = trace_id
289        self.dataset_run_id = dataset_run_id

Result structure for individual experiment items.

This class represents the complete result of processing a single item during an experiment run, including the original input, task output, evaluations, and tracing information. Users must use keyword arguments when instantiating this class.

Attributes:
  • item: The original experiment item that was processed. Can be either a dictionary with 'input', 'expected_output', and 'metadata' keys, or a DatasetItem from Langfuse datasets.
  • output: The actual output produced by the task function for this item. Can be any type depending on what your task function returns.
  • evaluations: List of evaluation results for this item. Each evaluation contains a name, value, optional comment, and optional metadata.
  • trace_id: Optional Langfuse trace ID for this item's execution. Used to link the experiment result with the detailed trace in Langfuse UI.
  • dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset. None for local experiments.
Examples:

Accessing item result data:

result = langfuse.run_experiment(...)
for item_result in result.item_results:
    print(f"Input: {item_result.item}")
    print(f"Output: {item_result.output}")
    print(f"Trace: {item_result.trace_id}")

    # Access evaluations
    for evaluation in item_result.evaluations:
        print(f"{evaluation.name}: {evaluation.value}")

Working with different item types:

# Local experiment item (dict)
if isinstance(item_result.item, dict):
    input_data = item_result.item["input"]
    expected = item_result.item.get("expected_output")

# Langfuse dataset item (object with attributes)
else:
    input_data = item_result.item.input
    expected = item_result.item.expected_output
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

ExperimentItemResult( *, item: Union[LocalExperimentItem, langfuse.api.DatasetItem], output: Any, evaluations: List[Evaluation], trace_id: Optional[str], dataset_run_id: Optional[str])
264    def __init__(
265        self,
266        *,
267        item: ExperimentItem,
268        output: Any,
269        evaluations: List[Evaluation],
270        trace_id: Optional[str],
271        dataset_run_id: Optional[str],
272    ):
273        """Initialize an ExperimentItemResult with the provided data.
274
275        Args:
276            item: The original experiment item that was processed.
277            output: The actual output produced by the task function for this item.
278            evaluations: List of evaluation results for this item.
279            trace_id: Optional Langfuse trace ID for this item's execution.
280            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
281
282        Note:
283            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
284        """
285        self.item = item
286        self.output = output
287        self.evaluations = evaluations
288        self.trace_id = trace_id
289        self.dataset_run_id = dataset_run_id

Initialize an ExperimentItemResult with the provided data.

Arguments:
  • item: The original experiment item that was processed.
  • output: The actual output produced by the task function for this item.
  • evaluations: List of evaluation results for this item.
  • trace_id: Optional Langfuse trace ID for this item's execution.
  • dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

item
output
evaluations
trace_id
dataset_run_id
class ExperimentResult:
292class ExperimentResult:
293    """Complete result structure for experiment execution.
294
295    This class encapsulates the complete results of running an experiment on a dataset,
296    including individual item results, aggregate run-level evaluations, and metadata
297    about the experiment execution.
298
299    Attributes:
300        name: The name of the experiment as specified during execution.
301        run_name: The name of the current experiment run.
302        description: Optional description of the experiment's purpose or methodology.
303        item_results: List of results from processing each individual dataset item,
304            containing the original item, task output, evaluations, and trace information.
305        run_evaluations: List of aggregate evaluation results computed across all items,
306            such as average scores, statistical summaries, or cross-item analyses.
307        dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
308        dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
309
310    Examples:
311        Basic usage with local dataset:
312        ```python
313        result = langfuse.run_experiment(
314            name="Capital Cities Test",
315            data=local_data,
316            task=generate_capital,
317            evaluators=[accuracy_check]
318        )
319
320        print(f"Processed {len(result.item_results)} items")
321        print(result.format())  # Human-readable summary
322
323        # Access individual results
324        for item_result in result.item_results:
325            print(f"Input: {item_result.item}")
326            print(f"Output: {item_result.output}")
327            print(f"Scores: {item_result.evaluations}")
328        ```
329
330        Usage with Langfuse datasets:
331        ```python
332        dataset = langfuse.get_dataset("qa-eval-set")
333        result = dataset.run_experiment(
334            name="GPT-4 QA Evaluation",
335            task=answer_question,
336            evaluators=[relevance_check, accuracy_check]
337        )
338
339        # View in Langfuse UI
340        if result.dataset_run_url:
341            print(f"View detailed results: {result.dataset_run_url}")
342        ```
343
344        Formatted output:
345        ```python
346        # Get summary view
347        summary = result.format()
348        print(summary)
349
350        # Get detailed view with individual items
351        detailed = result.format(include_item_results=True)
352        with open("experiment_report.txt", "w") as f:
353            f.write(detailed)
354        ```
355    """
356
357    def __init__(
358        self,
359        *,
360        name: str,
361        run_name: str,
362        description: Optional[str],
363        item_results: List[ExperimentItemResult],
364        run_evaluations: List[Evaluation],
365        dataset_run_id: Optional[str] = None,
366        dataset_run_url: Optional[str] = None,
367    ):
368        """Initialize an ExperimentResult with the provided data.
369
370        Args:
371            name: The name of the experiment.
372            run_name: The current experiment run name.
373            description: Optional description of the experiment.
374            item_results: List of results from processing individual dataset items.
375            run_evaluations: List of aggregate evaluation results for the entire run.
376            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
377            dataset_run_url: Optional URL to view results in Langfuse UI.
378        """
379        self.name = name
380        self.run_name = run_name
381        self.description = description
382        self.item_results = item_results
383        self.run_evaluations = run_evaluations
384        self.dataset_run_id = dataset_run_id
385        self.dataset_run_url = dataset_run_url
386
387    def format(self, *, include_item_results: bool = False) -> str:
388        r"""Format the experiment result for human-readable display.
389
390        Converts the experiment result into a nicely formatted string suitable for
391        console output, logging, or reporting. The output includes experiment overview,
392        aggregate statistics, and optionally individual item details.
393
394        This method provides a comprehensive view of experiment performance including:
395        - Experiment metadata (name, description, item count)
396        - List of evaluation metrics used across items
397        - Average scores computed across all processed items
398        - Run-level evaluation results (aggregate metrics)
399        - Links to view detailed results in Langfuse UI (when available)
400        - Individual item details (when requested)
401
402        Args:
403            include_item_results: Whether to include detailed results for each individual
404                item in the formatted output. When False (default), only shows aggregate
405                statistics and summary information. When True, includes input/output/scores
406                for every processed item, making the output significantly longer but more
407                detailed for debugging and analysis purposes.
408
409        Returns:
410            A formatted multi-line string containing:
411            - Experiment name and description (if provided)
412            - Total number of items successfully processed
413            - List of all evaluation metrics that were applied
414            - Average scores across all items for each numeric metric
415            - Run-level evaluation results with comments
416            - Dataset run URL for viewing in Langfuse UI (if applicable)
417            - Individual item details including inputs, outputs, and scores (if requested)
418
419        Examples:
420            Basic usage showing aggregate results only:
421            ```python
422            result = langfuse.run_experiment(
423                name="Capital Cities",
424                data=dataset,
425                task=generate_capital,
426                evaluators=[accuracy_evaluator]
427            )
428
429            print(result.format())
430            # Output:
431            # ──────────────────────────────────────────────────
432            # 🧪 Experiment: Capital Cities
433            # 100 items
434            # Evaluations:
435            #   • accuracy
436            # Average Scores:
437            #   • accuracy: 0.850
438            ```
439
440            Detailed output including all individual item results:
441            ```python
442            detailed_report = result.format(include_item_results=True)
443            print(detailed_report)
444            # Output includes each item:
445            # 1. Item 1:
446            #    Input:    What is the capital of France?
447            #    Expected: Paris
448            #    Actual:   The capital of France is Paris.
449            #    Scores:
450            #      • accuracy: 1.000
451            #        💭 Correct answer found
452            # [... continues for all items ...]
453            ```
454
455            Saving formatted results to file for reporting:
456            ```python
457            with open("experiment_report.txt", "w") as f:
458                f.write(result.format(include_item_results=True))
459
460            # Or create summary report
461            summary = result.format()  # Aggregate view only
462            print(f"Experiment Summary:\n{summary}")
463            ```
464
465            Integration with logging systems:
466            ```python
467            import logging
468            logger = logging.getLogger("experiments")
469
470            # Log summary after experiment
471            logger.info(f"Experiment completed:\n{result.format()}")
472
473            # Log detailed results for failed experiments
474            if any(eval_result.value < threshold for eval_result in result.run_evaluations):
475                logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
476            ```
477        """
478        if not self.item_results:
479            return "No experiment results to display."
480
481        output = ""
482
483        # Individual results section
484        if include_item_results:
485            for i, result in enumerate(self.item_results):
486                output += f"\n{i + 1}. Item {i + 1}:\n"
487
488                # Extract and display input
489                item_input = None
490                if isinstance(result.item, dict):
491                    item_input = result.item.get("input")
492                elif hasattr(result.item, "input"):
493                    item_input = result.item.input
494
495                if item_input is not None:
496                    output += f"   Input:    {_format_value(item_input)}\n"
497
498                # Extract and display expected output
499                expected_output = None
500                if isinstance(result.item, dict):
501                    expected_output = result.item.get("expected_output")
502                elif hasattr(result.item, "expected_output"):
503                    expected_output = result.item.expected_output
504
505                if expected_output is not None:
506                    output += f"   Expected: {_format_value(expected_output)}\n"
507                output += f"   Actual:   {_format_value(result.output)}\n"
508
509                # Display evaluation scores
510                if result.evaluations:
511                    output += "   Scores:\n"
512                    for evaluation in result.evaluations:
513                        score = evaluation.value
514                        if isinstance(score, (int, float)):
515                            score = f"{score:.3f}"
516                        output += f"     • {evaluation.name}: {score}"
517                        if evaluation.comment:
518                            output += f"\n       💭 {evaluation.comment}"
519                        output += "\n"
520
521                # Display trace link if available
522                if result.trace_id:
523                    output += f"\n   Trace ID: {result.trace_id}\n"
524        else:
525            output += f"Individual Results: Hidden ({len(self.item_results)} items)\n"
526            output += "💡 Set include_item_results=True to view them\n"
527
528        # Experiment overview section
529        output += f"\n{'─' * 50}\n"
530        output += f"🧪 Experiment: {self.name}"
531        output += f"\n📋 Run name: {self.run_name}"
532        if self.description:
533            output += f" - {self.description}"
534
535        output += f"\n{len(self.item_results)} items"
536
537        # Collect unique evaluation names across all items
538        evaluation_names = set()
539        for result in self.item_results:
540            for evaluation in result.evaluations:
541                evaluation_names.add(evaluation.name)
542
543        if evaluation_names:
544            output += "\nEvaluations:"
545            for eval_name in evaluation_names:
546                output += f"\n  • {eval_name}"
547            output += "\n"
548
549        # Calculate and display average scores
550        if evaluation_names:
551            output += "\nAverage Scores:"
552            for eval_name in evaluation_names:
553                scores = []
554                for result in self.item_results:
555                    for evaluation in result.evaluations:
556                        if evaluation.name == eval_name and isinstance(
557                            evaluation.value, (int, float)
558                        ):
559                            scores.append(evaluation.value)
560
561                if scores:
562                    avg = sum(scores) / len(scores)
563                    output += f"\n  • {eval_name}: {avg:.3f}"
564            output += "\n"
565
566        # Display run-level evaluations
567        if self.run_evaluations:
568            output += "\nRun Evaluations:"
569            for run_eval in self.run_evaluations:
570                score = run_eval.value
571                if isinstance(score, (int, float)):
572                    score = f"{score:.3f}"
573                output += f"\n  • {run_eval.name}: {score}"
574                if run_eval.comment:
575                    output += f"\n    💭 {run_eval.comment}"
576            output += "\n"
577
578        # Add dataset run URL if available
579        if self.dataset_run_url:
580            output += f"\n🔗 Dataset Run:\n   {self.dataset_run_url}"
581
582        return output

Complete result structure for experiment execution.

This class encapsulates the complete results of running an experiment on a dataset, including individual item results, aggregate run-level evaluations, and metadata about the experiment execution.

Attributes:
  • name: The name of the experiment as specified during execution.
  • run_name: The name of the current experiment run.
  • description: Optional description of the experiment's purpose or methodology.
  • item_results: List of results from processing each individual dataset item, containing the original item, task output, evaluations, and trace information.
  • run_evaluations: List of aggregate evaluation results computed across all items, such as average scores, statistical summaries, or cross-item analyses.
  • dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
  • dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
Examples:

Basic usage with local dataset:

result = langfuse.run_experiment(
    name="Capital Cities Test",
    data=local_data,
    task=generate_capital,
    evaluators=[accuracy_check]
)

print(f"Processed {len(result.item_results)} items")
print(result.format())  # Human-readable summary

# Access individual results
for item_result in result.item_results:
    print(f"Input: {item_result.item}")
    print(f"Output: {item_result.output}")
    print(f"Scores: {item_result.evaluations}")

Usage with Langfuse datasets:

dataset = langfuse.get_dataset("qa-eval-set")
result = dataset.run_experiment(
    name="GPT-4 QA Evaluation",
    task=answer_question,
    evaluators=[relevance_check, accuracy_check]
)

# View in Langfuse UI
if result.dataset_run_url:
    print(f"View detailed results: {result.dataset_run_url}")

Formatted output:

# Get summary view
summary = result.format()
print(summary)

# Get detailed view with individual items
detailed = result.format(include_item_results=True)
with open("experiment_report.txt", "w") as f:
    f.write(detailed)
ExperimentResult( *, name: str, run_name: str, description: Optional[str], item_results: List[ExperimentItemResult], run_evaluations: List[Evaluation], dataset_run_id: Optional[str] = None, dataset_run_url: Optional[str] = None)
357    def __init__(
358        self,
359        *,
360        name: str,
361        run_name: str,
362        description: Optional[str],
363        item_results: List[ExperimentItemResult],
364        run_evaluations: List[Evaluation],
365        dataset_run_id: Optional[str] = None,
366        dataset_run_url: Optional[str] = None,
367    ):
368        """Initialize an ExperimentResult with the provided data.
369
370        Args:
371            name: The name of the experiment.
372            run_name: The current experiment run name.
373            description: Optional description of the experiment.
374            item_results: List of results from processing individual dataset items.
375            run_evaluations: List of aggregate evaluation results for the entire run.
376            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
377            dataset_run_url: Optional URL to view results in Langfuse UI.
378        """
379        self.name = name
380        self.run_name = run_name
381        self.description = description
382        self.item_results = item_results
383        self.run_evaluations = run_evaluations
384        self.dataset_run_id = dataset_run_id
385        self.dataset_run_url = dataset_run_url

Initialize an ExperimentResult with the provided data.

Arguments:
  • name: The name of the experiment.
  • run_name: The current experiment run name.
  • description: Optional description of the experiment.
  • item_results: List of results from processing individual dataset items.
  • run_evaluations: List of aggregate evaluation results for the entire run.
  • dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
  • dataset_run_url: Optional URL to view results in Langfuse UI.
name
run_name
description
item_results
run_evaluations
dataset_run_id
dataset_run_url
def format(self, *, include_item_results: bool = False) -> str:
387    def format(self, *, include_item_results: bool = False) -> str:
388        r"""Format the experiment result for human-readable display.
389
390        Converts the experiment result into a nicely formatted string suitable for
391        console output, logging, or reporting. The output includes experiment overview,
392        aggregate statistics, and optionally individual item details.
393
394        This method provides a comprehensive view of experiment performance including:
395        - Experiment metadata (name, description, item count)
396        - List of evaluation metrics used across items
397        - Average scores computed across all processed items
398        - Run-level evaluation results (aggregate metrics)
399        - Links to view detailed results in Langfuse UI (when available)
400        - Individual item details (when requested)
401
402        Args:
403            include_item_results: Whether to include detailed results for each individual
404                item in the formatted output. When False (default), only shows aggregate
405                statistics and summary information. When True, includes input/output/scores
406                for every processed item, making the output significantly longer but more
407                detailed for debugging and analysis purposes.
408
409        Returns:
410            A formatted multi-line string containing:
411            - Experiment name and description (if provided)
412            - Total number of items successfully processed
413            - List of all evaluation metrics that were applied
414            - Average scores across all items for each numeric metric
415            - Run-level evaluation results with comments
416            - Dataset run URL for viewing in Langfuse UI (if applicable)
417            - Individual item details including inputs, outputs, and scores (if requested)
418
419        Examples:
420            Basic usage showing aggregate results only:
421            ```python
422            result = langfuse.run_experiment(
423                name="Capital Cities",
424                data=dataset,
425                task=generate_capital,
426                evaluators=[accuracy_evaluator]
427            )
428
429            print(result.format())
430            # Output:
431            # ──────────────────────────────────────────────────
432            # 🧪 Experiment: Capital Cities
433            # 100 items
434            # Evaluations:
435            #   • accuracy
436            # Average Scores:
437            #   • accuracy: 0.850
438            ```
439
440            Detailed output including all individual item results:
441            ```python
442            detailed_report = result.format(include_item_results=True)
443            print(detailed_report)
444            # Output includes each item:
445            # 1. Item 1:
446            #    Input:    What is the capital of France?
447            #    Expected: Paris
448            #    Actual:   The capital of France is Paris.
449            #    Scores:
450            #      • accuracy: 1.000
451            #        💭 Correct answer found
452            # [... continues for all items ...]
453            ```
454
455            Saving formatted results to file for reporting:
456            ```python
457            with open("experiment_report.txt", "w") as f:
458                f.write(result.format(include_item_results=True))
459
460            # Or create summary report
461            summary = result.format()  # Aggregate view only
462            print(f"Experiment Summary:\n{summary}")
463            ```
464
465            Integration with logging systems:
466            ```python
467            import logging
468            logger = logging.getLogger("experiments")
469
470            # Log summary after experiment
471            logger.info(f"Experiment completed:\n{result.format()}")
472
473            # Log detailed results for failed experiments
474            if any(eval_result.value < threshold for eval_result in result.run_evaluations):
475                logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
476            ```
477        """
478        if not self.item_results:
479            return "No experiment results to display."
480
481        output = ""
482
483        # Individual results section
484        if include_item_results:
485            for i, result in enumerate(self.item_results):
486                output += f"\n{i + 1}. Item {i + 1}:\n"
487
488                # Extract and display input
489                item_input = None
490                if isinstance(result.item, dict):
491                    item_input = result.item.get("input")
492                elif hasattr(result.item, "input"):
493                    item_input = result.item.input
494
495                if item_input is not None:
496                    output += f"   Input:    {_format_value(item_input)}\n"
497
498                # Extract and display expected output
499                expected_output = None
500                if isinstance(result.item, dict):
501                    expected_output = result.item.get("expected_output")
502                elif hasattr(result.item, "expected_output"):
503                    expected_output = result.item.expected_output
504
505                if expected_output is not None:
506                    output += f"   Expected: {_format_value(expected_output)}\n"
507                output += f"   Actual:   {_format_value(result.output)}\n"
508
509                # Display evaluation scores
510                if result.evaluations:
511                    output += "   Scores:\n"
512                    for evaluation in result.evaluations:
513                        score = evaluation.value
514                        if isinstance(score, (int, float)):
515                            score = f"{score:.3f}"
516                        output += f"     • {evaluation.name}: {score}"
517                        if evaluation.comment:
518                            output += f"\n       💭 {evaluation.comment}"
519                        output += "\n"
520
521                # Display trace link if available
522                if result.trace_id:
523                    output += f"\n   Trace ID: {result.trace_id}\n"
524        else:
525            output += f"Individual Results: Hidden ({len(self.item_results)} items)\n"
526            output += "💡 Set include_item_results=True to view them\n"
527
528        # Experiment overview section
529        output += f"\n{'─' * 50}\n"
530        output += f"🧪 Experiment: {self.name}"
531        output += f"\n📋 Run name: {self.run_name}"
532        if self.description:
533            output += f" - {self.description}"
534
535        output += f"\n{len(self.item_results)} items"
536
537        # Collect unique evaluation names across all items
538        evaluation_names = set()
539        for result in self.item_results:
540            for evaluation in result.evaluations:
541                evaluation_names.add(evaluation.name)
542
543        if evaluation_names:
544            output += "\nEvaluations:"
545            for eval_name in evaluation_names:
546                output += f"\n  • {eval_name}"
547            output += "\n"
548
549        # Calculate and display average scores
550        if evaluation_names:
551            output += "\nAverage Scores:"
552            for eval_name in evaluation_names:
553                scores = []
554                for result in self.item_results:
555                    for evaluation in result.evaluations:
556                        if evaluation.name == eval_name and isinstance(
557                            evaluation.value, (int, float)
558                        ):
559                            scores.append(evaluation.value)
560
561                if scores:
562                    avg = sum(scores) / len(scores)
563                    output += f"\n  • {eval_name}: {avg:.3f}"
564            output += "\n"
565
566        # Display run-level evaluations
567        if self.run_evaluations:
568            output += "\nRun Evaluations:"
569            for run_eval in self.run_evaluations:
570                score = run_eval.value
571                if isinstance(score, (int, float)):
572                    score = f"{score:.3f}"
573                output += f"\n  • {run_eval.name}: {score}"
574                if run_eval.comment:
575                    output += f"\n    💭 {run_eval.comment}"
576            output += "\n"
577
578        # Add dataset run URL if available
579        if self.dataset_run_url:
580            output += f"\n🔗 Dataset Run:\n   {self.dataset_run_url}"
581
582        return output

Format the experiment result for human-readable display.

Converts the experiment result into a nicely formatted string suitable for console output, logging, or reporting. The output includes experiment overview, aggregate statistics, and optionally individual item details.

This method provides a comprehensive view of experiment performance including:

  • Experiment metadata (name, description, item count)
  • List of evaluation metrics used across items
  • Average scores computed across all processed items
  • Run-level evaluation results (aggregate metrics)
  • Links to view detailed results in Langfuse UI (when available)
  • Individual item details (when requested)
Arguments:
  • include_item_results: Whether to include detailed results for each individual item in the formatted output. When False (default), only shows aggregate statistics and summary information. When True, includes input/output/scores for every processed item, making the output significantly longer but more detailed for debugging and analysis purposes.
Returns:

A formatted multi-line string containing:

  • Experiment name and description (if provided)
  • Total number of items successfully processed
  • List of all evaluation metrics that were applied
  • Average scores across all items for each numeric metric
  • Run-level evaluation results with comments
  • Dataset run URL for viewing in Langfuse UI (if applicable)
  • Individual item details including inputs, outputs, and scores (if requested)
Examples:

Basic usage showing aggregate results only:

result = langfuse.run_experiment(
    name="Capital Cities",
    data=dataset,
    task=generate_capital,
    evaluators=[accuracy_evaluator]
)

print(result.format())
# Output:
# ──────────────────────────────────────────────────
# 🧪 Experiment: Capital Cities
# 100 items
# Evaluations:
#   • accuracy
# Average Scores:
#   • accuracy: 0.850

Detailed output including all individual item results:

detailed_report = result.format(include_item_results=True)
print(detailed_report)
# Output includes each item:
# 1. Item 1:
#    Input:    What is the capital of France?
#    Expected: Paris
#    Actual:   The capital of France is Paris.
#    Scores:
#      • accuracy: 1.000
#        💭 Correct answer found
# [... continues for all items ...]

Saving formatted results to file for reporting:

with open("experiment_report.txt", "w") as f:
    f.write(result.format(include_item_results=True))

# Or create summary report
summary = result.format()  # Aggregate view only
print(f"Experiment Summary:\n{summary}")

Integration with logging systems:

import logging
logger = logging.getLogger("experiments")

# Log summary after experiment
logger.info(f"Experiment completed:\n{result.format()}")

# Log detailed results for failed experiments
if any(ev['value'] < threshold for ev in result.run_evaluations):
    logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
class TaskFunction(typing.Protocol):
585class TaskFunction(Protocol):
586    """Protocol defining the interface for experiment task functions.
587
588    Task functions are the core processing functions that operate on each item
589    in an experiment dataset. They receive an experiment item as input and
590    produce some output that will be evaluated.
591
592    Task functions must:
593    - Accept 'item' as a keyword argument
594    - Return any type of output (will be passed to evaluators)
595    - Can be either synchronous or asynchronous
596    - Should handle their own errors gracefully (exceptions will be logged)
597    """
598
599    def __call__(
600        self,
601        *,
602        item: ExperimentItem,
603        **kwargs: Dict[str, Any],
604    ) -> Union[Any, Awaitable[Any]]:
605        """Execute the task on an experiment item.
606
607        This method defines the core processing logic for each item in your experiment.
608        The implementation should focus on the specific task you want to evaluate,
609        such as text generation, classification, summarization, etc.
610
611        Args:
612            item: The experiment item to process. Can be either:
613                - Dict with keys like 'input', 'expected_output', 'metadata'
614                - Langfuse DatasetItem object with .input, .expected_output attributes
615            **kwargs: Additional keyword arguments that may be passed by the framework
616
617        Returns:
618            Any: The output of processing the item. This output will be:
619            - Stored in the experiment results
620            - Passed to all item-level evaluators for assessment
621            - Traced automatically in Langfuse for observability
622
623            Can return either a direct value or an awaitable (async) result.
624
625        Examples:
626            Simple synchronous task:
627            ```python
628            def my_task(*, item, **kwargs):
629                prompt = f"Summarize: {item['input']}"
630                return my_llm_client.generate(prompt)
631            ```
632
633            Async task with error handling:
634            ```python
635            async def my_async_task(*, item, **kwargs):
636                try:
637                    response = await openai_client.chat.completions.create(
638                        model="gpt-4",
639                        messages=[{"role": "user", "content": item["input"]}]
640                    )
641                    return response.choices[0].message.content
642                except Exception as e:
643                    # Log error and return fallback
644                    print(f"Task failed for item {item}: {e}")
645                    return "Error: Could not process item"
646            ```
647
648            Task using dataset item attributes:
649            ```python
650            def classification_task(*, item, **kwargs):
651                # Works with both dict items and DatasetItem objects
652                text = item["input"] if isinstance(item, dict) else item.input
653                return classify_text(text)
654            ```
655        """
656        ...

Protocol defining the interface for experiment task functions.

Task functions are the core processing functions that operate on each item in an experiment dataset. They receive an experiment item as input and produce some output that will be evaluated.

Task functions must:

  • Accept 'item' as a keyword argument
  • Return any type of output (will be passed to evaluators)
  • Be either synchronous or asynchronous
  • Handle their own errors gracefully (exceptions will be logged)
TaskFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class EvaluatorFunction(typing.Protocol):
659class EvaluatorFunction(Protocol):
660    """Protocol defining the interface for item-level evaluator functions.
661
662    Item-level evaluators assess the quality, correctness, or other properties
663    of individual task outputs. They receive the input, output, expected output,
664    and metadata for each item and return evaluation metrics.
665
666    Evaluators should:
667    - Accept input, output, expected_output, and metadata as keyword arguments
668    - Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields
669    - Be deterministic when possible for reproducible results
670    - Handle edge cases gracefully (missing expected output, malformed data, etc.)
671    - Can be either synchronous or asynchronous
672    """
673
674    def __call__(
675        self,
676        *,
677        input: Any,
678        output: Any,
679        expected_output: Any,
680        metadata: Optional[Dict[str, Any]],
681        **kwargs: Dict[str, Any],
682    ) -> Union[
683        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
684    ]:
685        r"""Evaluate a task output for quality, correctness, or other metrics.
686
687        This method should implement specific evaluation logic such as accuracy checking,
688        similarity measurement, toxicity detection, fluency assessment, etc.
689
690        Args:
691            input: The original input that was passed to the task function.
692                This is typically the item['input'] or item.input value.
693            output: The output produced by the task function for this input.
694                This is the direct return value from your task function.
695            expected_output: The expected/ground truth output for comparison.
696                May be None if not available in the dataset. Evaluators should
697                handle this case appropriately.
698            metadata: Optional metadata from the experiment item that might
699                contain additional context for evaluation (categories, difficulty, etc.)
700            **kwargs: Additional keyword arguments that may be passed by the framework
701
702        Returns:
703            Evaluation results in one of these formats:
704            - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."}
705            - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}]
706            - Awaitable returning either of the above (for async evaluators)
707
708            Each Evaluation dict should contain:
709            - name (str): Unique identifier for this evaluation metric
710            - value (int|float|str|bool): The evaluation score or result
711            - comment (str, optional): Human-readable explanation of the result
712            - metadata (dict, optional): Additional structured data about the evaluation
713
714        Examples:
715            Simple accuracy evaluator:
716            ```python
717            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
718                if expected_output is None:
719                    return {"name": "accuracy", "value": 0, "comment": "No expected output"}
720
721                is_correct = output.strip().lower() == expected_output.strip().lower()
722                return {
723                    "name": "accuracy",
724                    "value": 1.0 if is_correct else 0.0,
725                    "comment": "Exact match" if is_correct else "No match"
726                }
727            ```
728
729            Multi-metric evaluator:
730            ```python
731            def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
732                results = []
733
734                # Length check
735                results.append({
736                    "name": "output_length",
737                    "value": len(output),
738                    "comment": f"Output contains {len(output)} characters"
739                })
740
741                # Sentiment analysis
742                sentiment_score = analyze_sentiment(output)
743                results.append({
744                    "name": "sentiment",
745                    "value": sentiment_score,
746                    "comment": f"Sentiment score: {sentiment_score:.2f}"
747                })
748
749                return results
750            ```
751
752            Async evaluator using external API:
753            ```python
754            async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs):
755                prompt = f"Rate the quality of this response on a scale of 1-10:\n"
756                prompt += f"Question: {input}\nResponse: {output}"
757
758                response = await openai_client.chat.completions.create(
759                    model="gpt-4",
760                    messages=[{"role": "user", "content": prompt}]
761                )
762
763                try:
764                    score = float(response.choices[0].message.content.strip())
765                    return {
766                        "name": "llm_judge_quality",
767                        "value": score,
768                        "comment": f"LLM judge rated this {score}/10"
769                    }
770                except ValueError:
771                    return {
772                        "name": "llm_judge_quality",
773                        "value": 0,
774                        "comment": "Could not parse LLM judge score"
775                    }
776            ```
777
778            Context-aware evaluator:
779            ```python
780            def context_evaluator(*, input, output, metadata=None, **kwargs):
781                # Use metadata for context-specific evaluation
782                difficulty = metadata.get("difficulty", "medium") if metadata else "medium"
783
784                # Adjust expectations based on difficulty
785                min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty]
786
787                meets_requirement = len(output) >= min_length
788                return {
789                    "name": f"meets_{difficulty}_requirement",
790                    "value": meets_requirement,
791                    "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement"
792                }
793            ```
794        """
795        ...

Protocol defining the interface for item-level evaluator functions.

Item-level evaluators assess the quality, correctness, or other properties of individual task outputs. They receive the input, output, expected output, and metadata for each item and return evaluation metrics.

Evaluators should:

  • Accept input, output, expected_output, and metadata as keyword arguments
  • Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields
  • Be deterministic when possible for reproducible results
  • Handle edge cases gracefully (missing expected output, malformed data, etc.)
  • Be either synchronous or asynchronous
EvaluatorFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class RunEvaluatorFunction(typing.Protocol):
798class RunEvaluatorFunction(Protocol):
799    """Protocol defining the interface for run-level evaluator functions.
800
801    Run-level evaluators assess aggregate properties of the entire experiment run,
802    computing metrics that span across all items rather than individual outputs.
803    They receive the complete results from all processed items and can compute
804    statistics like averages, distributions, correlations, or other aggregate metrics.
805
806    Run evaluators should:
807    - Accept item_results as a keyword argument containing all item results
808    - Return Evaluation dict(s) with aggregate metrics
809    - Handle cases where some items may have failed processing
810    - Compute meaningful statistics across the dataset
811    - Can be either synchronous or asynchronous
812    """
813
814    def __call__(
815        self,
816        *,
817        item_results: List[ExperimentItemResult],
818        **kwargs: Dict[str, Any],
819    ) -> Union[
820        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
821    ]:
822        r"""Evaluate the entire experiment run with aggregate metrics.
823
824        This method should implement aggregate evaluation logic such as computing
825        averages, calculating distributions, finding correlations, detecting patterns
826        across items, or performing statistical analysis on the experiment results.
827
828        Args:
829            item_results: List of results from all successfully processed experiment items.
830                Each item result contains:
831                - item: The original experiment item
832                - output: The task function's output for this item
833                - evaluations: List of item-level evaluation results
834                - trace_id: Langfuse trace ID for this execution
835                - dataset_run_id: Dataset run ID (if using Langfuse datasets)
836
837                Note: This list only includes items that were successfully processed.
838                Failed items are excluded but logged separately.
839            **kwargs: Additional keyword arguments that may be passed by the framework
840
841        Returns:
842            Evaluation results in one of these formats:
843            - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."}
844            - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}]
845            - Awaitable returning either of the above (for async evaluators)
846
847            Each Evaluation dict should contain:
848            - name (str): Unique identifier for this run-level metric
849            - value (int|float|str|bool): The aggregate evaluation result
850            - comment (str, optional): Human-readable explanation of the metric
851            - metadata (dict, optional): Additional structured data about the evaluation
852
853        Examples:
854            Average accuracy calculator:
855            ```python
856            def average_accuracy(*, item_results, **kwargs):
857                if not item_results:
858                    return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"}
859
860                accuracy_values = []
861                for result in item_results:
862                    for evaluation in result.evaluations:
863                        if evaluation.name == "accuracy":
864                            accuracy_values.append(evaluation.value)
865
866                if not accuracy_values:
867                    return {"name": "avg_accuracy", "value": 0, "comment": "No accuracy evaluations found"}
868
869                avg = sum(accuracy_values) / len(accuracy_values)
870                return {
871                    "name": "avg_accuracy",
872                    "value": avg,
873                    "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}"
874                }
875            ```
876
877            Multiple aggregate metrics:
878            ```python
879            def statistical_summary(*, item_results, **kwargs):
880                if not item_results:
881                    return []
882
883                results = []
884
885                # Calculate output length statistics
886                lengths = [len(str(result.output)) for result in item_results]
887                results.extend([
888                    {"name": "avg_output_length", "value": sum(lengths) / len(lengths)},
889                    {"name": "min_output_length", "value": min(lengths)},
890                    {"name": "max_output_length", "value": max(lengths)}
891                ])
892
893                # Success rate
894                total_items = len(item_results)  # Only successful items are included
895                results.append({
896                    "name": "processing_success_rate",
897                    "value": 1.0,  # All items in item_results succeeded
898                    "comment": f"Successfully processed {total_items} items"
899                })
900
901                return results
902            ```
903
904            Async run evaluator with external analysis:
905            ```python
906            async def llm_batch_analysis(*, item_results, **kwargs):
907                # Prepare batch analysis prompt
908                outputs = [result.output for result in item_results]
909                prompt = f"Analyze these {len(outputs)} outputs for common themes:\n"
910                prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs))
911
912                response = await openai_client.chat.completions.create(
913                    model="gpt-4",
914                    messages=[{"role": "user", "content": prompt}]
915                )
916
917                return {
918                    "name": "thematic_analysis",
919                    "value": response.choices[0].message.content,
920                    "comment": f"LLM analysis of {len(outputs)} outputs"
921                }
922            ```
923
924            Performance distribution analysis:
925            ```python
926            def performance_distribution(*, item_results, **kwargs):
927                # Extract all evaluation scores
928                all_scores = []
929                score_by_metric = {}
930
931                for result in item_results:
932                    for evaluation in result.evaluations:
933                        metric_name = evaluation.name
934                        value = evaluation.value
935
936                        if isinstance(value, (int, float)):
937                            all_scores.append(value)
938                            if metric_name not in score_by_metric:
939                                score_by_metric[metric_name] = []
940                            score_by_metric[metric_name].append(value)
941
942                results = []
943
944                # Overall score distribution
945                if all_scores:
946                    import statistics
947                    results.append({
948                        "name": "score_std_dev",
949                        "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0,
950                        "comment": "Standard deviation across all numeric scores"
951                    })
952
953                # Per-metric statistics
954                for metric, scores in score_by_metric.items():
955                    if len(scores) > 1:
956                        results.append({
957                            "name": f"{metric}_variance",
958                            "value": statistics.variance(scores),
959                            "comment": f"Variance in {metric} across {len(scores)} items"
960                        })
961
962                return results
963            ```
964        """
965        ...

Protocol defining the interface for run-level evaluator functions.

Run-level evaluators assess aggregate properties of the entire experiment run, computing metrics that span across all items rather than individual outputs. They receive the complete results from all processed items and can compute statistics like averages, distributions, correlations, or other aggregate metrics.

Run evaluators should:

  • Accept item_results as a keyword argument containing all item results
  • Return Evaluation dict(s) with aggregate metrics
  • Handle cases where some items may have failed processing
  • Compute meaningful statistics across the dataset
  • Be either synchronous or asynchronous
RunEvaluatorFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
def create_evaluator_from_autoevals( autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]]) -> EvaluatorFunction:
1013def create_evaluator_from_autoevals(
1014    autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]]
1015) -> EvaluatorFunction:
1016    """Create a Langfuse evaluator from an autoevals evaluator.
1017
1018    Args:
1019        autoevals_evaluator: An autoevals evaluator instance
1020        **kwargs: Additional arguments passed to the evaluator
1021
1022    Returns:
1023        A Langfuse-compatible evaluator function
1024    """
1025
1026    def langfuse_evaluator(
1027        *,
1028        input: Any,
1029        output: Any,
1030        expected_output: Any,
1031        metadata: Optional[Dict[str, Any]],
1032        **langfuse_kwargs: Dict[str, Any],
1033    ) -> Evaluation:
1034        evaluation = autoevals_evaluator(
1035            input=input, output=output, expected=expected_output, **kwargs
1036        )
1037
1038        return Evaluation(
1039            name=evaluation.name,
1040            value=evaluation.score,
1041            comment=(evaluation.metadata or {}).get("comment"),
1042            metadata=evaluation.metadata,
1043        )
1044
1045    return langfuse_evaluator

Create a Langfuse evaluator from an autoevals evaluator.

Arguments:
  • autoevals_evaluator: An autoevals evaluator instance
  • **kwargs: Additional arguments passed to the evaluator
Returns:

A Langfuse-compatible evaluator function