langfuse.experiment

Langfuse experiment functionality for running and evaluating tasks on datasets.

This module provides the core experiment functionality for the Langfuse Python SDK, allowing users to run experiments on datasets with automatic tracing, evaluation, and result formatting.
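A minimal end-to-end sketch, based on the examples in the docstrings below. It assumes a configured `Langfuse` client whose `run_experiment` method accepts the arguments shown in this module's examples; `my_llm_client` is a placeholder for whatever model client you use:

```python
from langfuse import Evaluation, Langfuse

langfuse = Langfuse()  # reads credentials from environment variables


def generate_capital(*, item, **kwargs):
    # Task function: produce an output for a single experiment item.
    # `my_llm_client` is a placeholder for your own model client.
    return my_llm_client.generate(f"What is the capital of {item['input']}?")


def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    # Item-level evaluator: score the task output against the expected output.
    is_correct = expected_output is not None and expected_output.lower() in str(output).lower()
    return Evaluation(name="accuracy", value=1.0 if is_correct else 0.0)


data = [
    {"input": "France", "expected_output": "Paris"},
    {"input": "Japan", "expected_output": "Tokyo"},
]

result = langfuse.run_experiment(
    name="Capital Cities",
    data=data,
    task=generate_capital,
    evaluators=[accuracy_evaluator],
)
print(result.format())
```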

   1"""Langfuse experiment functionality for running and evaluating tasks on datasets.
   2
   3This module provides the core experiment functionality for the Langfuse Python SDK,
   4allowing users to run experiments on datasets with automatic tracing, evaluation,
   5and result formatting.
   6"""
   7
   8import asyncio
   9import logging
  10from typing import (
  11    TYPE_CHECKING,
  12    Any,
  13    Awaitable,
  14    Dict,
  15    List,
  16    Optional,
  17    Protocol,
  18    TypedDict,
  19    Union,
  20)
  21
  22from langfuse.api import ScoreDataType
  23
  24if TYPE_CHECKING:
  25    from langfuse._client.datasets import DatasetItemClient
  26
  27
  28class LocalExperimentItem(TypedDict, total=False):
  29    """Structure for local experiment data items (not from Langfuse datasets).
  30
  31    This TypedDict defines the structure for experiment items when using local data
  32    rather than Langfuse-hosted datasets. All fields are optional to provide
  33    flexibility in data structure.
  34
  35    Attributes:
  36        input: The input data to pass to the task function. Can be any type that
  37            your task function can process (string, dict, list, etc.). This is
  38            typically the prompt, question, or data that your task will operate on.
  39        expected_output: Optional expected/ground truth output for evaluation purposes.
  40            Used by evaluators to assess correctness or quality. Can be None if
  41            no ground truth is available.
  42        metadata: Optional metadata dictionary containing additional context about
  43            this specific item. Can include information like difficulty level,
  44            category, source, or any other relevant attributes that evaluators
  45            might use for context-aware evaluation.
  46
  47    Examples:
  48        Simple text processing item:
  49        ```python
  50        item: LocalExperimentItem = {
  51            "input": "Summarize this article: ...",
  52            "expected_output": "Expected summary...",
  53            "metadata": {"difficulty": "medium", "category": "news"}
  54        }
  55        ```
  56
  57        Classification item:
  58        ```python
  59        item: LocalExperimentItem = {
  60            "input": {"text": "This movie is great!", "context": "movie review"},
  61            "expected_output": "positive",
  62            "metadata": {"dataset_source": "imdb", "confidence": 0.95}
  63        }
  64        ```
  65
  66        Minimal item with only input:
  67        ```python
  68        item: LocalExperimentItem = {
  69            "input": "What is the capital of France?"
  70        }
  71        ```
  72    """
  73
  74    input: Any
  75    expected_output: Any
  76    metadata: Optional[Dict[str, Any]]
  77
  78
  79ExperimentItem = Union[LocalExperimentItem, "DatasetItemClient"]
  80"""Type alias for items that can be processed in experiments.
  81
  82Can be either:
  83- LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys
  84- DatasetItemClient: Items from Langfuse datasets with .input, .expected_output, .metadata attributes
  85"""
  86
  87ExperimentData = Union[List[LocalExperimentItem], List["DatasetItemClient"]]
  88"""Type alias for experiment datasets.
  89
  90Represents the collection of items to process in an experiment. Can be either:
  91- List[LocalExperimentItem]: Local data items as dictionaries
  92- List[DatasetItemClient]: Items from a Langfuse dataset (typically from dataset.items)
  93"""
  94
  95
  96class Evaluation:
  97    """Represents an evaluation result for an experiment item or an entire experiment run.
  98
  99    This class provides a strongly-typed way to create evaluation results in evaluator functions.
 100    Users must use keyword arguments when instantiating this class.
 101
 102    Attributes:
 103        name: Unique identifier for the evaluation metric. Should be descriptive
 104            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
 105            Used for aggregation and comparison across experiment runs.
 106        value: The evaluation score or result. Can be:
 107            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
 108            - String: For categorical results like "positive", "negative", "neutral"
 109            - Boolean: For binary assessments like "passes_safety_check"
 110        comment: Optional human-readable explanation of the evaluation result.
 111            Useful for providing context, explaining scoring rationale, or noting
 112            special conditions. Displayed in Langfuse UI for interpretability.
 113        metadata: Optional structured metadata about the evaluation process.
 114            Can include confidence scores, intermediate calculations, model versions,
 115            or any other relevant technical details.
 116        data_type: Optional score data type. Required if value is not NUMERIC.
 117            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
 118        config_id: Optional Langfuse score config ID.
 119
 120    Examples:
 121        Basic accuracy evaluation:
 122        ```python
 123        from langfuse import Evaluation
 124
 125        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
 126            if not expected_output:
 127                return Evaluation(name="accuracy", value=None, comment="No expected output")
 128
 129            is_correct = output.strip().lower() == expected_output.strip().lower()
 130            return Evaluation(
 131                name="accuracy",
 132                value=1.0 if is_correct else 0.0,
 133                comment="Correct answer" if is_correct else "Incorrect answer"
 134            )
 135        ```
 136
 137        Multi-metric evaluator:
 138        ```python
 139        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
 140            return [
 141                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
 142                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
 143                Evaluation(
 144                    name="quality",
 145                    value=0.85,
 146                    comment="High quality response",
 147                    metadata={"confidence": 0.92, "model": "gpt-4"}
 148                )
 149            ]
 150        ```
 151
 152        Categorical evaluation:
 153        ```python
 154        def sentiment_evaluator(*, input, output, **kwargs):
 155            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
 156            return Evaluation(
 157                name="sentiment",
 158                value=sentiment,
 159                comment=f"Response expresses {sentiment} sentiment",
 160                data_type="CATEGORICAL"
 161            )
 162        ```
 163
 164        Failed evaluation with error handling:
 165        ```python
 166        def external_api_evaluator(*, input, output, **kwargs):
 167            try:
 168                score = external_api.evaluate(output)
 169                return Evaluation(name="external_score", value=score)
 170            except Exception as e:
 171                return Evaluation(
 172                    name="external_score",
 173                    value=None,
 174                    comment=f"API unavailable: {e}",
 175                    metadata={"error": str(e), "retry_count": 3}
 176                )
 177        ```
 178
 179    Note:
 180        All arguments must be passed as keywords. Positional arguments are not allowed
 181        to ensure code clarity and prevent errors from argument reordering.
 182    """
 183
 184    def __init__(
 185        self,
 186        *,
 187        name: str,
 188        value: Union[int, float, str, bool],
 189        comment: Optional[str] = None,
 190        metadata: Optional[Dict[str, Any]] = None,
 191        data_type: Optional[ScoreDataType] = None,
 192        config_id: Optional[str] = None,
 193    ):
 194        """Initialize an Evaluation with the provided data.
 195
 196        Args:
 197            name: Unique identifier for the evaluation metric.
 198            value: The evaluation score or result.
 199            comment: Optional human-readable explanation of the result.
 200            metadata: Optional structured metadata about the evaluation process.
 201            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
 202            config_id: Optional Langfuse score config ID.
 203
 204        Note:
 205            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
 206        """
 207        self.name = name
 208        self.value = value
 209        self.comment = comment
 210        self.metadata = metadata
 211        self.data_type = data_type
 212        self.config_id = config_id
 213
 214
 215class ExperimentItemResult:
 216    """Result structure for individual experiment items.
 217
 218    This class represents the complete result of processing a single item
 219    during an experiment run, including the original input, task output,
 220    evaluations, and tracing information. Users must use keyword arguments when instantiating this class.
 221
 222    Attributes:
 223        item: The original experiment item that was processed. Can be either
 224            a dictionary with 'input', 'expected_output', and 'metadata' keys,
 225            or a DatasetItemClient from Langfuse datasets.
 226        output: The actual output produced by the task function for this item.
 227            Can be any type depending on what your task function returns.
 228        evaluations: List of evaluation results for this item. Each evaluation
 229            contains a name, value, optional comment, and optional metadata.
 230        trace_id: Optional Langfuse trace ID for this item's execution. Used
 231            to link the experiment result with the detailed trace in Langfuse UI.
 232        dataset_run_id: Optional dataset run ID if this item was part of a
 233            Langfuse dataset. None for local experiments.
 234
 235    Examples:
 236        Accessing item result data:
 237        ```python
 238        result = langfuse.run_experiment(...)
 239        for item_result in result.item_results:
 240            print(f"Input: {item_result.item}")
 241            print(f"Output: {item_result.output}")
 242            print(f"Trace: {item_result.trace_id}")
 243
 244            # Access evaluations
 245            for evaluation in item_result.evaluations:
 246                print(f"{evaluation.name}: {evaluation.value}")
 247        ```
 248
 249        Working with different item types:
 250        ```python
 251        # Local experiment item (dict)
 252        if isinstance(item_result.item, dict):
 253            input_data = item_result.item["input"]
 254            expected = item_result.item.get("expected_output")
 255
 256        # Langfuse dataset item (object with attributes)
 257        else:
 258            input_data = item_result.item.input
 259            expected = item_result.item.expected_output
 260        ```
 261
 262    Note:
 263        All arguments must be passed as keywords. Positional arguments are not allowed
 264        to ensure code clarity and prevent errors from argument reordering.
 265    """
 266
 267    def __init__(
 268        self,
 269        *,
 270        item: ExperimentItem,
 271        output: Any,
 272        evaluations: List[Evaluation],
 273        trace_id: Optional[str],
 274        dataset_run_id: Optional[str],
 275    ):
 276        """Initialize an ExperimentItemResult with the provided data.
 277
 278        Args:
 279            item: The original experiment item that was processed.
 280            output: The actual output produced by the task function for this item.
 281            evaluations: List of evaluation results for this item.
 282            trace_id: Optional Langfuse trace ID for this item's execution.
 283            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
 284
 285        Note:
 286            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
 287        """
 288        self.item = item
 289        self.output = output
 290        self.evaluations = evaluations
 291        self.trace_id = trace_id
 292        self.dataset_run_id = dataset_run_id
 293
 294
 295class ExperimentResult:
 296    """Complete result structure for experiment execution.
 297
 298    This class encapsulates the complete results of running an experiment on a dataset,
 299    including individual item results, aggregate run-level evaluations, and metadata
 300    about the experiment execution.
 301
 302    Attributes:
 303        name: The name of the experiment as specified during execution.
 304        run_name: The name of the current experiment run.
 305        description: Optional description of the experiment's purpose or methodology.
 306        item_results: List of results from processing each individual dataset item,
 307            containing the original item, task output, evaluations, and trace information.
 308        run_evaluations: List of aggregate evaluation results computed across all items,
 309            such as average scores, statistical summaries, or cross-item analyses.
 310        dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
 311        dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
 312
 313    Examples:
 314        Basic usage with local dataset:
 315        ```python
 316        result = langfuse.run_experiment(
 317            name="Capital Cities Test",
 318            data=local_data,
 319            task=generate_capital,
 320            evaluators=[accuracy_check]
 321        )
 322
 323        print(f"Processed {len(result.item_results)} items")
 324        print(result.format())  # Human-readable summary
 325
 326        # Access individual results
 327        for item_result in result.item_results:
 328            print(f"Input: {item_result.item}")
 329            print(f"Output: {item_result.output}")
 330            print(f"Scores: {item_result.evaluations}")
 331        ```
 332
 333        Usage with Langfuse datasets:
 334        ```python
 335        dataset = langfuse.get_dataset("qa-eval-set")
 336        result = dataset.run_experiment(
 337            name="GPT-4 QA Evaluation",
 338            task=answer_question,
 339            evaluators=[relevance_check, accuracy_check]
 340        )
 341
 342        # View in Langfuse UI
 343        if result.dataset_run_url:
 344            print(f"View detailed results: {result.dataset_run_url}")
 345        ```
 346
 347        Formatted output:
 348        ```python
 349        # Get summary view
 350        summary = result.format()
 351        print(summary)
 352
 353        # Get detailed view with individual items
 354        detailed = result.format(include_item_results=True)
 355        with open("experiment_report.txt", "w") as f:
 356            f.write(detailed)
 357        ```
 358    """
 359
 360    def __init__(
 361        self,
 362        *,
 363        name: str,
 364        run_name: str,
 365        description: Optional[str],
 366        item_results: List[ExperimentItemResult],
 367        run_evaluations: List[Evaluation],
 368        dataset_run_id: Optional[str] = None,
 369        dataset_run_url: Optional[str] = None,
 370    ):
 371        """Initialize an ExperimentResult with the provided data.
 372
 373        Args:
 374            name: The name of the experiment.
 375            run_name: The current experiment run name.
 376            description: Optional description of the experiment.
 377            item_results: List of results from processing individual dataset items.
 378            run_evaluations: List of aggregate evaluation results for the entire run.
 379            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
 380            dataset_run_url: Optional URL to view results in Langfuse UI.
 381        """
 382        self.name = name
 383        self.run_name = run_name
 384        self.description = description
 385        self.item_results = item_results
 386        self.run_evaluations = run_evaluations
 387        self.dataset_run_id = dataset_run_id
 388        self.dataset_run_url = dataset_run_url
 389
 390    def format(self, *, include_item_results: bool = False) -> str:
 391        r"""Format the experiment result for human-readable display.
 392
 393        Converts the experiment result into a nicely formatted string suitable for
 394        console output, logging, or reporting. The output includes experiment overview,
 395        aggregate statistics, and optionally individual item details.
 396
 397        This method provides a comprehensive view of experiment performance including:
 398        - Experiment metadata (name, description, item count)
 399        - List of evaluation metrics used across items
 400        - Average scores computed across all processed items
 401        - Run-level evaluation results (aggregate metrics)
 402        - Links to view detailed results in Langfuse UI (when available)
 403        - Individual item details (when requested)
 404
 405        Args:
 406            include_item_results: Whether to include detailed results for each individual
 407                item in the formatted output. When False (default), only shows aggregate
 408                statistics and summary information. When True, includes input/output/scores
 409                for every processed item, making the output significantly longer but more
 410                detailed for debugging and analysis purposes.
 411
 412        Returns:
 413            A formatted multi-line string containing:
 414            - Experiment name and description (if provided)
 415            - Total number of items successfully processed
 416            - List of all evaluation metrics that were applied
 417            - Average scores across all items for each numeric metric
 418            - Run-level evaluation results with comments
 419            - Dataset run URL for viewing in Langfuse UI (if applicable)
 420            - Individual item details including inputs, outputs, and scores (if requested)
 421
 422        Examples:
 423            Basic usage showing aggregate results only:
 424            ```python
 425            result = langfuse.run_experiment(
 426                name="Capital Cities",
 427                data=dataset,
 428                task=generate_capital,
 429                evaluators=[accuracy_evaluator]
 430            )
 431
 432            print(result.format())
 433            # Output:
 434            # ──────────────────────────────────────────────────
  435            # 🧪 Experiment: Capital Cities
 436            # 100 items
 437            # Evaluations:
 438            #   • accuracy
 439            # Average Scores:
 440            #   • accuracy: 0.850
 441            ```
 442
 443            Detailed output including all individual item results:
 444            ```python
 445            detailed_report = result.format(include_item_results=True)
 446            print(detailed_report)
 447            # Output includes each item:
 448            # 1. Item 1:
 449            #    Input:    What is the capital of France?
 450            #    Expected: Paris
 451            #    Actual:   The capital of France is Paris.
 452            #    Scores:
 453            #      • accuracy: 1.000
 454            #        💭 Correct answer found
 455            # [... continues for all items ...]
 456            ```
 457
 458            Saving formatted results to file for reporting:
 459            ```python
 460            with open("experiment_report.txt", "w") as f:
 461                f.write(result.format(include_item_results=True))
 462
 463            # Or create summary report
 464            summary = result.format()  # Aggregate view only
 465            print(f"Experiment Summary:\\n{summary}")
 466            ```
 467
 468            Integration with logging systems:
 469            ```python
 470            import logging
 471            logger = logging.getLogger("experiments")
 472
 473            # Log summary after experiment
 474            logger.info(f"Experiment completed:\\n{result.format()}")
 475
 476            # Log detailed results for failed experiments
 477            if any(eval['value'] < threshold for eval in result.run_evaluations):
 478                logger.warning(f"Poor performance detected:\\n{result.format(include_item_results=True)}")
 479            ```
 480        """
 481        if not self.item_results:
 482            return "No experiment results to display."
 483
 484        output = ""
 485
 486        # Individual results section
 487        if include_item_results:
 488            for i, result in enumerate(self.item_results):
 489                output += f"\\n{i + 1}. Item {i + 1}:\\n"
 490
 491                # Extract and display input
 492                item_input = None
 493                if isinstance(result.item, dict):
 494                    item_input = result.item.get("input")
 495                elif hasattr(result.item, "input"):
 496                    item_input = result.item.input
 497
 498                if item_input is not None:
 499                    output += f"   Input:    {_format_value(item_input)}\\n"
 500
 501                # Extract and display expected output
 502                expected_output = None
 503                if isinstance(result.item, dict):
 504                    expected_output = result.item.get("expected_output")
 505                elif hasattr(result.item, "expected_output"):
 506                    expected_output = result.item.expected_output
 507
 508                if expected_output is not None:
 509                    output += f"   Expected: {_format_value(expected_output)}\\n"
 510                output += f"   Actual:   {_format_value(result.output)}\\n"
 511
 512                # Display evaluation scores
 513                if result.evaluations:
 514                    output += "   Scores:\\n"
 515                    for evaluation in result.evaluations:
 516                        score = evaluation.value
 517                        if isinstance(score, (int, float)):
 518                            score = f"{score:.3f}"
 519                        output += f"     • {evaluation.name}: {score}"
 520                        if evaluation.comment:
 521                            output += f"\\n       💭 {evaluation.comment}"
 522                        output += "\\n"
 523
 524                # Display trace link if available
 525                if result.trace_id:
 526                    output += f"\\n   Trace ID: {result.trace_id}\\n"
 527        else:
 528            output += f"Individual Results: Hidden ({len(self.item_results)} items)\\n"
 529            output += "💡 Set include_item_results=True to view them\\n"
 530
 531        # Experiment overview section
 532        output += f"\\n{'─' * 50}\\n"
 533        output += f"🧪 Experiment: {self.name}"
 534        output += f"\n📋 Run name: {self.run_name}"
 535        if self.description:
 536            output += f" - {self.description}"
 537
 538        output += f"\\n{len(self.item_results)} items"
 539
 540        # Collect unique evaluation names across all items
 541        evaluation_names = set()
 542        for result in self.item_results:
 543            for evaluation in result.evaluations:
 544                evaluation_names.add(evaluation.name)
 545
 546        if evaluation_names:
 547            output += "\\nEvaluations:"
 548            for eval_name in evaluation_names:
 549                output += f"\\n  • {eval_name}"
 550            output += "\\n"
 551
 552        # Calculate and display average scores
 553        if evaluation_names:
 554            output += "\\nAverage Scores:"
 555            for eval_name in evaluation_names:
 556                scores = []
 557                for result in self.item_results:
 558                    for evaluation in result.evaluations:
 559                        if evaluation.name == eval_name and isinstance(
 560                            evaluation.value, (int, float)
 561                        ):
 562                            scores.append(evaluation.value)
 563
 564                if scores:
 565                    avg = sum(scores) / len(scores)
 566                    output += f"\\n  • {eval_name}: {avg:.3f}"
 567            output += "\\n"
 568
 569        # Display run-level evaluations
 570        if self.run_evaluations:
 571            output += "\\nRun Evaluations:"
 572            for run_eval in self.run_evaluations:
 573                score = run_eval.value
 574                if isinstance(score, (int, float)):
 575                    score = f"{score:.3f}"
 576                output += f"\\n  • {run_eval.name}: {score}"
 577                if run_eval.comment:
 578                    output += f"\\n    💭 {run_eval.comment}"
 579            output += "\\n"
 580
 581        # Add dataset run URL if available
 582        if self.dataset_run_url:
 583            output += f"\\n🔗 Dataset Run:\\n   {self.dataset_run_url}"
 584
 585        return output
 586
 587
 588class TaskFunction(Protocol):
 589    """Protocol defining the interface for experiment task functions.
 590
 591    Task functions are the core processing functions that operate on each item
 592    in an experiment dataset. They receive an experiment item as input and
 593    produce some output that will be evaluated.
 594
 595    Task functions must:
 596    - Accept 'item' as a keyword argument
 597    - Return any type of output (will be passed to evaluators)
 598    - Can be either synchronous or asynchronous
 599    - Should handle their own errors gracefully (exceptions will be logged)
 600    """
 601
 602    def __call__(
 603        self,
 604        *,
 605        item: ExperimentItem,
 606        **kwargs: Dict[str, Any],
 607    ) -> Union[Any, Awaitable[Any]]:
 608        """Execute the task on an experiment item.
 609
 610        This method defines the core processing logic for each item in your experiment.
 611        The implementation should focus on the specific task you want to evaluate,
 612        such as text generation, classification, summarization, etc.
 613
 614        Args:
 615            item: The experiment item to process. Can be either:
 616                - Dict with keys like 'input', 'expected_output', 'metadata'
 617                - Langfuse DatasetItem object with .input, .expected_output attributes
 618            **kwargs: Additional keyword arguments that may be passed by the framework
 619
 620        Returns:
 621            Any: The output of processing the item. This output will be:
 622            - Stored in the experiment results
 623            - Passed to all item-level evaluators for assessment
 624            - Traced automatically in Langfuse for observability
 625
 626            Can return either a direct value or an awaitable (async) result.
 627
 628        Examples:
 629            Simple synchronous task:
 630            ```python
 631            def my_task(*, item, **kwargs):
 632                prompt = f"Summarize: {item['input']}"
 633                return my_llm_client.generate(prompt)
 634            ```
 635
 636            Async task with error handling:
 637            ```python
 638            async def my_async_task(*, item, **kwargs):
 639                try:
 640                    response = await openai_client.chat.completions.create(
 641                        model="gpt-4",
 642                        messages=[{"role": "user", "content": item["input"]}]
 643                    )
 644                    return response.choices[0].message.content
 645                except Exception as e:
 646                    # Log error and return fallback
 647                    print(f"Task failed for item {item}: {e}")
 648                    return "Error: Could not process item"
 649            ```
 650
 651            Task using dataset item attributes:
 652            ```python
 653            def classification_task(*, item, **kwargs):
 654                # Works with both dict items and DatasetItem objects
 655                text = item["input"] if isinstance(item, dict) else item.input
 656                return classify_text(text)
 657            ```
 658        """
 659        ...
 660
 661
 662class EvaluatorFunction(Protocol):
 663    """Protocol defining the interface for item-level evaluator functions.
 664
 665    Item-level evaluators assess the quality, correctness, or other properties
 666    of individual task outputs. They receive the input, output, expected output,
 667    and metadata for each item and return evaluation metrics.
 668
 669    Evaluators should:
 670    - Accept input, output, expected_output, and metadata as keyword arguments
 671    - Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields
 672    - Be deterministic when possible for reproducible results
 673    - Handle edge cases gracefully (missing expected output, malformed data, etc.)
 674    - Can be either synchronous or asynchronous
 675    """
 676
 677    def __call__(
 678        self,
 679        *,
 680        input: Any,
 681        output: Any,
 682        expected_output: Any,
 683        metadata: Optional[Dict[str, Any]],
 684        **kwargs: Dict[str, Any],
 685    ) -> Union[
 686        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
 687    ]:
 688        r"""Evaluate a task output for quality, correctness, or other metrics.
 689
 690        This method should implement specific evaluation logic such as accuracy checking,
 691        similarity measurement, toxicity detection, fluency assessment, etc.
 692
 693        Args:
 694            input: The original input that was passed to the task function.
 695                This is typically the item['input'] or item.input value.
 696            output: The output produced by the task function for this input.
 697                This is the direct return value from your task function.
 698            expected_output: The expected/ground truth output for comparison.
 699                May be None if not available in the dataset. Evaluators should
 700                handle this case appropriately.
 701            metadata: Optional metadata from the experiment item that might
 702                contain additional context for evaluation (categories, difficulty, etc.)
 703            **kwargs: Additional keyword arguments that may be passed by the framework
 704
 705        Returns:
 706            Evaluation results in one of these formats:
 707            - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."}
 708            - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}]
 709            - Awaitable returning either of the above (for async evaluators)
 710
 711            Each Evaluation dict should contain:
 712            - name (str): Unique identifier for this evaluation metric
 713            - value (int|float|str|bool): The evaluation score or result
 714            - comment (str, optional): Human-readable explanation of the result
 715            - metadata (dict, optional): Additional structured data about the evaluation
 716
 717        Examples:
 718            Simple accuracy evaluator:
 719            ```python
 720            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
 721                if expected_output is None:
 722                    return {"name": "accuracy", "value": None, "comment": "No expected output"}
 723
 724                is_correct = output.strip().lower() == expected_output.strip().lower()
 725                return {
 726                    "name": "accuracy",
 727                    "value": 1.0 if is_correct else 0.0,
 728                    "comment": "Exact match" if is_correct else "No match"
 729                }
 730            ```
 731
 732            Multi-metric evaluator:
 733            ```python
 734            def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
 735                results = []
 736
 737                # Length check
 738                results.append({
 739                    "name": "output_length",
 740                    "value": len(output),
 741                    "comment": f"Output contains {len(output)} characters"
 742                })
 743
 744                # Sentiment analysis
 745                sentiment_score = analyze_sentiment(output)
 746                results.append({
 747                    "name": "sentiment",
 748                    "value": sentiment_score,
 749                    "comment": f"Sentiment score: {sentiment_score:.2f}"
 750                })
 751
 752                return results
 753            ```
 754
 755            Async evaluator using external API:
 756            ```python
 757            async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs):
 758                prompt = f"Rate the quality of this response on a scale of 1-10:\n"
 759                prompt += f"Question: {input}\nResponse: {output}"
 760
 761                response = await openai_client.chat.completions.create(
 762                    model="gpt-4",
 763                    messages=[{"role": "user", "content": prompt}]
 764                )
 765
 766                try:
 767                    score = float(response.choices[0].message.content.strip())
 768                    return {
 769                        "name": "llm_judge_quality",
 770                        "value": score,
 771                        "comment": f"LLM judge rated this {score}/10"
 772                    }
 773                except ValueError:
 774                    return {
 775                        "name": "llm_judge_quality",
 776                        "value": None,
 777                        "comment": "Could not parse LLM judge score"
 778                    }
 779            ```
 780
 781            Context-aware evaluator:
 782            ```python
 783            def context_evaluator(*, input, output, metadata=None, **kwargs):
 784                # Use metadata for context-specific evaluation
 785                difficulty = metadata.get("difficulty", "medium") if metadata else "medium"
 786
 787                # Adjust expectations based on difficulty
 788                min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty]
 789
 790                meets_requirement = len(output) >= min_length
 791                return {
 792                    "name": f"meets_{difficulty}_requirement",
 793                    "value": meets_requirement,
 794                    "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement"
 795                }
 796            ```
 797        """
 798        ...
 799
 800
 801class RunEvaluatorFunction(Protocol):
 802    """Protocol defining the interface for run-level evaluator functions.
 803
 804    Run-level evaluators assess aggregate properties of the entire experiment run,
 805    computing metrics that span across all items rather than individual outputs.
 806    They receive the complete results from all processed items and can compute
 807    statistics like averages, distributions, correlations, or other aggregate metrics.
 808
 809    Run evaluators should:
 810    - Accept item_results as a keyword argument containing all item results
 811    - Return Evaluation dict(s) with aggregate metrics
 812    - Handle cases where some items may have failed processing
 813    - Compute meaningful statistics across the dataset
 814    - Can be either synchronous or asynchronous
 815    """
 816
 817    def __call__(
 818        self,
 819        *,
 820        item_results: List[ExperimentItemResult],
 821        **kwargs: Dict[str, Any],
 822    ) -> Union[
 823        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
 824    ]:
 825        r"""Evaluate the entire experiment run with aggregate metrics.
 826
 827        This method should implement aggregate evaluation logic such as computing
 828        averages, calculating distributions, finding correlations, detecting patterns
 829        across items, or performing statistical analysis on the experiment results.
 830
 831        Args:
 832            item_results: List of results from all successfully processed experiment items.
 833                Each item result contains:
 834                - item: The original experiment item
 835                - output: The task function's output for this item
 836                - evaluations: List of item-level evaluation results
 837                - trace_id: Langfuse trace ID for this execution
 838                - dataset_run_id: Dataset run ID (if using Langfuse datasets)
 839
 840                Note: This list only includes items that were successfully processed.
 841                Failed items are excluded but logged separately.
 842            **kwargs: Additional keyword arguments that may be passed by the framework
 843
 844        Returns:
 845            Evaluation results in one of these formats:
 846            - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."}
 847            - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}]
 848            - Awaitable returning either of the above (for async evaluators)
 849
 850            Each Evaluation dict should contain:
 851            - name (str): Unique identifier for this run-level metric
 852            - value (int|float|str|bool): The aggregate evaluation result
 853            - comment (str, optional): Human-readable explanation of the metric
 854            - metadata (dict, optional): Additional structured data about the evaluation
 855
 856        Examples:
 857            Average accuracy calculator:
 858            ```python
 859            def average_accuracy(*, item_results, **kwargs):
 860                if not item_results:
 861                    return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"}
 862
 863                accuracy_values = []
 864                for result in item_results:
 865                    for evaluation in result.evaluations:
 866                        if evaluation.name == "accuracy":
 867                            accuracy_values.append(evaluation.value)
 868
 869                if not accuracy_values:
 870                    return {"name": "avg_accuracy", "value": None, "comment": "No accuracy evaluations found"}
 871
 872                avg = sum(accuracy_values) / len(accuracy_values)
 873                return {
 874                    "name": "avg_accuracy",
 875                    "value": avg,
 876                    "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}"
 877                }
 878            ```
 879
 880            Multiple aggregate metrics:
 881            ```python
 882            def statistical_summary(*, item_results, **kwargs):
 883                if not item_results:
 884                    return []
 885
 886                results = []
 887
 888                # Calculate output length statistics
 889                lengths = [len(str(result.output)) for result in item_results]
 890                results.extend([
 891                    {"name": "avg_output_length", "value": sum(lengths) / len(lengths)},
 892                    {"name": "min_output_length", "value": min(lengths)},
 893                    {"name": "max_output_length", "value": max(lengths)}
 894                ])
 895
 896                # Success rate
 897                total_items = len(item_results)  # Only successful items are included
 898                results.append({
 899                    "name": "processing_success_rate",
 900                    "value": 1.0,  # All items in item_results succeeded
 901                    "comment": f"Successfully processed {total_items} items"
 902                })
 903
 904                return results
 905            ```
 906
 907            Async run evaluator with external analysis:
 908            ```python
 909            async def llm_batch_analysis(*, item_results, **kwargs):
 910                # Prepare batch analysis prompt
 911                outputs = [result.output for result in item_results]
 912                prompt = f"Analyze these {len(outputs)} outputs for common themes:\n"
 913                prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs))
 914
 915                response = await openai_client.chat.completions.create(
 916                    model="gpt-4",
 917                    messages=[{"role": "user", "content": prompt}]
 918                )
 919
 920                return {
 921                    "name": "thematic_analysis",
 922                    "value": response.choices[0].message.content,
 923                    "comment": f"LLM analysis of {len(outputs)} outputs"
 924                }
 925            ```
 926
 927            Performance distribution analysis:
 928            ```python
 929            def performance_distribution(*, item_results, **kwargs):
 930                # Extract all evaluation scores
 931                all_scores = []
 932                score_by_metric = {}
 933
 934                for result in item_results:
 935                    for evaluation in result.evaluations:
 936                        metric_name = evaluation.name
 937                        value = evaluation.value
 938
 939                        if isinstance(value, (int, float)):
 940                            all_scores.append(value)
 941                            if metric_name not in score_by_metric:
 942                                score_by_metric[metric_name] = []
 943                            score_by_metric[metric_name].append(value)
 944
 945                results = []
 946
 947                # Overall score distribution
 948                if all_scores:
 949                    import statistics
 950                    results.append({
 951                        "name": "score_std_dev",
 952                        "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0,
 953                        "comment": f"Standard deviation across all numeric scores"
 954                    })
 955
 956                # Per-metric statistics
 957                for metric, scores in score_by_metric.items():
 958                    if len(scores) > 1:
 959                        results.append({
 960                            "name": f"{metric}_variance",
 961                            "value": statistics.variance(scores),
 962                            "comment": f"Variance in {metric} across {len(scores)} items"
 963                        })
 964
 965                return results
 966            ```
 967        """
 968        ...
 969
 970
 971def _format_value(value: Any) -> str:
 972    """Format a value for display."""
 973    if isinstance(value, str):
 974        return value[:50] + "..." if len(value) > 50 else value
 975    return str(value)
 976
 977
 978async def _run_evaluator(
 979    evaluator: Union[EvaluatorFunction, RunEvaluatorFunction], **kwargs: Any
 980) -> List[Evaluation]:
 981    """Run an evaluator function and normalize the result."""
 982    try:
 983        result = evaluator(**kwargs)
 984
 985        # Handle async evaluators
 986        if asyncio.iscoroutine(result):
 987            result = await result
 988
 989        # Normalize to list
 990        if isinstance(result, (dict, Evaluation)):
 991            return [result]  # type: ignore
 992
 993        elif isinstance(result, list):
 994            return result
 995
 996        else:
 997            return []
 998
 999    except Exception as e:
1000        evaluator_name = getattr(evaluator, "__name__", "unknown_evaluator")
1001        logging.getLogger("langfuse").error(f"Evaluator {evaluator_name} failed: {e}")
1002        return []
1003
1004
1005async def _run_task(task: TaskFunction, item: ExperimentItem) -> Any:
1006    """Run a task function and handle sync/async."""
1007    result = task(item=item)
1008
1009    # Handle async tasks
1010    if asyncio.iscoroutine(result):
1011        result = await result
1012
1013    return result
1014
1015
1016def create_evaluator_from_autoevals(
1017    autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]]
1018) -> EvaluatorFunction:
1019    """Create a Langfuse evaluator from an autoevals evaluator.
1020
1021    Args:
1022        autoevals_evaluator: An autoevals evaluator instance
1023        **kwargs: Additional arguments passed to the evaluator
1024
1025    Returns:
1026        A Langfuse-compatible evaluator function
1027    """
1028
1029    def langfuse_evaluator(
1030        *,
1031        input: Any,
1032        output: Any,
1033        expected_output: Any,
1034        metadata: Optional[Dict[str, Any]],
1035        **langfuse_kwargs: Dict[str, Any],
1036    ) -> Evaluation:
1037        evaluation = autoevals_evaluator(
1038            input=input, output=output, expected=expected_output, **kwargs
1039        )
1040
1041        return Evaluation(
1042            name=evaluation.name,
1043            value=evaluation.score,
1044            comment=(evaluation.metadata or {}).get("comment"),
1045            metadata=evaluation.metadata,
1046        )
1047
1048    return langfuse_evaluator
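
For reference, a short hedged sketch of how `create_evaluator_from_autoevals` can be combined with `run_experiment`. It assumes the optional `autoevals` package is installed and exposes a `Levenshtein` scorer (exact scorer names and signatures vary by autoevals version); the lambda task is a trivial placeholder:

```python
from autoevals import Levenshtein  # assumes the optional autoevals package is installed

from langfuse import Langfuse
from langfuse.experiment import create_evaluator_from_autoevals

langfuse = Langfuse()

# Wrap the autoevals scorer so it matches the EvaluatorFunction protocol.
levenshtein_evaluator = create_evaluator_from_autoevals(Levenshtein())

result = langfuse.run_experiment(
    name="String similarity check",
    data=[{"input": "Say hello", "expected_output": "hello"}],
    # Placeholder task: echo the last word of the input in lowercase.
    task=lambda *, item, **kwargs: item["input"].split()[-1].lower(),
    evaluators=[levenshtein_evaluator],
)
print(result.format())
```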
class LocalExperimentItem(typing.TypedDict):
29class LocalExperimentItem(TypedDict, total=False):
30    """Structure for local experiment data items (not from Langfuse datasets).
31
32    This TypedDict defines the structure for experiment items when using local data
33    rather than Langfuse-hosted datasets. All fields are optional to provide
34    flexibility in data structure.
35
36    Attributes:
37        input: The input data to pass to the task function. Can be any type that
38            your task function can process (string, dict, list, etc.). This is
39            typically the prompt, question, or data that your task will operate on.
40        expected_output: Optional expected/ground truth output for evaluation purposes.
41            Used by evaluators to assess correctness or quality. Can be None if
42            no ground truth is available.
43        metadata: Optional metadata dictionary containing additional context about
44            this specific item. Can include information like difficulty level,
45            category, source, or any other relevant attributes that evaluators
46            might use for context-aware evaluation.
47
48    Examples:
49        Simple text processing item:
50        ```python
51        item: LocalExperimentItem = {
52            "input": "Summarize this article: ...",
53            "expected_output": "Expected summary...",
54            "metadata": {"difficulty": "medium", "category": "news"}
55        }
56        ```
57
58        Classification item:
59        ```python
60        item: LocalExperimentItem = {
61            "input": {"text": "This movie is great!", "context": "movie review"},
62            "expected_output": "positive",
63            "metadata": {"dataset_source": "imdb", "confidence": 0.95}
64        }
65        ```
66
67        Minimal item with only input:
68        ```python
69        item: LocalExperimentItem = {
70            "input": "What is the capital of France?"
71        }
72        ```
73    """
74
75    input: Any
76    expected_output: Any
77    metadata: Optional[Dict[str, Any]]

Structure for local experiment data items (not from Langfuse datasets).

This TypedDict defines the structure for experiment items when using local data rather than Langfuse-hosted datasets. All fields are optional to provide flexibility in data structure.

Attributes:
  • input: The input data to pass to the task function. Can be any type that your task function can process (string, dict, list, etc.). This is typically the prompt, question, or data that your task will operate on.
  • expected_output: Optional expected/ground truth output for evaluation purposes. Used by evaluators to assess correctness or quality. Can be None if no ground truth is available.
  • metadata: Optional metadata dictionary containing additional context about this specific item. Can include information like difficulty level, category, source, or any other relevant attributes that evaluators might use for context-aware evaluation.
Examples:

Simple text processing item:

item: LocalExperimentItem = {
    "input": "Summarize this article: ...",
    "expected_output": "Expected summary...",
    "metadata": {"difficulty": "medium", "category": "news"}
}

Classification item:

item: LocalExperimentItem = {
    "input": {"text": "This movie is great!", "context": "movie review"},
    "expected_output": "positive",
    "metadata": {"dataset_source": "imdb", "confidence": 0.95}
}

Minimal item with only input:

item: LocalExperimentItem = {
    "input": "What is the capital of France?"
}
input: Any
expected_output: Any
metadata: Optional[Dict[str, Any]]
ExperimentItem = typing.Union[LocalExperimentItem, ForwardRef('DatasetItemClient')]

Type alias for items that can be processed in experiments.

Can be either:

  • LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys
  • DatasetItemClient: Items from Langfuse datasets with .input, .expected_output, .metadata attributes
ExperimentData = typing.Union[typing.List[LocalExperimentItem], typing.List[ForwardRef('DatasetItemClient')]]

Type alias for experiment datasets.

Represents the collection of items to process in an experiment. Can be either:

  • List[LocalExperimentItem]: Local data items as dictionaries
  • List[DatasetItemClient]: Items from a Langfuse dataset (typically from dataset.items)
class Evaluation:
 97class Evaluation:
 98    """Represents an evaluation result for an experiment item or an entire experiment run.
 99
100    This class provides a strongly-typed way to create evaluation results in evaluator functions.
101    Users must use keyword arguments when instantiating this class.
102
103    Attributes:
104        name: Unique identifier for the evaluation metric. Should be descriptive
105            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
106            Used for aggregation and comparison across experiment runs.
107        value: The evaluation score or result. Can be:
108            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
109            - String: For categorical results like "positive", "negative", "neutral"
110            - Boolean: For binary assessments like "passes_safety_check"
111        comment: Optional human-readable explanation of the evaluation result.
112            Useful for providing context, explaining scoring rationale, or noting
113            special conditions. Displayed in Langfuse UI for interpretability.
114        metadata: Optional structured metadata about the evaluation process.
115            Can include confidence scores, intermediate calculations, model versions,
116            or any other relevant technical details.
117        data_type: Optional score data type. Required if value is not NUMERIC.
118            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
119        config_id: Optional Langfuse score config ID.
120
121    Examples:
122        Basic accuracy evaluation:
123        ```python
124        from langfuse import Evaluation
125
126        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
127            if not expected_output:
128                return Evaluation(name="accuracy", value=None, comment="No expected output")
129
130            is_correct = output.strip().lower() == expected_output.strip().lower()
131            return Evaluation(
132                name="accuracy",
133                value=1.0 if is_correct else 0.0,
134                comment="Correct answer" if is_correct else "Incorrect answer"
135            )
136        ```
137
138        Multi-metric evaluator:
139        ```python
140        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
141            return [
142                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
143                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
144                Evaluation(
145                    name="quality",
146                    value=0.85,
147                    comment="High quality response",
148                    metadata={"confidence": 0.92, "model": "gpt-4"}
149                )
150            ]
151        ```
152
153        Categorical evaluation:
154        ```python
155        def sentiment_evaluator(*, input, output, **kwargs):
156            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
157            return Evaluation(
158                name="sentiment",
159                value=sentiment,
160                comment=f"Response expresses {sentiment} sentiment",
161                data_type="CATEGORICAL"
162            )
163        ```
164
165        Failed evaluation with error handling:
166        ```python
167        def external_api_evaluator(*, input, output, **kwargs):
168            try:
169                score = external_api.evaluate(output)
170                return Evaluation(name="external_score", value=score)
171            except Exception as e:
172                return Evaluation(
173                    name="external_score",
174                    value=None,
175                    comment=f"API unavailable: {e}",
176                    metadata={"error": str(e), "retry_count": 3}
177                )
178        ```
179
180    Note:
181        All arguments must be passed as keywords. Positional arguments are not allowed
182        to ensure code clarity and prevent errors from argument reordering.
183    """
184
185    def __init__(
186        self,
187        *,
188        name: str,
189        value: Union[int, float, str, bool],
190        comment: Optional[str] = None,
191        metadata: Optional[Dict[str, Any]] = None,
192        data_type: Optional[ScoreDataType] = None,
193        config_id: Optional[str] = None,
194    ):
195        """Initialize an Evaluation with the provided data.
196
197        Args:
198            name: Unique identifier for the evaluation metric.
199            value: The evaluation score or result.
200            comment: Optional human-readable explanation of the result.
201            metadata: Optional structured metadata about the evaluation process.
202            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
203            config_id: Optional Langfuse score config ID.
204
205        Note:
206            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
207        """
208        self.name = name
209        self.value = value
210        self.comment = comment
211        self.metadata = metadata
212        self.data_type = data_type
213        self.config_id = config_id

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type, one of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC, so it must be set explicitly whenever the value is not numeric (e.g. categorical strings or booleans).
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=None, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=None,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[langfuse.api.ScoreDataType] = None, config_id: Optional[str] = None)
185    def __init__(
186        self,
187        *,
188        name: str,
189        value: Union[int, float, str, bool],
190        comment: Optional[str] = None,
191        metadata: Optional[Dict[str, Any]] = None,
192        data_type: Optional[ScoreDataType] = None,
193        config_id: Optional[str] = None,
194    ):
195        """Initialize an Evaluation with the provided data.
196
197        Args:
198            name: Unique identifier for the evaluation metric.
199            value: The evaluation score or result.
200            comment: Optional human-readable explanation of the result.
201            metadata: Optional structured metadata about the evaluation process.
202            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
203            config_id: Optional Langfuse score config ID.
204
205        Note:
206            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
207        """
208        self.name = name
209        self.value = value
210        self.comment = comment
211        self.metadata = metadata
212        self.data_type = data_type
213        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
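
For orientation, a minimal sketch of constructing and reading an Evaluation; the metric name and values below are illustrative, and because the constructor is keyword-only, a positional call such as `Evaluation("accuracy", 1.0)` raises a TypeError:

```python
from langfuse import Evaluation

evaluation = Evaluation(
    name="accuracy",
    value=1.0,
    comment="Exact match",
    metadata={"method": "string_comparison"},  # optional structured context
)

# Attributes are plain instance attributes.
print(evaluation.name, evaluation.value)  # accuracy 1.0
print(evaluation.data_type)               # None here; per the attribute docs, treated as NUMERIC
```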
class ExperimentItemResult:
216class ExperimentItemResult:
217    """Result structure for individual experiment items.
218
219    This class represents the complete result of processing a single item
220    during an experiment run, including the original input, task output,
221    evaluations, and tracing information. Users must use keyword arguments when instantiating this class.
222
223    Attributes:
224        item: The original experiment item that was processed. Can be either
225            a dictionary with 'input', 'expected_output', and 'metadata' keys,
226            or a DatasetItemClient from Langfuse datasets.
227        output: The actual output produced by the task function for this item.
228            Can be any type depending on what your task function returns.
229        evaluations: List of evaluation results for this item. Each evaluation
230            contains a name, value, optional comment, and optional metadata.
231        trace_id: Optional Langfuse trace ID for this item's execution. Used
232            to link the experiment result with the detailed trace in Langfuse UI.
233        dataset_run_id: Optional dataset run ID if this item was part of a
234            Langfuse dataset. None for local experiments.
235
236    Examples:
237        Accessing item result data:
238        ```python
239        result = langfuse.run_experiment(...)
240        for item_result in result.item_results:
241            print(f"Input: {item_result.item}")
242            print(f"Output: {item_result.output}")
243            print(f"Trace: {item_result.trace_id}")
244
245            # Access evaluations
246            for evaluation in item_result.evaluations:
247                print(f"{evaluation.name}: {evaluation.value}")
248        ```
249
250        Working with different item types:
251        ```python
252        # Local experiment item (dict)
253        if isinstance(item_result.item, dict):
254            input_data = item_result.item["input"]
255            expected = item_result.item.get("expected_output")
256
257        # Langfuse dataset item (object with attributes)
258        else:
259            input_data = item_result.item.input
260            expected = item_result.item.expected_output
261        ```
262
263    Note:
264        All arguments must be passed as keywords. Positional arguments are not allowed
265        to ensure code clarity and prevent errors from argument reordering.
266    """
267
268    def __init__(
269        self,
270        *,
271        item: ExperimentItem,
272        output: Any,
273        evaluations: List[Evaluation],
274        trace_id: Optional[str],
275        dataset_run_id: Optional[str],
276    ):
277        """Initialize an ExperimentItemResult with the provided data.
278
279        Args:
280            item: The original experiment item that was processed.
281            output: The actual output produced by the task function for this item.
282            evaluations: List of evaluation results for this item.
283            trace_id: Optional Langfuse trace ID for this item's execution.
284            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
285
286        Note:
287            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
288        """
289        self.item = item
290        self.output = output
291        self.evaluations = evaluations
292        self.trace_id = trace_id
293        self.dataset_run_id = dataset_run_id

Result structure for individual experiment items.

This class represents the complete result of processing a single item during an experiment run, including the original input, task output, evaluations, and tracing information. Users must use keyword arguments when instantiating this class.

Attributes:
  • item: The original experiment item that was processed. Can be either a dictionary with 'input', 'expected_output', and 'metadata' keys, or a DatasetItemClient from Langfuse datasets.
  • output: The actual output produced by the task function for this item. Can be any type depending on what your task function returns.
  • evaluations: List of evaluation results for this item. Each evaluation contains a name, value, optional comment, and optional metadata.
  • trace_id: Optional Langfuse trace ID for this item's execution. Used to link the experiment result with the detailed trace in Langfuse UI.
  • dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset. None for local experiments.
Examples:

Accessing item result data:

result = langfuse.run_experiment(...)
for item_result in result.item_results:
    print(f"Input: {item_result.item}")
    print(f"Output: {item_result.output}")
    print(f"Trace: {item_result.trace_id}")

    # Access evaluations
    for evaluation in item_result.evaluations:
        print(f"{evaluation.name}: {evaluation.value}")

Working with different item types:

# Local experiment item (dict)
if isinstance(item_result.item, dict):
    input_data = item_result.item["input"]
    expected = item_result.item.get("expected_output")

# Langfuse dataset item (object with attributes)
else:
    input_data = item_result.item.input
    expected = item_result.item.expected_output
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

ExperimentItemResult( *, item: Union[LocalExperimentItem, langfuse._client.datasets.DatasetItemClient], output: Any, evaluations: List[Evaluation], trace_id: Optional[str], dataset_run_id: Optional[str])
268    def __init__(
269        self,
270        *,
271        item: ExperimentItem,
272        output: Any,
273        evaluations: List[Evaluation],
274        trace_id: Optional[str],
275        dataset_run_id: Optional[str],
276    ):
277        """Initialize an ExperimentItemResult with the provided data.
278
279        Args:
280            item: The original experiment item that was processed.
281            output: The actual output produced by the task function for this item.
282            evaluations: List of evaluation results for this item.
283            trace_id: Optional Langfuse trace ID for this item's execution.
284            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
285
286        Note:
287            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
288        """
289        self.item = item
290        self.output = output
291        self.evaluations = evaluations
292        self.trace_id = trace_id
293        self.dataset_run_id = dataset_run_id

Initialize an ExperimentItemResult with the provided data.

Arguments:
  • item: The original experiment item that was processed.
  • output: The actual output produced by the task function for this item.
  • evaluations: List of evaluation results for this item.
  • trace_id: Optional Langfuse trace ID for this item's execution.
  • dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

item
output
evaluations
trace_id
dataset_run_id
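
As a sketch of how these attributes are typically consumed, the snippet below collects item results whose "accuracy" score falls below 1.0 and prints their inputs; `result` and the "accuracy" evaluator name are assumptions carried over from the examples above:

```python
# Assumes `result` is an ExperimentResult from langfuse.run_experiment(...)
# and that an evaluator named "accuracy" was used, as in the examples above.
failing = [
    item_result
    for item_result in result.item_results
    if any(
        evaluation.name == "accuracy"
        and isinstance(evaluation.value, (int, float))
        and evaluation.value < 1.0
        for evaluation in item_result.evaluations
    )
]

for item_result in failing:
    # Works for both dict items and DatasetItemClient objects.
    item_input = (
        item_result.item.get("input")
        if isinstance(item_result.item, dict)
        else item_result.item.input
    )
    print(f"Missed: {item_input!r} -> {item_result.output!r} (trace: {item_result.trace_id})")
```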
class ExperimentResult:
296class ExperimentResult:
297    """Complete result structure for experiment execution.
298
299    This class encapsulates the complete results of running an experiment on a dataset,
300    including individual item results, aggregate run-level evaluations, and metadata
301    about the experiment execution.
302
303    Attributes:
304        name: The name of the experiment as specified during execution.
305        run_name: The name of the current experiment run.
306        description: Optional description of the experiment's purpose or methodology.
307        item_results: List of results from processing each individual dataset item,
308            containing the original item, task output, evaluations, and trace information.
309        run_evaluations: List of aggregate evaluation results computed across all items,
310            such as average scores, statistical summaries, or cross-item analyses.
311        dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
312        dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
313
314    Examples:
315        Basic usage with local dataset:
316        ```python
317        result = langfuse.run_experiment(
318            name="Capital Cities Test",
319            data=local_data,
320            task=generate_capital,
321            evaluators=[accuracy_check]
322        )
323
324        print(f"Processed {len(result.item_results)} items")
325        print(result.format())  # Human-readable summary
326
327        # Access individual results
328        for item_result in result.item_results:
329            print(f"Input: {item_result.item}")
330            print(f"Output: {item_result.output}")
331            print(f"Scores: {item_result.evaluations}")
332        ```
333
334        Usage with Langfuse datasets:
335        ```python
336        dataset = langfuse.get_dataset("qa-eval-set")
337        result = dataset.run_experiment(
338            name="GPT-4 QA Evaluation",
339            task=answer_question,
340            evaluators=[relevance_check, accuracy_check]
341        )
342
343        # View in Langfuse UI
344        if result.dataset_run_url:
345            print(f"View detailed results: {result.dataset_run_url}")
346        ```
347
348        Formatted output:
349        ```python
350        # Get summary view
351        summary = result.format()
352        print(summary)
353
354        # Get detailed view with individual items
355        detailed = result.format(include_item_results=True)
356        with open("experiment_report.txt", "w") as f:
357            f.write(detailed)
358        ```
359    """
360
361    def __init__(
362        self,
363        *,
364        name: str,
365        run_name: str,
366        description: Optional[str],
367        item_results: List[ExperimentItemResult],
368        run_evaluations: List[Evaluation],
369        dataset_run_id: Optional[str] = None,
370        dataset_run_url: Optional[str] = None,
371    ):
372        """Initialize an ExperimentResult with the provided data.
373
374        Args:
375            name: The name of the experiment.
376            run_name: The current experiment run name.
377            description: Optional description of the experiment.
378            item_results: List of results from processing individual dataset items.
379            run_evaluations: List of aggregate evaluation results for the entire run.
380            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
381            dataset_run_url: Optional URL to view results in Langfuse UI.
382        """
383        self.name = name
384        self.run_name = run_name
385        self.description = description
386        self.item_results = item_results
387        self.run_evaluations = run_evaluations
388        self.dataset_run_id = dataset_run_id
389        self.dataset_run_url = dataset_run_url
390
391    def format(self, *, include_item_results: bool = False) -> str:
392        r"""Format the experiment result for human-readable display.
393
394        Converts the experiment result into a nicely formatted string suitable for
395        console output, logging, or reporting. The output includes experiment overview,
396        aggregate statistics, and optionally individual item details.
397
398        This method provides a comprehensive view of experiment performance including:
399        - Experiment metadata (name, description, item count)
400        - List of evaluation metrics used across items
401        - Average scores computed across all processed items
402        - Run-level evaluation results (aggregate metrics)
403        - Links to view detailed results in Langfuse UI (when available)
404        - Individual item details (when requested)
405
406        Args:
407            include_item_results: Whether to include detailed results for each individual
408                item in the formatted output. When False (default), only shows aggregate
409                statistics and summary information. When True, includes input/output/scores
410                for every processed item, making the output significantly longer but more
411                detailed for debugging and analysis purposes.
412
413        Returns:
414            A formatted multi-line string containing:
415            - Experiment name and description (if provided)
416            - Total number of items successfully processed
417            - List of all evaluation metrics that were applied
418            - Average scores across all items for each numeric metric
419            - Run-level evaluation results with comments
420            - Dataset run URL for viewing in Langfuse UI (if applicable)
421            - Individual item details including inputs, outputs, and scores (if requested)
422
423        Examples:
424            Basic usage showing aggregate results only:
425            ```python
426            result = langfuse.run_experiment(
427                name="Capital Cities",
428                data=dataset,
429                task=generate_capital,
430                evaluators=[accuracy_evaluator]
431            )
432
433            print(result.format())
434            # Output:
435            # ──────────────────────────────────────────────────
436            # 📊 Capital Cities
437            # 100 items
438            # Evaluations:
439            #   • accuracy
440            # Average Scores:
441            #   • accuracy: 0.850
442            ```
443
444            Detailed output including all individual item results:
445            ```python
446            detailed_report = result.format(include_item_results=True)
447            print(detailed_report)
448            # Output includes each item:
449            # 1. Item 1:
450            #    Input:    What is the capital of France?
451            #    Expected: Paris
452            #    Actual:   The capital of France is Paris.
453            #    Scores:
454            #      • accuracy: 1.000
455            #        💭 Correct answer found
456            # [... continues for all items ...]
457            ```
458
459            Saving formatted results to file for reporting:
460            ```python
461            with open("experiment_report.txt", "w") as f:
462                f.write(result.format(include_item_results=True))
463
464            # Or create summary report
465            summary = result.format()  # Aggregate view only
466            print(f"Experiment Summary:\\n{summary}")
467            ```
468
469            Integration with logging systems:
470            ```python
471            import logging
472            logger = logging.getLogger("experiments")
473
474            # Log summary after experiment
475            logger.info(f"Experiment completed:\\n{result.format()}")
476
477            # Log detailed results for failed experiments
478            if any(run_eval.value < threshold for run_eval in result.run_evaluations):
479                logger.warning(f"Poor performance detected:\\n{result.format(include_item_results=True)}")
480            ```
481        """
482        if not self.item_results:
483            return "No experiment results to display."
484
485        output = ""
486
487        # Individual results section
488        if include_item_results:
489            for i, result in enumerate(self.item_results):
490                output += f"\\n{i + 1}. Item {i + 1}:\\n"
491
492                # Extract and display input
493                item_input = None
494                if isinstance(result.item, dict):
495                    item_input = result.item.get("input")
496                elif hasattr(result.item, "input"):
497                    item_input = result.item.input
498
499                if item_input is not None:
500                    output += f"   Input:    {_format_value(item_input)}\\n"
501
502                # Extract and display expected output
503                expected_output = None
504                if isinstance(result.item, dict):
505                    expected_output = result.item.get("expected_output")
506                elif hasattr(result.item, "expected_output"):
507                    expected_output = result.item.expected_output
508
509                if expected_output is not None:
510                    output += f"   Expected: {_format_value(expected_output)}\\n"
511                output += f"   Actual:   {_format_value(result.output)}\\n"
512
513                # Display evaluation scores
514                if result.evaluations:
515                    output += "   Scores:\\n"
516                    for evaluation in result.evaluations:
517                        score = evaluation.value
518                        if isinstance(score, (int, float)):
519                            score = f"{score:.3f}"
520                        output += f"     • {evaluation.name}: {score}"
521                        if evaluation.comment:
522                            output += f"\\n       💭 {evaluation.comment}"
523                        output += "\\n"
524
525                # Display trace link if available
526                if result.trace_id:
527                    output += f"\\n   Trace ID: {result.trace_id}\\n"
528        else:
529            output += f"Individual Results: Hidden ({len(self.item_results)} items)\\n"
530            output += "💡 Set include_item_results=True to view them\\n"
531
532        # Experiment overview section
533        output += f"\\n{'─' * 50}\\n"
534        output += f"🧪 Experiment: {self.name}"
535        output += f"\n📋 Run name: {self.run_name}"
536        if self.description:
537            output += f" - {self.description}"
538
539        output += f"\\n{len(self.item_results)} items"
540
541        # Collect unique evaluation names across all items
542        evaluation_names = set()
543        for result in self.item_results:
544            for evaluation in result.evaluations:
545                evaluation_names.add(evaluation.name)
546
547        if evaluation_names:
548            output += "\\nEvaluations:"
549            for eval_name in evaluation_names:
550                output += f"\\n  • {eval_name}"
551            output += "\\n"
552
553        # Calculate and display average scores
554        if evaluation_names:
555            output += "\\nAverage Scores:"
556            for eval_name in evaluation_names:
557                scores = []
558                for result in self.item_results:
559                    for evaluation in result.evaluations:
560                        if evaluation.name == eval_name and isinstance(
561                            evaluation.value, (int, float)
562                        ):
563                            scores.append(evaluation.value)
564
565                if scores:
566                    avg = sum(scores) / len(scores)
567                    output += f"\\n  • {eval_name}: {avg:.3f}"
568            output += "\\n"
569
570        # Display run-level evaluations
571        if self.run_evaluations:
572            output += "\\nRun Evaluations:"
573            for run_eval in self.run_evaluations:
574                score = run_eval.value
575                if isinstance(score, (int, float)):
576                    score = f"{score:.3f}"
577                output += f"\\n  • {run_eval.name}: {score}"
578                if run_eval.comment:
579                    output += f"\\n    💭 {run_eval.comment}"
580            output += "\\n"
581
582        # Add dataset run URL if available
583        if self.dataset_run_url:
584            output += f"\\n🔗 Dataset Run:\\n   {self.dataset_run_url}"
585
586        return output

Complete result structure for experiment execution.

This class encapsulates the complete results of running an experiment on a dataset, including individual item results, aggregate run-level evaluations, and metadata about the experiment execution.

Attributes:
  • name: The name of the experiment as specified during execution.
  • run_name: The name of the current experiment run.
  • description: Optional description of the experiment's purpose or methodology.
  • item_results: List of results from processing each individual dataset item, containing the original item, task output, evaluations, and trace information.
  • run_evaluations: List of aggregate evaluation results computed across all items, such as average scores, statistical summaries, or cross-item analyses.
  • dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
  • dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
Examples:

Basic usage with local dataset:

result = langfuse.run_experiment(
    name="Capital Cities Test",
    data=local_data,
    task=generate_capital,
    evaluators=[accuracy_check]
)

print(f"Processed {len(result.item_results)} items")
print(result.format())  # Human-readable summary

# Access individual results
for item_result in result.item_results:
    print(f"Input: {item_result.item}")
    print(f"Output: {item_result.output}")
    print(f"Scores: {item_result.evaluations}")

Usage with Langfuse datasets:

dataset = langfuse.get_dataset("qa-eval-set")
result = dataset.run_experiment(
    name="GPT-4 QA Evaluation",
    task=answer_question,
    evaluators=[relevance_check, accuracy_check]
)

# View in Langfuse UI
if result.dataset_run_url:
    print(f"View detailed results: {result.dataset_run_url}")

Formatted output:

# Get summary view
summary = result.format()
print(summary)

# Get detailed view with individual items
detailed = result.format(include_item_results=True)
with open("experiment_report.txt", "w") as f:
    f.write(detailed)
ExperimentResult( *, name: str, run_name: str, description: Optional[str], item_results: List[ExperimentItemResult], run_evaluations: List[Evaluation], dataset_run_id: Optional[str] = None, dataset_run_url: Optional[str] = None)
361    def __init__(
362        self,
363        *,
364        name: str,
365        run_name: str,
366        description: Optional[str],
367        item_results: List[ExperimentItemResult],
368        run_evaluations: List[Evaluation],
369        dataset_run_id: Optional[str] = None,
370        dataset_run_url: Optional[str] = None,
371    ):
372        """Initialize an ExperimentResult with the provided data.
373
374        Args:
375            name: The name of the experiment.
376            run_name: The current experiment run name.
377            description: Optional description of the experiment.
378            item_results: List of results from processing individual dataset items.
379            run_evaluations: List of aggregate evaluation results for the entire run.
380            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
381            dataset_run_url: Optional URL to view results in Langfuse UI.
382        """
383        self.name = name
384        self.run_name = run_name
385        self.description = description
386        self.item_results = item_results
387        self.run_evaluations = run_evaluations
388        self.dataset_run_id = dataset_run_id
389        self.dataset_run_url = dataset_run_url

Initialize an ExperimentResult with the provided data.

Arguments:
  • name: The name of the experiment.
  • run_name: The current experiment run name.
  • description: Optional description of the experiment.
  • item_results: List of results from processing individual dataset items.
  • run_evaluations: List of aggregate evaluation results for the entire run.
  • dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
  • dataset_run_url: Optional URL to view results in Langfuse UI.
name
run_name
description
item_results
run_evaluations
dataset_run_id
dataset_run_url
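
Before the format() reference below, a small sketch of aggregating these attributes programmatically, averaging every numeric item-level metric into a plain dict; the helper name is illustrative and `result` is assumed to come from a prior experiment run:

```python
from collections import defaultdict
from typing import Dict, List

def average_scores(result: "ExperimentResult") -> Dict[str, float]:
    """Average every numeric item-level metric across an experiment result."""
    collected: Dict[str, List[float]] = defaultdict(list)
    for item_result in result.item_results:
        for evaluation in item_result.evaluations:
            if isinstance(evaluation.value, (int, float)):
                collected[evaluation.name].append(float(evaluation.value))
    return {name: sum(values) / len(values) for name, values in collected.items()}

# `result` is assumed to be an ExperimentResult from run_experiment(...)
print(average_scores(result))  # e.g. {"accuracy": 0.85}
if result.dataset_run_url:
    print(f"View details: {result.dataset_run_url}")
```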
def format(self, *, include_item_results: bool = False) -> str:
391    def format(self, *, include_item_results: bool = False) -> str:
392        r"""Format the experiment result for human-readable display.
393
394        Converts the experiment result into a nicely formatted string suitable for
395        console output, logging, or reporting. The output includes experiment overview,
396        aggregate statistics, and optionally individual item details.
397
398        This method provides a comprehensive view of experiment performance including:
399        - Experiment metadata (name, description, item count)
400        - List of evaluation metrics used across items
401        - Average scores computed across all processed items
402        - Run-level evaluation results (aggregate metrics)
403        - Links to view detailed results in Langfuse UI (when available)
404        - Individual item details (when requested)
405
406        Args:
407            include_item_results: Whether to include detailed results for each individual
408                item in the formatted output. When False (default), only shows aggregate
409                statistics and summary information. When True, includes input/output/scores
410                for every processed item, making the output significantly longer but more
411                detailed for debugging and analysis purposes.
412
413        Returns:
414            A formatted multi-line string containing:
415            - Experiment name and description (if provided)
416            - Total number of items successfully processed
417            - List of all evaluation metrics that were applied
418            - Average scores across all items for each numeric metric
419            - Run-level evaluation results with comments
420            - Dataset run URL for viewing in Langfuse UI (if applicable)
421            - Individual item details including inputs, outputs, and scores (if requested)
422
423        Examples:
424            Basic usage showing aggregate results only:
425            ```python
426            result = langfuse.run_experiment(
427                name="Capital Cities",
428                data=dataset,
429                task=generate_capital,
430                evaluators=[accuracy_evaluator]
431            )
432
433            print(result.format())
434            # Output:
435            # ──────────────────────────────────────────────────
436            # 📊 Capital Cities
437            # 100 items
438            # Evaluations:
439            #   • accuracy
440            # Average Scores:
441            #   • accuracy: 0.850
442            ```
443
444            Detailed output including all individual item results:
445            ```python
446            detailed_report = result.format(include_item_results=True)
447            print(detailed_report)
448            # Output includes each item:
449            # 1. Item 1:
450            #    Input:    What is the capital of France?
451            #    Expected: Paris
452            #    Actual:   The capital of France is Paris.
453            #    Scores:
454            #      • accuracy: 1.000
455            #        💭 Correct answer found
456            # [... continues for all items ...]
457            ```
458
459            Saving formatted results to file for reporting:
460            ```python
461            with open("experiment_report.txt", "w") as f:
462                f.write(result.format(include_item_results=True))
463
464            # Or create summary report
465            summary = result.format()  # Aggregate view only
466            print(f"Experiment Summary:\\n{summary}")
467            ```
468
469            Integration with logging systems:
470            ```python
471            import logging
472            logger = logging.getLogger("experiments")
473
474            # Log summary after experiment
475            logger.info(f"Experiment completed:\\n{result.format()}")
476
477            # Log detailed results for failed experiments
478            if any(run_eval.value < threshold for run_eval in result.run_evaluations):
479                logger.warning(f"Poor performance detected:\\n{result.format(include_item_results=True)}")
480            ```
481        """
482        if not self.item_results:
483            return "No experiment results to display."
484
485        output = ""
486
487        # Individual results section
488        if include_item_results:
489            for i, result in enumerate(self.item_results):
490                output += f"\\n{i + 1}. Item {i + 1}:\\n"
491
492                # Extract and display input
493                item_input = None
494                if isinstance(result.item, dict):
495                    item_input = result.item.get("input")
496                elif hasattr(result.item, "input"):
497                    item_input = result.item.input
498
499                if item_input is not None:
500                    output += f"   Input:    {_format_value(item_input)}\\n"
501
502                # Extract and display expected output
503                expected_output = None
504                if isinstance(result.item, dict):
505                    expected_output = result.item.get("expected_output")
506                elif hasattr(result.item, "expected_output"):
507                    expected_output = result.item.expected_output
508
509                if expected_output is not None:
510                    output += f"   Expected: {_format_value(expected_output)}\\n"
511                output += f"   Actual:   {_format_value(result.output)}\\n"
512
513                # Display evaluation scores
514                if result.evaluations:
515                    output += "   Scores:\\n"
516                    for evaluation in result.evaluations:
517                        score = evaluation.value
518                        if isinstance(score, (int, float)):
519                            score = f"{score:.3f}"
520                        output += f"     • {evaluation.name}: {score}"
521                        if evaluation.comment:
522                            output += f"\\n       💭 {evaluation.comment}"
523                        output += "\\n"
524
525                # Display trace link if available
526                if result.trace_id:
527                    output += f"\\n   Trace ID: {result.trace_id}\\n"
528        else:
529            output += f"Individual Results: Hidden ({len(self.item_results)} items)\\n"
530            output += "💡 Set include_item_results=True to view them\\n"
531
532        # Experiment overview section
533        output += f"\\n{'─' * 50}\\n"
534        output += f"🧪 Experiment: {self.name}"
535        output += f"\n📋 Run name: {self.run_name}"
536        if self.description:
537            output += f" - {self.description}"
538
539        output += f"\\n{len(self.item_results)} items"
540
541        # Collect unique evaluation names across all items
542        evaluation_names = set()
543        for result in self.item_results:
544            for evaluation in result.evaluations:
545                evaluation_names.add(evaluation.name)
546
547        if evaluation_names:
548            output += "\\nEvaluations:"
549            for eval_name in evaluation_names:
550                output += f"\\n  • {eval_name}"
551            output += "\\n"
552
553        # Calculate and display average scores
554        if evaluation_names:
555            output += "\\nAverage Scores:"
556            for eval_name in evaluation_names:
557                scores = []
558                for result in self.item_results:
559                    for evaluation in result.evaluations:
560                        if evaluation.name == eval_name and isinstance(
561                            evaluation.value, (int, float)
562                        ):
563                            scores.append(evaluation.value)
564
565                if scores:
566                    avg = sum(scores) / len(scores)
567                    output += f"\\n  • {eval_name}: {avg:.3f}"
568            output += "\\n"
569
570        # Display run-level evaluations
571        if self.run_evaluations:
572            output += "\\nRun Evaluations:"
573            for run_eval in self.run_evaluations:
574                score = run_eval.value
575                if isinstance(score, (int, float)):
576                    score = f"{score:.3f}"
577                output += f"\\n  • {run_eval.name}: {score}"
578                if run_eval.comment:
579                    output += f"\\n    💭 {run_eval.comment}"
580            output += "\\n"
581
582        # Add dataset run URL if available
583        if self.dataset_run_url:
584            output += f"\\n🔗 Dataset Run:\\n   {self.dataset_run_url}"
585
586        return output

Format the experiment result for human-readable display.

Converts the experiment result into a nicely formatted string suitable for console output, logging, or reporting. The output includes experiment overview, aggregate statistics, and optionally individual item details.

This method provides a comprehensive view of experiment performance including:

  • Experiment metadata (name, description, item count)
  • List of evaluation metrics used across items
  • Average scores computed across all processed items
  • Run-level evaluation results (aggregate metrics)
  • Links to view detailed results in Langfuse UI (when available)
  • Individual item details (when requested)
Arguments:
  • include_item_results: Whether to include detailed results for each individual item in the formatted output. When False (default), only shows aggregate statistics and summary information. When True, includes input/output/scores for every processed item, making the output significantly longer but more detailed for debugging and analysis purposes.
Returns:

A formatted multi-line string containing:

  • Experiment name and description (if provided)
  • Total number of items successfully processed
  • List of all evaluation metrics that were applied
  • Average scores across all items for each numeric metric
  • Run-level evaluation results with comments
  • Dataset run URL for viewing in Langfuse UI (if applicable)
  • Individual item details including inputs, outputs, and scores (if requested)
Examples:

Basic usage showing aggregate results only:

result = langfuse.run_experiment(
    name="Capital Cities",
    data=dataset,
    task=generate_capital,
    evaluators=[accuracy_evaluator]
)

print(result.format())
# Output:
# ──────────────────────────────────────────────────
# 📊 Capital Cities
# 100 items
# Evaluations:
#   • accuracy
# Average Scores:
#   • accuracy: 0.850

Detailed output including all individual item results:

detailed_report = result.format(include_item_results=True)
print(detailed_report)
# Output includes each item:
# 1. Item 1:
#    Input:    What is the capital of France?
#    Expected: Paris
#    Actual:   The capital of France is Paris.
#    Scores:
#      • accuracy: 1.000
#        💭 Correct answer found
# [... continues for all items ...]

Saving formatted results to file for reporting:

with open("experiment_report.txt", "w") as f:
    f.write(result.format(include_item_results=True))

# Or create summary report
summary = result.format()  # Aggregate view only
print(f"Experiment Summary:\\n{summary}")

Integration with logging systems:

import logging
logger = logging.getLogger("experiments")

# Log summary after experiment
logger.info(f"Experiment completed:\\n{result.format()}")

# Log detailed results for failed experiments
if any(run_eval.value < threshold for run_eval in result.run_evaluations):
    logger.warning(f"Poor performance detected:\\n{result.format(include_item_results=True)}")
class TaskFunction(typing.Protocol):
589class TaskFunction(Protocol):
590    """Protocol defining the interface for experiment task functions.
591
592    Task functions are the core processing functions that operate on each item
593    in an experiment dataset. They receive an experiment item as input and
594    produce some output that will be evaluated.
595
596    Task functions must:
597    - Accept 'item' as a keyword argument
598    - Return any type of output (will be passed to evaluators)
599    - Can be either synchronous or asynchronous
600    - Should handle their own errors gracefully (exceptions will be logged)
601    """
602
603    def __call__(
604        self,
605        *,
606        item: ExperimentItem,
607        **kwargs: Dict[str, Any],
608    ) -> Union[Any, Awaitable[Any]]:
609        """Execute the task on an experiment item.
610
611        This method defines the core processing logic for each item in your experiment.
612        The implementation should focus on the specific task you want to evaluate,
613        such as text generation, classification, summarization, etc.
614
615        Args:
616            item: The experiment item to process. Can be either:
617                - Dict with keys like 'input', 'expected_output', 'metadata'
618                - Langfuse DatasetItem object with .input, .expected_output attributes
619            **kwargs: Additional keyword arguments that may be passed by the framework
620
621        Returns:
622            Any: The output of processing the item. This output will be:
623            - Stored in the experiment results
624            - Passed to all item-level evaluators for assessment
625            - Traced automatically in Langfuse for observability
626
627            Can return either a direct value or an awaitable (async) result.
628
629        Examples:
630            Simple synchronous task:
631            ```python
632            def my_task(*, item, **kwargs):
633                prompt = f"Summarize: {item['input']}"
634                return my_llm_client.generate(prompt)
635            ```
636
637            Async task with error handling:
638            ```python
639            async def my_async_task(*, item, **kwargs):
640                try:
641                    response = await openai_client.chat.completions.create(
642                        model="gpt-4",
643                        messages=[{"role": "user", "content": item["input"]}]
644                    )
645                    return response.choices[0].message.content
646                except Exception as e:
647                    # Log error and return fallback
648                    print(f"Task failed for item {item}: {e}")
649                    return "Error: Could not process item"
650            ```
651
652            Task using dataset item attributes:
653            ```python
654            def classification_task(*, item, **kwargs):
655                # Works with both dict items and DatasetItem objects
656                text = item["input"] if isinstance(item, dict) else item.input
657                return classify_text(text)
658            ```
659        """
660        ...

Protocol defining the interface for experiment task functions.

Task functions are the core processing functions that operate on each item in an experiment dataset. They receive an experiment item as input and produce some output that will be evaluated.

Task functions must (see the sketch below):

  • Accept 'item' as a keyword argument
  • Return output of any type (it is passed to the evaluators)
  • Be either synchronous or asynchronous
  • Handle their own errors gracefully (exceptions are logged)
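
A minimal conforming sketch; `call_model` stands in for whatever model client you use and is a placeholder, not part of the SDK:

```python
from typing import Any

def call_model(prompt: str) -> str:
    """Placeholder for your actual model or LLM client call."""
    return f"(model output for: {prompt})"

def summarize_task(*, item: Any, **kwargs: Any) -> str:
    """Satisfies TaskFunction: keyword-only `item`, tolerant of extra kwargs."""
    # Works for both dict items and DatasetItemClient objects.
    text = item["input"] if isinstance(item, dict) else item.input
    return call_model(f"Summarize: {text}")

# Passed to an experiment run, e.g.:
# langfuse.run_experiment(name="Summaries", data=data, task=summarize_task)
```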
class EvaluatorFunction(typing.Protocol):
663class EvaluatorFunction(Protocol):
664    """Protocol defining the interface for item-level evaluator functions.
665
666    Item-level evaluators assess the quality, correctness, or other properties
667    of individual task outputs. They receive the input, output, expected output,
668    and metadata for each item and return evaluation metrics.
669
670    Evaluators should:
671    - Accept input, output, expected_output, and metadata as keyword arguments
672    - Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields
673    - Be deterministic when possible for reproducible results
674    - Handle edge cases gracefully (missing expected output, malformed data, etc.)
675    - Can be either synchronous or asynchronous
676    """
677
678    def __call__(
679        self,
680        *,
681        input: Any,
682        output: Any,
683        expected_output: Any,
684        metadata: Optional[Dict[str, Any]],
685        **kwargs: Dict[str, Any],
686    ) -> Union[
687        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
688    ]:
689        r"""Evaluate a task output for quality, correctness, or other metrics.
690
691        This method should implement specific evaluation logic such as accuracy checking,
692        similarity measurement, toxicity detection, fluency assessment, etc.
693
694        Args:
695            input: The original input that was passed to the task function.
696                This is typically the item['input'] or item.input value.
697            output: The output produced by the task function for this input.
698                This is the direct return value from your task function.
699            expected_output: The expected/ground truth output for comparison.
700                May be None if not available in the dataset. Evaluators should
701                handle this case appropriately.
702            metadata: Optional metadata from the experiment item that might
703                contain additional context for evaluation (categories, difficulty, etc.)
704            **kwargs: Additional keyword arguments that may be passed by the framework
705
706        Returns:
707            Evaluation results in one of these formats:
708            - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."}
709            - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}]
710            - Awaitable returning either of the above (for async evaluators)
711
712            Each Evaluation dict should contain:
713            - name (str): Unique identifier for this evaluation metric
714            - value (int|float|str|bool): The evaluation score or result
715            - comment (str, optional): Human-readable explanation of the result
716            - metadata (dict, optional): Additional structured data about the evaluation
717
718        Examples:
719            Simple accuracy evaluator:
720            ```python
721            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
722                if expected_output is None:
723                    return {"name": "accuracy", "value": None, "comment": "No expected output"}
724
725                is_correct = output.strip().lower() == expected_output.strip().lower()
726                return {
727                    "name": "accuracy",
728                    "value": 1.0 if is_correct else 0.0,
729                    "comment": "Exact match" if is_correct else "No match"
730                }
731            ```
732
733            Multi-metric evaluator:
734            ```python
735            def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
736                results = []
737
738                # Length check
739                results.append({
740                    "name": "output_length",
741                    "value": len(output),
742                    "comment": f"Output contains {len(output)} characters"
743                })
744
745                # Sentiment analysis
746                sentiment_score = analyze_sentiment(output)
747                results.append({
748                    "name": "sentiment",
749                    "value": sentiment_score,
750                    "comment": f"Sentiment score: {sentiment_score:.2f}"
751                })
752
753                return results
754            ```
755
756            Async evaluator using external API:
757            ```python
758            async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs):
759                prompt = f"Rate the quality of this response on a scale of 1-10:\n"
760                prompt += f"Question: {input}\nResponse: {output}"
761
762                response = await openai_client.chat.completions.create(
763                    model="gpt-4",
764                    messages=[{"role": "user", "content": prompt}]
765                )
766
767                try:
768                    score = float(response.choices[0].message.content.strip())
769                    return {
770                        "name": "llm_judge_quality",
771                        "value": score,
772                        "comment": f"LLM judge rated this {score}/10"
773                    }
774                except ValueError:
775                    return {
776                        "name": "llm_judge_quality",
777                        "value": None,
778                        "comment": "Could not parse LLM judge score"
779                    }
780            ```
781
782            Context-aware evaluator:
783            ```python
784            def context_evaluator(*, input, output, metadata=None, **kwargs):
785                # Use metadata for context-specific evaluation
786                difficulty = metadata.get("difficulty", "medium") if metadata else "medium"
787
788                # Adjust expectations based on difficulty
789                min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty]
790
791                meets_requirement = len(output) >= min_length
792                return {
793                    "name": f"meets_{difficulty}_requirement",
794                    "value": meets_requirement,
795                    "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement"
796                }
797            ```
798        """
799        ...

Protocol defining the interface for item-level evaluator functions.

Item-level evaluators assess the quality, correctness, or other properties of individual task outputs. They receive the input, output, expected output, and metadata for each item and return evaluation metrics.

Evaluators should (see the sketch below):

  • Accept input, output, expected_output, and metadata as keyword arguments
  • Return one or more Evaluation results with 'name', 'value', and optional 'comment' and 'metadata' fields
  • Be deterministic when possible for reproducible results
  • Handle edge cases gracefully (missing expected output, malformed data, etc.)
  • Be either synchronous or asynchronous
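
A minimal conforming sketch that returns an Evaluation object (the docstring examples above also show plain dicts with the same fields); the exact-match logic is illustrative:

```python
from langfuse import Evaluation

def exact_match_evaluator(*, input, output, expected_output=None, metadata=None, **kwargs):
    """Satisfies EvaluatorFunction: keyword-only arguments, Evaluation result."""
    if expected_output is None:
        # Degrade gracefully when no ground truth is available.
        return Evaluation(name="exact_match", value=None, comment="No expected output")
    is_match = str(output).strip().lower() == str(expected_output).strip().lower()
    return Evaluation(
        name="exact_match",
        value=1.0 if is_match else 0.0,
        comment="Exact match" if is_match else "No match",
    )
```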
class RunEvaluatorFunction(typing.Protocol):
802class RunEvaluatorFunction(Protocol):
803    """Protocol defining the interface for run-level evaluator functions.
804
805    Run-level evaluators assess aggregate properties of the entire experiment run,
806    computing metrics that span across all items rather than individual outputs.
807    They receive the complete results from all processed items and can compute
808    statistics like averages, distributions, correlations, or other aggregate metrics.
809
810    Run evaluators should:
811    - Accept item_results as a keyword argument containing all item results
812    - Return Evaluation dict(s) with aggregate metrics
813    - Handle cases where some items may have failed processing
814    - Compute meaningful statistics across the dataset
815    - Can be either synchronous or asynchronous
816    """
817
818    def __call__(
819        self,
820        *,
821        item_results: List[ExperimentItemResult],
822        **kwargs: Dict[str, Any],
823    ) -> Union[
824        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
825    ]:
826        r"""Evaluate the entire experiment run with aggregate metrics.
827
828        This method should implement aggregate evaluation logic such as computing
829        averages, calculating distributions, finding correlations, detecting patterns
830        across items, or performing statistical analysis on the experiment results.
831
832        Args:
833            item_results: List of results from all successfully processed experiment items.
834                Each item result contains:
835                - item: The original experiment item
836                - output: The task function's output for this item
837                - evaluations: List of item-level evaluation results
838                - trace_id: Langfuse trace ID for this execution
839                - dataset_run_id: Dataset run ID (if using Langfuse datasets)
840
841                Note: This list only includes items that were successfully processed.
842                Failed items are excluded but logged separately.
843            **kwargs: Additional keyword arguments that may be passed by the framework
844
845        Returns:
846            Evaluation results in one of these formats:
847            - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."}
848            - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}]
849            - Awaitable returning either of the above (for async evaluators)
850
851            Each Evaluation dict should contain:
852            - name (str): Unique identifier for this run-level metric
853            - value (int|float|str|bool): The aggregate evaluation result
854            - comment (str, optional): Human-readable explanation of the metric
855            - metadata (dict, optional): Additional structured data about the evaluation
856
857        Examples:
858            Average accuracy calculator:
859            ```python
860            def average_accuracy(*, item_results, **kwargs):
861                if not item_results:
862                    return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"}
863
864                accuracy_values = []
865                for result in item_results:
866                    for evaluation in result.evaluations:
867                        if evaluation.name == "accuracy":
868                            accuracy_values.append(evaluation.value)
869
870                if not accuracy_values:
871                    return {"name": "avg_accuracy", "value": None, "comment": "No accuracy evaluations found"}
872
873                avg = sum(accuracy_values) / len(accuracy_values)
874                return {
875                    "name": "avg_accuracy",
876                    "value": avg,
877                    "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}"
878                }
879            ```
880
881            Multiple aggregate metrics:
882            ```python
883            def statistical_summary(*, item_results, **kwargs):
884                if not item_results:
885                    return []
886
887                results = []
888
889                # Calculate output length statistics
890                lengths = [len(str(result.output)) for result in item_results]
891                results.extend([
892                    {"name": "avg_output_length", "value": sum(lengths) / len(lengths)},
893                    {"name": "min_output_length", "value": min(lengths)},
894                    {"name": "max_output_length", "value": max(lengths)}
895                ])
896
897                # Success rate
898                total_items = len(item_results)  # Only successful items are included
899                results.append({
900                    "name": "processing_success_rate",
901                    "value": 1.0,  # All items in item_results succeeded
902                    "comment": f"Successfully processed {total_items} items"
903                })
904
905                return results
906            ```
907
908            Async run evaluator with external analysis:
909            ```python
910            async def llm_batch_analysis(*, item_results, **kwargs):
911                # Prepare batch analysis prompt
912                outputs = [result.output for result in item_results]
913                prompt = f"Analyze these {len(outputs)} outputs for common themes:\n"
914                prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs))
915
916                response = await openai_client.chat.completions.create(
917                    model="gpt-4",
918                    messages=[{"role": "user", "content": prompt}]
919                )
920
921                return {
922                    "name": "thematic_analysis",
923                    "value": response.choices[0].message.content,
924                    "comment": f"LLM analysis of {len(outputs)} outputs"
925                }
926            ```
927
928            Performance distribution analysis:
929            ```python
930            def performance_distribution(*, item_results, **kwargs):
931                # Extract all evaluation scores
932                all_scores = []
933                score_by_metric = {}
934
935                for result in item_results:
936                    for evaluation in result.evaluations:
937                        metric_name = evaluation.name
938                        value = evaluation.value
939
940                        if isinstance(value, (int, float)):
941                            all_scores.append(value)
942                            if metric_name not in score_by_metric:
943                                score_by_metric[metric_name] = []
944                            score_by_metric[metric_name].append(value)
945
946                results = []
947
948                # Overall score distribution
949                if all_scores:
950                    import statistics
951                    results.append({
952                        "name": "score_std_dev",
953                        "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0,
954                        "comment": "Standard deviation across all numeric scores"
955                    })
956
957                # Per-metric statistics
958                for metric, scores in score_by_metric.items():
959                    if len(scores) > 1:
960                        results.append({
961                            "name": f"{metric}_variance",
962                            "value": statistics.variance(scores),
963                            "comment": f"Variance in {metric} across {len(scores)} items"
964                        })
965
966                return results
967            ```
968        """
969        ...

Protocol defining the interface for run-level evaluator functions.

Run-level evaluators assess aggregate properties of the entire experiment run, computing metrics that span across all items rather than individual outputs. They receive the complete results from all processed items and can compute statistics like averages, distributions, correlations, or other aggregate metrics.

Run evaluators should:

  • Accept item_results as a keyword argument containing all item results
  • Return Evaluation dict(s) with aggregate metrics
  • Handle cases where some items may have failed processing
  • Compute meaningful statistics across the dataset
  • Be either synchronous or asynchronous
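For quick reference outside the source listing above, here is a minimal sketch of a conforming run evaluator. The metric name `pass_rate` and the item-level `passed` evaluation it looks for are illustrative assumptions, not part of this module.

```python
def pass_rate(*, item_results, **kwargs):
    # Illustrative run-level evaluator: fraction of items whose item-level
    # "passed" evaluation is truthy (assumes such an evaluator was configured).
    if not item_results:
        return {"name": "pass_rate", "value": 0.0, "comment": "No results"}

    passed = sum(
        1
        for result in item_results
        if any(e.name == "passed" and e.value for e in result.evaluations)
    )
    return {
        "name": "pass_rate",
        "value": passed / len(item_results),
        "comment": f"{passed}/{len(item_results)} items passed",
    }
```

Like the docstring examples above, it accepts `item_results` as a keyword argument and returns a single Evaluation dict.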
def create_evaluator_from_autoevals( autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]]) -> EvaluatorFunction:
1017def create_evaluator_from_autoevals(
1018    autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]]
1019) -> EvaluatorFunction:
1020    """Create a Langfuse evaluator from an autoevals evaluator.
1021
1022    Args:
1023        autoevals_evaluator: An autoevals evaluator instance
1024        **kwargs: Additional arguments passed to the evaluator
1025
1026    Returns:
1027        A Langfuse-compatible evaluator function
1028    """
1029
1030    def langfuse_evaluator(
1031        *,
1032        input: Any,
1033        output: Any,
1034        expected_output: Any,
1035        metadata: Optional[Dict[str, Any]],
1036        **langfuse_kwargs: Dict[str, Any],
1037    ) -> Evaluation:
1038        evaluation = autoevals_evaluator(
1039            input=input, output=output, expected=expected_output, **kwargs
1040        )
1041
1042        return Evaluation(
1043            name=evaluation.name,
1044            value=evaluation.score,
1045            comment=(evaluation.metadata or {}).get("comment"),
1046            metadata=evaluation.metadata,
1047        )
1048
1049    return langfuse_evaluator

Create a Langfuse evaluator from an autoevals evaluator.

Arguments:
  • autoevals_evaluator: An autoevals evaluator instance
  • **kwargs: Additional arguments passed to the evaluator
Returns:
  A Langfuse-compatible evaluator function
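As a hedged usage sketch (the `autoevals` import path and the `Levenshtein` scorer are assumptions about the external autoevals package and may differ by version):

```python
from autoevals.string import Levenshtein  # assumed import path; adjust to your autoevals version

from langfuse.experiment import create_evaluator_from_autoevals

# Wrap the autoevals scorer so it conforms to the EvaluatorFunction protocol.
levenshtein_evaluator = create_evaluator_from_autoevals(Levenshtein())

# The wrapper can then be passed wherever item-level evaluators are accepted,
# or called directly:
evaluation = levenshtein_evaluator(
    input="What is the capital of France?",
    output="Paris",
    expected_output="Paris",
    metadata=None,
)
print(evaluation)  # Evaluation with name/value/comment/metadata fields
```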