langfuse.experiment

Langfuse experiment functionality for running and evaluating tasks on datasets.

This module provides the core experiment functionality for the Langfuse Python SDK, allowing users to run experiments on datasets with automatic tracing, evaluation, and result formatting.
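
A minimal usage sketch, assuming a configured Langfuse client (credentials read from environment variables) and based on the examples documented below; the task and evaluator bodies are placeholders:

from langfuse import Evaluation, Langfuse

langfuse = Langfuse()  # assumes LANGFUSE_* credentials are set in the environment

def my_task(*, item, **kwargs):
    # Placeholder: replace with your model or application call
    return f"Answer to: {item['input']}"

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output is None:
        return Evaluation(name="accuracy", value=None, comment="No expected output")
    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(name="accuracy", value=1.0 if is_correct else 0.0)

result = langfuse.run_experiment(
    name="Capital Cities Test",
    data=[{"input": "What is the capital of France?", "expected_output": "Paris"}],
    task=my_task,
    evaluators=[accuracy_evaluator],
)
print(result.format())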

   1"""Langfuse experiment functionality for running and evaluating tasks on datasets.
   2
   3This module provides the core experiment functionality for the Langfuse Python SDK,
   4allowing users to run experiments on datasets with automatic tracing, evaluation,
   5and result formatting.
   6"""
   7
   8import asyncio
   9import logging
  10from typing import (
  11    TYPE_CHECKING,
  12    Any,
  13    Awaitable,
  14    Dict,
  15    List,
  16    Optional,
  17    Protocol,
  18    TypedDict,
  19    Union,
  20)
  21
  22from langfuse.api import ScoreDataType
  23
  24if TYPE_CHECKING:
  25    from langfuse._client.datasets import DatasetItemClient
  26
  27
  28class LocalExperimentItem(TypedDict, total=False):
  29    """Structure for local experiment data items (not from Langfuse datasets).
  30
  31    This TypedDict defines the structure for experiment items when using local data
  32    rather than Langfuse-hosted datasets. All fields are optional to provide
  33    flexibility in data structure.
  34
  35    Attributes:
  36        input: The input data to pass to the task function. Can be any type that
  37            your task function can process (string, dict, list, etc.). This is
  38            typically the prompt, question, or data that your task will operate on.
  39        expected_output: Optional expected/ground truth output for evaluation purposes.
  40            Used by evaluators to assess correctness or quality. Can be None if
  41            no ground truth is available.
  42        metadata: Optional metadata dictionary containing additional context about
  43            this specific item. Can include information like difficulty level,
  44            category, source, or any other relevant attributes that evaluators
  45            might use for context-aware evaluation.
  46
  47    Examples:
  48        Simple text processing item:
  49        ```python
  50        item: LocalExperimentItem = {
  51            "input": "Summarize this article: ...",
  52            "expected_output": "Expected summary...",
  53            "metadata": {"difficulty": "medium", "category": "news"}
  54        }
  55        ```
  56
  57        Classification item:
  58        ```python
  59        item: LocalExperimentItem = {
  60            "input": {"text": "This movie is great!", "context": "movie review"},
  61            "expected_output": "positive",
  62            "metadata": {"dataset_source": "imdb", "confidence": 0.95}
  63        }
  64        ```
  65
  66        Minimal item with only input:
  67        ```python
  68        item: LocalExperimentItem = {
  69            "input": "What is the capital of France?"
  70        }
  71        ```
  72    """
  73
  74    input: Any
  75    expected_output: Any
  76    metadata: Optional[Dict[str, Any]]
  77
  78
  79ExperimentItem = Union[LocalExperimentItem, "DatasetItemClient"]
  80"""Type alias for items that can be processed in experiments.
  81
  82Can be either:
  83- LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys
  84- DatasetItemClient: Items from Langfuse datasets with .input, .expected_output, .metadata attributes
  85"""
  86
  87ExperimentData = Union[List[LocalExperimentItem], List["DatasetItemClient"]]
  88"""Type alias for experiment datasets.
  89
  90Represents the collection of items to process in an experiment. Can be either:
  91- List[LocalExperimentItem]: Local data items as dictionaries
  92- List[DatasetItemClient]: Items from a Langfuse dataset (typically from dataset.items)
  93"""
  94
  95
  96class Evaluation:
  97    """Represents an evaluation result for an experiment item or an entire experiment run.
  98
  99    This class provides a strongly-typed way to create evaluation results in evaluator functions.
 100    Users must use keyword arguments when instantiating this class.
 101
 102    Attributes:
 103        name: Unique identifier for the evaluation metric. Should be descriptive
 104            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
 105            Used for aggregation and comparison across experiment runs.
 106        value: The evaluation score or result. Can be:
 107            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
 108            - String: For categorical results like "positive", "negative", "neutral"
 109            - Boolean: For binary assessments like "passes_safety_check"
 110            - None: When evaluation cannot be computed (missing data, API errors, etc.)
 111        comment: Optional human-readable explanation of the evaluation result.
 112            Useful for providing context, explaining scoring rationale, or noting
 113            special conditions. Displayed in Langfuse UI for interpretability.
 114        metadata: Optional structured metadata about the evaluation process.
 115            Can include confidence scores, intermediate calculations, model versions,
 116            or any other relevant technical details.
 117        data_type: Optional score data type. Required if value is not NUMERIC.
 118            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
 119        config_id: Optional Langfuse score config ID.
 120
 121    Examples:
 122        Basic accuracy evaluation:
 123        ```python
 124        from langfuse import Evaluation
 125
 126        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
 127            if not expected_output:
 128                return Evaluation(name="accuracy", value=None, comment="No expected output")
 129
 130            is_correct = output.strip().lower() == expected_output.strip().lower()
 131            return Evaluation(
 132                name="accuracy",
 133                value=1.0 if is_correct else 0.0,
 134                comment="Correct answer" if is_correct else "Incorrect answer"
 135            )
 136        ```
 137
 138        Multi-metric evaluator:
 139        ```python
 140        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
 141            return [
 142                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
 143                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
 144                Evaluation(
 145                    name="quality",
 146                    value=0.85,
 147                    comment="High quality response",
 148                    metadata={"confidence": 0.92, "model": "gpt-4"}
 149                )
 150            ]
 151        ```
 152
 153        Categorical evaluation:
 154        ```python
 155        def sentiment_evaluator(*, input, output, **kwargs):
 156            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
 157            return Evaluation(
 158                name="sentiment",
 159                value=sentiment,
 160                comment=f"Response expresses {sentiment} sentiment",
 161                data_type="CATEGORICAL"
 162            )
 163        ```
 164
 165        Failed evaluation with error handling:
 166        ```python
 167        def external_api_evaluator(*, input, output, **kwargs):
 168            try:
 169                score = external_api.evaluate(output)
 170                return Evaluation(name="external_score", value=score)
 171            except Exception as e:
 172                return Evaluation(
 173                    name="external_score",
 174                    value=None,
 175                    comment=f"API unavailable: {e}",
 176                    metadata={"error": str(e), "retry_count": 3}
 177                )
 178        ```
 179
 180    Note:
 181        All arguments must be passed as keywords. Positional arguments are not allowed
 182        to ensure code clarity and prevent errors from argument reordering.
 183    """
 184
 185    def __init__(
 186        self,
 187        *,
 188        name: str,
 189        value: Union[int, float, str, bool, None],
 190        comment: Optional[str] = None,
 191        metadata: Optional[Dict[str, Any]] = None,
 192        data_type: Optional[ScoreDataType] = None,
 193        config_id: Optional[str] = None,
 194    ):
 195        """Initialize an Evaluation with the provided data.
 196
 197        Args:
 198            name: Unique identifier for the evaluation metric.
 199            value: The evaluation score or result.
 200            comment: Optional human-readable explanation of the result.
 201            metadata: Optional structured metadata about the evaluation process.
 202            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
 203            config_id: Optional Langfuse score config ID.
 204
 205        Note:
 206            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
 207        """
 208        self.name = name
 209        self.value = value
 210        self.comment = comment
 211        self.metadata = metadata
 212        self.data_type = data_type
 213        self.config_id = config_id
 214
 215
 216class ExperimentItemResult:
 217    """Result structure for individual experiment items.
 218
 219    This class represents the complete result of processing a single item
 220    during an experiment run, including the original input, task output,
 221    evaluations, and tracing information. Users must use keyword arguments when instantiating this class.
 222
 223    Attributes:
 224        item: The original experiment item that was processed. Can be either
 225            a dictionary with 'input', 'expected_output', and 'metadata' keys,
 226            or a DatasetItemClient from Langfuse datasets.
 227        output: The actual output produced by the task function for this item.
 228            Can be any type depending on what your task function returns.
 229        evaluations: List of evaluation results for this item. Each evaluation
 230            contains a name, value, optional comment, and optional metadata.
 231        trace_id: Optional Langfuse trace ID for this item's execution. Used
 232            to link the experiment result with the detailed trace in Langfuse UI.
 233        dataset_run_id: Optional dataset run ID if this item was part of a
 234            Langfuse dataset. None for local experiments.
 235
 236    Examples:
 237        Accessing item result data:
 238        ```python
 239        result = langfuse.run_experiment(...)
 240        for item_result in result.item_results:
 241            print(f"Input: {item_result.item}")
 242            print(f"Output: {item_result.output}")
 243            print(f"Trace: {item_result.trace_id}")
 244
 245            # Access evaluations
 246            for evaluation in item_result.evaluations:
 247                print(f"{evaluation.name}: {evaluation.value}")
 248        ```
 249
 250        Working with different item types:
 251        ```python
 252        # Local experiment item (dict)
 253        if isinstance(item_result.item, dict):
 254            input_data = item_result.item["input"]
 255            expected = item_result.item.get("expected_output")
 256
 257        # Langfuse dataset item (object with attributes)
 258        else:
 259            input_data = item_result.item.input
 260            expected = item_result.item.expected_output
 261        ```
 262
 263    Note:
 264        All arguments must be passed as keywords. Positional arguments are not allowed
 265        to ensure code clarity and prevent errors from argument reordering.
 266    """
 267
 268    def __init__(
 269        self,
 270        *,
 271        item: ExperimentItem,
 272        output: Any,
 273        evaluations: List[Evaluation],
 274        trace_id: Optional[str],
 275        dataset_run_id: Optional[str],
 276    ):
 277        """Initialize an ExperimentItemResult with the provided data.
 278
 279        Args:
 280            item: The original experiment item that was processed.
 281            output: The actual output produced by the task function for this item.
 282            evaluations: List of evaluation results for this item.
 283            trace_id: Optional Langfuse trace ID for this item's execution.
 284            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
 285
 286        Note:
 287            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
 288        """
 289        self.item = item
 290        self.output = output
 291        self.evaluations = evaluations
 292        self.trace_id = trace_id
 293        self.dataset_run_id = dataset_run_id
 294
 295
 296class ExperimentResult:
 297    """Complete result structure for experiment execution.
 298
 299    This class encapsulates the complete results of running an experiment on a dataset,
 300    including individual item results, aggregate run-level evaluations, and metadata
 301    about the experiment execution.
 302
 303    Attributes:
 304        name: The name of the experiment as specified during execution.
 305        run_name: The name of the current experiment run.
 306        description: Optional description of the experiment's purpose or methodology.
 307        item_results: List of results from processing each individual dataset item,
 308            containing the original item, task output, evaluations, and trace information.
 309        run_evaluations: List of aggregate evaluation results computed across all items,
 310            such as average scores, statistical summaries, or cross-item analyses.
 311        dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
 312        dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
 313
 314    Examples:
 315        Basic usage with local dataset:
 316        ```python
 317        result = langfuse.run_experiment(
 318            name="Capital Cities Test",
 319            data=local_data,
 320            task=generate_capital,
 321            evaluators=[accuracy_check]
 322        )
 323
 324        print(f"Processed {len(result.item_results)} items")
 325        print(result.format())  # Human-readable summary
 326
 327        # Access individual results
 328        for item_result in result.item_results:
 329            print(f"Input: {item_result.item}")
 330            print(f"Output: {item_result.output}")
 331            print(f"Scores: {item_result.evaluations}")
 332        ```
 333
 334        Usage with Langfuse datasets:
 335        ```python
 336        dataset = langfuse.get_dataset("qa-eval-set")
 337        result = dataset.run_experiment(
 338            name="GPT-4 QA Evaluation",
 339            task=answer_question,
 340            evaluators=[relevance_check, accuracy_check]
 341        )
 342
 343        # View in Langfuse UI
 344        if result.dataset_run_url:
 345            print(f"View detailed results: {result.dataset_run_url}")
 346        ```
 347
 348        Formatted output:
 349        ```python
 350        # Get summary view
 351        summary = result.format()
 352        print(summary)
 353
 354        # Get detailed view with individual items
 355        detailed = result.format(include_item_results=True)
 356        with open("experiment_report.txt", "w") as f:
 357            f.write(detailed)
 358        ```
 359    """
 360
 361    def __init__(
 362        self,
 363        *,
 364        name: str,
 365        run_name: str,
 366        description: Optional[str],
 367        item_results: List[ExperimentItemResult],
 368        run_evaluations: List[Evaluation],
 369        dataset_run_id: Optional[str] = None,
 370        dataset_run_url: Optional[str] = None,
 371    ):
 372        """Initialize an ExperimentResult with the provided data.
 373
 374        Args:
 375            name: The name of the experiment.
 376            run_name: The current experiment run name.
 377            description: Optional description of the experiment.
 378            item_results: List of results from processing individual dataset items.
 379            run_evaluations: List of aggregate evaluation results for the entire run.
 380            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
 381            dataset_run_url: Optional URL to view results in Langfuse UI.
 382        """
 383        self.name = name
 384        self.run_name = run_name
 385        self.description = description
 386        self.item_results = item_results
 387        self.run_evaluations = run_evaluations
 388        self.dataset_run_id = dataset_run_id
 389        self.dataset_run_url = dataset_run_url
 390
 391    def format(self, *, include_item_results: bool = False) -> str:
 392        r"""Format the experiment result for human-readable display.
 393
 394        Converts the experiment result into a nicely formatted string suitable for
 395        console output, logging, or reporting. The output includes experiment overview,
 396        aggregate statistics, and optionally individual item details.
 397
 398        This method provides a comprehensive view of experiment performance including:
 399        - Experiment metadata (name, description, item count)
 400        - List of evaluation metrics used across items
 401        - Average scores computed across all processed items
 402        - Run-level evaluation results (aggregate metrics)
 403        - Links to view detailed results in Langfuse UI (when available)
 404        - Individual item details (when requested)
 405
 406        Args:
 407            include_item_results: Whether to include detailed results for each individual
 408                item in the formatted output. When False (default), only shows aggregate
 409                statistics and summary information. When True, includes input/output/scores
 410                for every processed item, making the output significantly longer but more
 411                detailed for debugging and analysis purposes.
 412
 413        Returns:
 414            A formatted multi-line string containing:
 415            - Experiment name and description (if provided)
 416            - Total number of items successfully processed
 417            - List of all evaluation metrics that were applied
 418            - Average scores across all items for each numeric metric
 419            - Run-level evaluation results with comments
 420            - Dataset run URL for viewing in Langfuse UI (if applicable)
 421            - Individual item details including inputs, outputs, and scores (if requested)
 422
 423        Examples:
 424            Basic usage showing aggregate results only:
 425            ```python
 426            result = langfuse.run_experiment(
 427                name="Capital Cities",
 428                data=dataset,
 429                task=generate_capital,
 430                evaluators=[accuracy_evaluator]
 431            )
 432
 433            print(result.format())
 434            # Output:
 435            # ──────────────────────────────────────────────────
 436            # 📊 Capital Cities
 437            # 100 items
 438            # Evaluations:
 439            #   • accuracy
 440            # Average Scores:
 441            #   • accuracy: 0.850
 442            ```
 443
 444            Detailed output including all individual item results:
 445            ```python
 446            detailed_report = result.format(include_item_results=True)
 447            print(detailed_report)
 448            # Output includes each item:
 449            # 1. Item 1:
 450            #    Input:    What is the capital of France?
 451            #    Expected: Paris
 452            #    Actual:   The capital of France is Paris.
 453            #    Scores:
 454            #      • accuracy: 1.000
 455            #        💭 Correct answer found
 456            # [... continues for all items ...]
 457            ```
 458
 459            Saving formatted results to file for reporting:
 460            ```python
 461            with open("experiment_report.txt", "w") as f:
 462                f.write(result.format(include_item_results=True))
 463
 464            # Or create summary report
 465            summary = result.format()  # Aggregate view only
 466            print(f"Experiment Summary:\n{summary}")
 467            ```
 468
 469            Integration with logging systems:
 470            ```python
 471            import logging
 472            logger = logging.getLogger("experiments")
 473
 474            # Log summary after experiment
 475            logger.info(f"Experiment completed:\n{result.format()}")
 476
 477            # Log detailed results for failed experiments
 478            if any(isinstance(ev.value, (int, float)) and ev.value < threshold for ev in result.run_evaluations):
 479                logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
 480            ```
 481        """
 482        if not self.item_results:
 483            return "No experiment results to display."
 484
 485        output = ""
 486
 487        # Individual results section
 488        if include_item_results:
 489            for i, result in enumerate(self.item_results):
 490                output += f"\n{i + 1}. Item {i + 1}:\n"
 491
 492                # Extract and display input
 493                item_input = None
 494                if isinstance(result.item, dict):
 495                    item_input = result.item.get("input")
 496                elif hasattr(result.item, "input"):
 497                    item_input = result.item.input
 498
 499                if item_input is not None:
 500                    output += f"   Input:    {_format_value(item_input)}\n"
 501
 502                # Extract and display expected output
 503                expected_output = None
 504                if isinstance(result.item, dict):
 505                    expected_output = result.item.get("expected_output")
 506                elif hasattr(result.item, "expected_output"):
 507                    expected_output = result.item.expected_output
 508
 509                if expected_output is not None:
 510                    output += f"   Expected: {_format_value(expected_output)}\n"
 511                output += f"   Actual:   {_format_value(result.output)}\n"
 512
 513                # Display evaluation scores
 514                if result.evaluations:
 515                    output += "   Scores:\n"
 516                    for evaluation in result.evaluations:
 517                        score = evaluation.value
 518                        if isinstance(score, (int, float)):
 519                            score = f"{score:.3f}"
 520                        output += f"     • {evaluation.name}: {score}"
 521                        if evaluation.comment:
 522                            output += f"\n       💭 {evaluation.comment}"
 523                        output += "\n"
 524
 525                # Display trace link if available
 526                if result.trace_id:
 527                    output += f"\n   Trace ID: {result.trace_id}\n"
 528        else:
 529            output += f"Individual Results: Hidden ({len(self.item_results)} items)\n"
 530            output += "💡 Set include_item_results=True to view them\n"
 531
 532        # Experiment overview section
 533        output += f"\n{'─' * 50}\n"
 534        output += f"🧪 Experiment: {self.name}"
 535        output += f"\n📋 Run name: {self.run_name}"
 536        if self.description:
 537            output += f" - {self.description}"
 538
 539        output += f"\n{len(self.item_results)} items"
 540
 541        # Collect unique evaluation names across all items
 542        evaluation_names = set()
 543        for result in self.item_results:
 544            for evaluation in result.evaluations:
 545                evaluation_names.add(evaluation.name)
 546
 547        if evaluation_names:
 548            output += "\nEvaluations:"
 549            for eval_name in evaluation_names:
 550                output += f"\n  • {eval_name}"
 551            output += "\n"
 552
 553        # Calculate and display average scores
 554        if evaluation_names:
 555            output += "\nAverage Scores:"
 556            for eval_name in evaluation_names:
 557                scores = []
 558                for result in self.item_results:
 559                    for evaluation in result.evaluations:
 560                        if evaluation.name == eval_name and isinstance(
 561                            evaluation.value, (int, float)
 562                        ):
 563                            scores.append(evaluation.value)
 564
 565                if scores:
 566                    avg = sum(scores) / len(scores)
 567                    output += f"\n  • {eval_name}: {avg:.3f}"
 568            output += "\n"
 569
 570        # Display run-level evaluations
 571        if self.run_evaluations:
 572            output += "\nRun Evaluations:"
 573            for run_eval in self.run_evaluations:
 574                score = run_eval.value
 575                if isinstance(score, (int, float)):
 576                    score = f"{score:.3f}"
 577                output += f"\n  • {run_eval.name}: {score}"
 578                if run_eval.comment:
 579                    output += f"\n    💭 {run_eval.comment}"
 580            output += "\n"
 581
 582        # Add dataset run URL if available
 583        if self.dataset_run_url:
 584            output += f"\n🔗 Dataset Run:\n   {self.dataset_run_url}"
 585
 586        return output
 587
 588
 589class TaskFunction(Protocol):
 590    """Protocol defining the interface for experiment task functions.
 591
 592    Task functions are the core processing functions that operate on each item
 593    in an experiment dataset. They receive an experiment item as input and
 594    produce some output that will be evaluated.
 595
 596    Task functions:
 597    - Must accept 'item' as a keyword argument
 598    - May return any type of output (it will be passed to evaluators)
 599    - Can be either synchronous or asynchronous
 600    - Should handle their own errors gracefully (exceptions will be logged)
 601    """
 602
 603    def __call__(
 604        self,
 605        *,
 606        item: ExperimentItem,
 607        **kwargs: Dict[str, Any],
 608    ) -> Union[Any, Awaitable[Any]]:
 609        """Execute the task on an experiment item.
 610
 611        This method defines the core processing logic for each item in your experiment.
 612        The implementation should focus on the specific task you want to evaluate,
 613        such as text generation, classification, summarization, etc.
 614
 615        Args:
 616            item: The experiment item to process. Can be either:
 617                - Dict with keys like 'input', 'expected_output', 'metadata'
 618                - Langfuse DatasetItem object with .input, .expected_output attributes
 619            **kwargs: Additional keyword arguments that may be passed by the framework
 620
 621        Returns:
 622            Any: The output of processing the item. This output will be:
 623            - Stored in the experiment results
 624            - Passed to all item-level evaluators for assessment
 625            - Traced automatically in Langfuse for observability
 626
 627            Can return either a direct value or an awaitable (async) result.
 628
 629        Examples:
 630            Simple synchronous task:
 631            ```python
 632            def my_task(*, item, **kwargs):
 633                prompt = f"Summarize: {item['input']}"
 634                return my_llm_client.generate(prompt)
 635            ```
 636
 637            Async task with error handling:
 638            ```python
 639            async def my_async_task(*, item, **kwargs):
 640                try:
 641                    response = await openai_client.chat.completions.create(
 642                        model="gpt-4",
 643                        messages=[{"role": "user", "content": item["input"]}]
 644                    )
 645                    return response.choices[0].message.content
 646                except Exception as e:
 647                    # Log error and return fallback
 648                    print(f"Task failed for item {item}: {e}")
 649                    return "Error: Could not process item"
 650            ```
 651
 652            Task using dataset item attributes:
 653            ```python
 654            def classification_task(*, item, **kwargs):
 655                # Works with both dict items and DatasetItem objects
 656                text = item["input"] if isinstance(item, dict) else item.input
 657                return classify_text(text)
 658            ```
 659        """
 660        ...
 661
 662
 663class EvaluatorFunction(Protocol):
 664    """Protocol defining the interface for item-level evaluator functions.
 665
 666    Item-level evaluators assess the quality, correctness, or other properties
 667    of individual task outputs. They receive the input, output, expected output,
 668    and metadata for each item and return evaluation metrics.
 669
 670    Evaluators should:
 671    - Accept input, output, expected_output, and metadata as keyword arguments
 672    - Return Evaluation object(s) or dict(s) with 'name', 'value', 'comment', and 'metadata' fields
 673    - Be deterministic when possible for reproducible results
 674    - Handle edge cases gracefully (missing expected output, malformed data, etc.)
 675    - Be either synchronous or asynchronous (both are supported)
 676    """
 677
 678    def __call__(
 679        self,
 680        *,
 681        input: Any,
 682        output: Any,
 683        expected_output: Any,
 684        metadata: Optional[Dict[str, Any]],
 685        **kwargs: Dict[str, Any],
 686    ) -> Union[
 687        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
 688    ]:
 689        r"""Evaluate a task output for quality, correctness, or other metrics.
 690
 691        This method should implement specific evaluation logic such as accuracy checking,
 692        similarity measurement, toxicity detection, fluency assessment, etc.
 693
 694        Args:
 695            input: The original input that was passed to the task function.
 696                This is typically the item['input'] or item.input value.
 697            output: The output produced by the task function for this input.
 698                This is the direct return value from your task function.
 699            expected_output: The expected/ground truth output for comparison.
 700                May be None if not available in the dataset. Evaluators should
 701                handle this case appropriately.
 702            metadata: Optional metadata from the experiment item that might
 703                contain additional context for evaluation (categories, difficulty, etc.)
 704            **kwargs: Additional keyword arguments that may be passed by the framework
 705
 706        Returns:
 707            Evaluation results in one of these formats:
 708            - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."}
 709            - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}]
 710            - Awaitable returning either of the above (for async evaluators)
 711
 712            Each Evaluation dict should contain:
 713            - name (str): Unique identifier for this evaluation metric
 714            - value (int|float|str|bool): The evaluation score or result
 715            - comment (str, optional): Human-readable explanation of the result
 716            - metadata (dict, optional): Additional structured data about the evaluation
 717
 718        Examples:
 719            Simple accuracy evaluator:
 720            ```python
 721            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
 722                if expected_output is None:
 723                    return {"name": "accuracy", "value": None, "comment": "No expected output"}
 724
 725                is_correct = output.strip().lower() == expected_output.strip().lower()
 726                return {
 727                    "name": "accuracy",
 728                    "value": 1.0 if is_correct else 0.0,
 729                    "comment": "Exact match" if is_correct else "No match"
 730                }
 731            ```
 732
 733            Multi-metric evaluator:
 734            ```python
 735            def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
 736                results = []
 737
 738                # Length check
 739                results.append({
 740                    "name": "output_length",
 741                    "value": len(output),
 742                    "comment": f"Output contains {len(output)} characters"
 743                })
 744
 745                # Sentiment analysis
 746                sentiment_score = analyze_sentiment(output)
 747                results.append({
 748                    "name": "sentiment",
 749                    "value": sentiment_score,
 750                    "comment": f"Sentiment score: {sentiment_score:.2f}"
 751                })
 752
 753                return results
 754            ```
 755
 756            Async evaluator using external API:
 757            ```python
 758            async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs):
 759                prompt = f"Rate the quality of this response on a scale of 1-10:\n"
 760                prompt += f"Question: {input}\nResponse: {output}"
 761
 762                response = await openai_client.chat.completions.create(
 763                    model="gpt-4",
 764                    messages=[{"role": "user", "content": prompt}]
 765                )
 766
 767                try:
 768                    score = float(response.choices[0].message.content.strip())
 769                    return {
 770                        "name": "llm_judge_quality",
 771                        "value": score,
 772                        "comment": f"LLM judge rated this {score}/10"
 773                    }
 774                except ValueError:
 775                    return {
 776                        "name": "llm_judge_quality",
 777                        "value": None,
 778                        "comment": "Could not parse LLM judge score"
 779                    }
 780            ```
 781
 782            Context-aware evaluator:
 783            ```python
 784            def context_evaluator(*, input, output, metadata=None, **kwargs):
 785                # Use metadata for context-specific evaluation
 786                difficulty = metadata.get("difficulty", "medium") if metadata else "medium"
 787
 788                # Adjust expectations based on difficulty
 789                min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty]
 790
 791                meets_requirement = len(output) >= min_length
 792                return {
 793                    "name": f"meets_{difficulty}_requirement",
 794                    "value": meets_requirement,
 795                    "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement"
 796                }
 797            ```
 798        """
 799        ...
 800
 801
 802class RunEvaluatorFunction(Protocol):
 803    """Protocol defining the interface for run-level evaluator functions.
 804
 805    Run-level evaluators assess aggregate properties of the entire experiment run,
 806    computing metrics that span across all items rather than individual outputs.
 807    They receive the complete results from all processed items and can compute
 808    statistics like averages, distributions, correlations, or other aggregate metrics.
 809
 810    Run evaluators should:
 811    - Accept item_results as a keyword argument containing all item results
 812    - Return Evaluation object(s) or dict(s) with aggregate metrics
 813    - Handle cases where some items may have failed processing
 814    - Compute meaningful statistics across the dataset
 815    - Be either synchronous or asynchronous (both are supported)
 816    """
 817
 818    def __call__(
 819        self,
 820        *,
 821        item_results: List[ExperimentItemResult],
 822        **kwargs: Dict[str, Any],
 823    ) -> Union[
 824        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
 825    ]:
 826        r"""Evaluate the entire experiment run with aggregate metrics.
 827
 828        This method should implement aggregate evaluation logic such as computing
 829        averages, calculating distributions, finding correlations, detecting patterns
 830        across items, or performing statistical analysis on the experiment results.
 831
 832        Args:
 833            item_results: List of results from all successfully processed experiment items.
 834                Each item result contains:
 835                - item: The original experiment item
 836                - output: The task function's output for this item
 837                - evaluations: List of item-level evaluation results
 838                - trace_id: Langfuse trace ID for this execution
 839                - dataset_run_id: Dataset run ID (if using Langfuse datasets)
 840
 841                Note: This list only includes items that were successfully processed.
 842                Failed items are excluded but logged separately.
 843            **kwargs: Additional keyword arguments that may be passed by the framework
 844
 845        Returns:
 846            Evaluation results in one of these formats:
 847            - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."}
 848            - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}]
 849            - Awaitable returning either of the above (for async evaluators)
 850
 851            Each Evaluation dict should contain:
 852            - name (str): Unique identifier for this run-level metric
 853            - value (int|float|str|bool): The aggregate evaluation result
 854            - comment (str, optional): Human-readable explanation of the metric
 855            - metadata (dict, optional): Additional structured data about the evaluation
 856
 857        Examples:
 858            Average accuracy calculator:
 859            ```python
 860            def average_accuracy(*, item_results, **kwargs):
 861                if not item_results:
 862                    return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"}
 863
 864                accuracy_values = []
 865                for result in item_results:
 866                    for evaluation in result.evaluations:
 867                        if evaluation.name == "accuracy":
 868                            accuracy_values.append(evaluation.value)
 869
 870                if not accuracy_values:
 871                    return {"name": "avg_accuracy", "value": None, "comment": "No accuracy evaluations found"}
 872
 873                avg = sum(accuracy_values) / len(accuracy_values)
 874                return {
 875                    "name": "avg_accuracy",
 876                    "value": avg,
 877                    "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}"
 878                }
 879            ```
 880
 881            Multiple aggregate metrics:
 882            ```python
 883            def statistical_summary(*, item_results, **kwargs):
 884                if not item_results:
 885                    return []
 886
 887                results = []
 888
 889                # Calculate output length statistics
 890                lengths = [len(str(result.output)) for result in item_results]
 891                results.extend([
 892                    {"name": "avg_output_length", "value": sum(lengths) / len(lengths)},
 893                    {"name": "min_output_length", "value": min(lengths)},
 894                    {"name": "max_output_length", "value": max(lengths)}
 895                ])
 896
 897                # Success rate
 898                total_items = len(item_results)  # Only successful items are included
 899                results.append({
 900                    "name": "processing_success_rate",
 901                    "value": 1.0,  # All items in item_results succeeded
 902                    "comment": f"Successfully processed {total_items} items"
 903                })
 904
 905                return results
 906            ```
 907
 908            Async run evaluator with external analysis:
 909            ```python
 910            async def llm_batch_analysis(*, item_results, **kwargs):
 911                # Prepare batch analysis prompt
 912                outputs = [result.output for result in item_results]
 913                prompt = f"Analyze these {len(outputs)} outputs for common themes:\n"
 914                prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs))
 915
 916                response = await openai_client.chat.completions.create(
 917                    model="gpt-4",
 918                    messages=[{"role": "user", "content": prompt}]
 919                )
 920
 921                return {
 922                    "name": "thematic_analysis",
 923                    "value": response.choices[0].message.content,
 924                    "comment": f"LLM analysis of {len(outputs)} outputs"
 925                }
 926            ```
 927
 928            Performance distribution analysis:
 929            ```python
 930            def performance_distribution(*, item_results, **kwargs):
 931                # Extract all evaluation scores
 932                all_scores = []
 933                score_by_metric = {}
 934
 935                for result in item_results:
 936                    for evaluation in result.evaluations:
 937                        metric_name = evaluation.name
 938                        value = evaluation.value
 939
 940                        if isinstance(value, (int, float)):
 941                            all_scores.append(value)
 942                            if metric_name not in score_by_metric:
 943                                score_by_metric[metric_name] = []
 944                            score_by_metric[metric_name].append(value)
 945
 946                results = []
 947
 948                # Overall score distribution
 949                if all_scores:
 950                    import statistics
 951                    results.append({
 952                        "name": "score_std_dev",
 953                        "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0,
 954                        "comment": f"Standard deviation across all numeric scores"
 955                    })
 956
 957                # Per-metric statistics
 958                for metric, scores in score_by_metric.items():
 959                    if len(scores) > 1:
 960                        results.append({
 961                            "name": f"{metric}_variance",
 962                            "value": statistics.variance(scores),
 963                            "comment": f"Variance in {metric} across {len(scores)} items"
 964                        })
 965
 966                return results
 967            ```
 968        """
 969        ...
 970
 971
 972def _format_value(value: Any) -> str:
 973    """Format a value for display."""
 974    if isinstance(value, str):
 975        return value[:50] + "..." if len(value) > 50 else value
 976    return str(value)
 977
 978
 979async def _run_evaluator(
 980    evaluator: Union[EvaluatorFunction, RunEvaluatorFunction], **kwargs: Any
 981) -> List[Evaluation]:
 982    """Run an evaluator function and normalize the result."""
 983    try:
 984        result = evaluator(**kwargs)
 985
 986        # Handle async evaluators
 987        if asyncio.iscoroutine(result):
 988            result = await result
 989
 990        # Normalize to list
 991        if isinstance(result, (dict, Evaluation)):
 992            return [result]  # type: ignore
 993
 994        elif isinstance(result, list):
 995            return result
 996
 997        else:
 998            return []
 999
1000    except Exception as e:
1001        evaluator_name = getattr(evaluator, "__name__", "unknown_evaluator")
1002        logging.getLogger("langfuse").error(f"Evaluator {evaluator_name} failed: {e}")
1003        return []
1004
1005
1006async def _run_task(task: TaskFunction, item: ExperimentItem) -> Any:
1007    """Run a task function and handle sync/async."""
1008    result = task(item=item)
1009
1010    # Handle async tasks
1011    if asyncio.iscoroutine(result):
1012        result = await result
1013
1014    return result
1015
1016
1017def create_evaluator_from_autoevals(
1018    autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]]
1019) -> EvaluatorFunction:
1020    """Create a Langfuse evaluator from an autoevals evaluator.
1021
1022    Args:
1023        autoevals_evaluator: An autoevals evaluator instance
1024        **kwargs: Additional arguments passed to the evaluator
1025
1026    Returns:
1027        A Langfuse-compatible evaluator function
1028    """
1029
1030    def langfuse_evaluator(
1031        *,
1032        input: Any,
1033        output: Any,
1034        expected_output: Any,
1035        metadata: Optional[Dict[str, Any]],
1036        **langfuse_kwargs: Dict[str, Any],
1037    ) -> Evaluation:
1038        evaluation = autoevals_evaluator(
1039            input=input, output=output, expected=expected_output, **kwargs
1040        )
1041
1042        return Evaluation(
1043            name=evaluation.name, value=evaluation.score, metadata=evaluation.metadata
1044        )
1045
1046    return langfuse_evaluator
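
A usage sketch for create_evaluator_from_autoevals, assuming the third-party autoevals package is installed and exposes a Levenshtein scorer (verify the exact class name against the autoevals documentation); langfuse, my_task, and data stand in for the client, task function, and experiment data from the examples above:

from autoevals import Levenshtein  # third-party scorer, assumed available

levenshtein_evaluator = create_evaluator_from_autoevals(Levenshtein())

result = langfuse.run_experiment(
    name="String Similarity Experiment",
    data=data,                           # local items or dataset.items
    task=my_task,                        # your task function
    evaluators=[levenshtein_evaluator],  # wrapped autoevals scorer
)
print(result.format())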
class LocalExperimentItem(typing.TypedDict):

Structure for local experiment data items (not from Langfuse datasets).

This TypedDict defines the structure for experiment items when using local data rather than Langfuse-hosted datasets. All fields are optional to provide flexibility in data structure.

Attributes:
  • input: The input data to pass to the task function. Can be any type that your task function can process (string, dict, list, etc.). This is typically the prompt, question, or data that your task will operate on.
  • expected_output: Optional expected/ground truth output for evaluation purposes. Used by evaluators to assess correctness or quality. Can be None if no ground truth is available.
  • metadata: Optional metadata dictionary containing additional context about this specific item. Can include information like difficulty level, category, source, or any other relevant attributes that evaluators might use for context-aware evaluation.
Examples:

Simple text processing item:

item: LocalExperimentItem = {
    "input": "Summarize this article: ...",
    "expected_output": "Expected summary...",
    "metadata": {"difficulty": "medium", "category": "news"}
}

Classification item:

item: LocalExperimentItem = {
    "input": {"text": "This movie is great!", "context": "movie review"},
    "expected_output": "positive",
    "metadata": {"dataset_source": "imdb", "confidence": 0.95}
}

Minimal item with only input:

item: LocalExperimentItem = {
    "input": "What is the capital of France?"
}
input: Any
expected_output: Any
metadata: Optional[Dict[str, Any]]
ExperimentItem = typing.Union[LocalExperimentItem, ForwardRef('DatasetItemClient')]

Type alias for items that can be processed in experiments.

Can be either:

  • LocalExperimentItem: Dict-like items with 'input', 'expected_output', 'metadata' keys
  • DatasetItemClient: Items from Langfuse datasets with .input, .expected_output, .metadata attributes
ExperimentData = typing.Union[typing.List[LocalExperimentItem], typing.List[ForwardRef('DatasetItemClient')]]

Type alias for experiment datasets.

Represents the collection of items to process in an experiment. Can be either:

  • List[LocalExperimentItem]: Local data items as dictionaries
  • List[DatasetItemClient]: Items from a Langfuse dataset (typically from dataset.items)
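
For illustration, the two accepted shapes of ExperimentData (the dataset name below is a placeholder):

# Local data: a list of LocalExperimentItem dicts
local_data: ExperimentData = [
    {"input": "What is the capital of France?", "expected_output": "Paris"},
    {"input": "What is 2 + 2?", "expected_output": "4"},
]

# Langfuse-hosted data: items of a fetched dataset
dataset = langfuse.get_dataset("qa-eval-set")  # placeholder dataset name
hosted_data: ExperimentData = dataset.items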
class Evaluation:

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
    • None: When evaluation cannot be computed (missing data, API errors, etc.)
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type. One of NUMERIC, CATEGORICAL, or BOOLEAN; defaults to NUMERIC. Required when the value is not numeric (for example, CATEGORICAL for string values, BOOLEAN for booleans).
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=None, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=None,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
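
Boolean evaluation with an explicit data type (a minimal sketch following the pattern above; contains_pii is an assumed helper, not part of the SDK):

def safety_evaluator(*, input, output, **kwargs):
    # contains_pii stands in for your own check and returns True/False
    flagged = contains_pii(output)
    return Evaluation(
        name="passes_safety_check",
        value=not flagged,
        comment="No PII detected" if not flagged else "PII detected in output",
        data_type="BOOLEAN"
    )
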
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool, NoneType], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[langfuse.api.ScoreDataType] = None, config_id: Optional[str] = None)
186    def __init__(
187        self,
188        *,
189        name: str,
190        value: Union[int, float, str, bool, None],
191        comment: Optional[str] = None,
192        metadata: Optional[Dict[str, Any]] = None,
193        data_type: Optional[ScoreDataType] = None,
194        config_id: Optional[str] = None,
195    ):
196        """Initialize an Evaluation with the provided data.
197
198        Args:
199            name: Unique identifier for the evaluation metric.
200            value: The evaluation score or result.
201            comment: Optional human-readable explanation of the result.
202            metadata: Optional structured metadata about the evaluation process.
203            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
204            config_id: Optional Langfuse score config ID.
205
206        Note:
207            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
208        """
209        self.name = name
210        self.value = value
211        self.comment = comment
212        self.metadata = metadata
213        self.data_type = data_type
214        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
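
Because the constructor is keyword-only, passing arguments positionally fails immediately; a quick illustration (the second call is intentionally incorrect):

from langfuse import Evaluation

Evaluation(name="accuracy", value=1.0)   # OK: keyword arguments
Evaluation("accuracy", 1.0)              # raises TypeError because positional arguments are rejected
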
class ExperimentItemResult:
217class ExperimentItemResult:
218    """Result structure for individual experiment items.
219
220    This class represents the complete result of processing a single item
221    during an experiment run, including the original input, task output,
222    evaluations, and tracing information. Users must use keyword arguments when instantiating this class.
223
224    Attributes:
225        item: The original experiment item that was processed. Can be either
226            a dictionary with 'input', 'expected_output', and 'metadata' keys,
227            or a DatasetItemClient from Langfuse datasets.
228        output: The actual output produced by the task function for this item.
229            Can be any type depending on what your task function returns.
230        evaluations: List of evaluation results for this item. Each evaluation
231            contains a name, value, optional comment, and optional metadata.
232        trace_id: Optional Langfuse trace ID for this item's execution. Used
233            to link the experiment result with the detailed trace in Langfuse UI.
234        dataset_run_id: Optional dataset run ID if this item was part of a
235            Langfuse dataset. None for local experiments.
236
237    Examples:
238        Accessing item result data:
239        ```python
240        result = langfuse.run_experiment(...)
241        for item_result in result.item_results:
242            print(f"Input: {item_result.item}")
243            print(f"Output: {item_result.output}")
244            print(f"Trace: {item_result.trace_id}")
245
246            # Access evaluations
247            for evaluation in item_result.evaluations:
248                print(f"{evaluation.name}: {evaluation.value}")
249        ```
250
251        Working with different item types:
252        ```python
253        # Local experiment item (dict)
254        if isinstance(item_result.item, dict):
255            input_data = item_result.item["input"]
256            expected = item_result.item.get("expected_output")
257
258        # Langfuse dataset item (object with attributes)
259        else:
260            input_data = item_result.item.input
261            expected = item_result.item.expected_output
262        ```
263
264    Note:
265        All arguments must be passed as keywords. Positional arguments are not allowed
266        to ensure code clarity and prevent errors from argument reordering.
267    """
268
269    def __init__(
270        self,
271        *,
272        item: ExperimentItem,
273        output: Any,
274        evaluations: List[Evaluation],
275        trace_id: Optional[str],
276        dataset_run_id: Optional[str],
277    ):
278        """Initialize an ExperimentItemResult with the provided data.
279
280        Args:
281            item: The original experiment item that was processed.
282            output: The actual output produced by the task function for this item.
283            evaluations: List of evaluation results for this item.
284            trace_id: Optional Langfuse trace ID for this item's execution.
285            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
286
287        Note:
288            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
289        """
290        self.item = item
291        self.output = output
292        self.evaluations = evaluations
293        self.trace_id = trace_id
294        self.dataset_run_id = dataset_run_id

Result structure for individual experiment items.

This class represents the complete result of processing a single item during an experiment run, including the original input, task output, evaluations, and tracing information. Users must use keyword arguments when instantiating this class.

Attributes:
  • item: The original experiment item that was processed. Can be either a dictionary with 'input', 'expected_output', and 'metadata' keys, or a DatasetItemClient from Langfuse datasets.
  • output: The actual output produced by the task function for this item. Can be any type depending on what your task function returns.
  • evaluations: List of evaluation results for this item. Each evaluation contains a name, value, optional comment, and optional metadata.
  • trace_id: Optional Langfuse trace ID for this item's execution. Used to link the experiment result with the detailed trace in Langfuse UI.
  • dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset. None for local experiments.
Examples:

Accessing item result data:

result = langfuse.run_experiment(...)
for item_result in result.item_results:
    print(f"Input: {item_result.item}")
    print(f"Output: {item_result.output}")
    print(f"Trace: {item_result.trace_id}")

    # Access evaluations
    for evaluation in item_result.evaluations:
        print(f"{evaluation.name}: {evaluation.value}")

Working with different item types:

# Local experiment item (dict)
if isinstance(item_result.item, dict):
    input_data = item_result.item["input"]
    expected = item_result.item.get("expected_output")

# Langfuse dataset item (object with attributes)
else:
    input_data = item_result.item.input
    expected = item_result.item.expected_output
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

ExperimentItemResult( *, item: Union[LocalExperimentItem, langfuse._client.datasets.DatasetItemClient], output: Any, evaluations: List[Evaluation], trace_id: Optional[str], dataset_run_id: Optional[str])
269    def __init__(
270        self,
271        *,
272        item: ExperimentItem,
273        output: Any,
274        evaluations: List[Evaluation],
275        trace_id: Optional[str],
276        dataset_run_id: Optional[str],
277    ):
278        """Initialize an ExperimentItemResult with the provided data.
279
280        Args:
281            item: The original experiment item that was processed.
282            output: The actual output produced by the task function for this item.
283            evaluations: List of evaluation results for this item.
284            trace_id: Optional Langfuse trace ID for this item's execution.
285            dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
286
287        Note:
288            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
289        """
290        self.item = item
291        self.output = output
292        self.evaluations = evaluations
293        self.trace_id = trace_id
294        self.dataset_run_id = dataset_run_id

Initialize an ExperimentItemResult with the provided data.

Arguments:
  • item: The original experiment item that was processed.
  • output: The actual output produced by the task function for this item.
  • evaluations: List of evaluation results for this item.
  • trace_id: Optional Langfuse trace ID for this item's execution.
  • dataset_run_id: Optional dataset run ID if this item was part of a Langfuse dataset.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

item
output
evaluations
trace_id
dataset_run_id
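
A small helper makes it easy to pull one metric out of an item result; this is an illustrative sketch, not part of the SDK (the "accuracy" metric name is an assumption):

def get_evaluation(item_result, name):
    # Return the first evaluation with the given name, or None if it is absent
    for evaluation in item_result.evaluations:
        if evaluation.name == name:
            return evaluation
    return None

# Collect numeric accuracy values across an experiment run
accuracy_values = [
    e.value
    for e in (get_evaluation(r, "accuracy") for r in result.item_results)
    if e is not None and isinstance(e.value, (int, float))
]
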
class ExperimentResult:
297class ExperimentResult:
298    """Complete result structure for experiment execution.
299
300    This class encapsulates the complete results of running an experiment on a dataset,
301    including individual item results, aggregate run-level evaluations, and metadata
302    about the experiment execution.
303
304    Attributes:
305        name: The name of the experiment as specified during execution.
306        run_name: The name of the current experiment run.
307        description: Optional description of the experiment's purpose or methodology.
308        item_results: List of results from processing each individual dataset item,
309            containing the original item, task output, evaluations, and trace information.
310        run_evaluations: List of aggregate evaluation results computed across all items,
311            such as average scores, statistical summaries, or cross-item analyses.
312        dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
313        dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
314
315    Examples:
316        Basic usage with local dataset:
317        ```python
318        result = langfuse.run_experiment(
319            name="Capital Cities Test",
320            data=local_data,
321            task=generate_capital,
322            evaluators=[accuracy_check]
323        )
324
325        print(f"Processed {len(result.item_results)} items")
326        print(result.format())  # Human-readable summary
327
328        # Access individual results
329        for item_result in result.item_results:
330            print(f"Input: {item_result.item}")
331            print(f"Output: {item_result.output}")
332            print(f"Scores: {item_result.evaluations}")
333        ```
334
335        Usage with Langfuse datasets:
336        ```python
337        dataset = langfuse.get_dataset("qa-eval-set")
338        result = dataset.run_experiment(
339            name="GPT-4 QA Evaluation",
340            task=answer_question,
341            evaluators=[relevance_check, accuracy_check]
342        )
343
344        # View in Langfuse UI
345        if result.dataset_run_url:
346            print(f"View detailed results: {result.dataset_run_url}")
347        ```
348
349        Formatted output:
350        ```python
351        # Get summary view
352        summary = result.format()
353        print(summary)
354
355        # Get detailed view with individual items
356        detailed = result.format(include_item_results=True)
357        with open("experiment_report.txt", "w") as f:
358            f.write(detailed)
359        ```
360    """
361
362    def __init__(
363        self,
364        *,
365        name: str,
366        run_name: str,
367        description: Optional[str],
368        item_results: List[ExperimentItemResult],
369        run_evaluations: List[Evaluation],
370        dataset_run_id: Optional[str] = None,
371        dataset_run_url: Optional[str] = None,
372    ):
373        """Initialize an ExperimentResult with the provided data.
374
375        Args:
376            name: The name of the experiment.
377            run_name: The current experiment run name.
378            description: Optional description of the experiment.
379            item_results: List of results from processing individual dataset items.
380            run_evaluations: List of aggregate evaluation results for the entire run.
381            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
382            dataset_run_url: Optional URL to view results in Langfuse UI.
383        """
384        self.name = name
385        self.run_name = run_name
386        self.description = description
387        self.item_results = item_results
388        self.run_evaluations = run_evaluations
389        self.dataset_run_id = dataset_run_id
390        self.dataset_run_url = dataset_run_url
391
392    def format(self, *, include_item_results: bool = False) -> str:
393        r"""Format the experiment result for human-readable display.
394
395        Converts the experiment result into a nicely formatted string suitable for
396        console output, logging, or reporting. The output includes experiment overview,
397        aggregate statistics, and optionally individual item details.
398
399        This method provides a comprehensive view of experiment performance including:
400        - Experiment metadata (name, description, item count)
401        - List of evaluation metrics used across items
402        - Average scores computed across all processed items
403        - Run-level evaluation results (aggregate metrics)
404        - Links to view detailed results in Langfuse UI (when available)
405        - Individual item details (when requested)
406
407        Args:
408            include_item_results: Whether to include detailed results for each individual
409                item in the formatted output. When False (default), only shows aggregate
410                statistics and summary information. When True, includes input/output/scores
411                for every processed item, making the output significantly longer but more
412                detailed for debugging and analysis purposes.
413
414        Returns:
415            A formatted multi-line string containing:
416            - Experiment name and description (if provided)
417            - Total number of items successfully processed
418            - List of all evaluation metrics that were applied
419            - Average scores across all items for each numeric metric
420            - Run-level evaluation results with comments
421            - Dataset run URL for viewing in Langfuse UI (if applicable)
422            - Individual item details including inputs, outputs, and scores (if requested)
423
424        Examples:
425            Basic usage showing aggregate results only:
426            ```python
427            result = langfuse.run_experiment(
428                name="Capital Cities",
429                data=dataset,
430                task=generate_capital,
431                evaluators=[accuracy_evaluator]
432            )
433
434            print(result.format())
435            # Output:
436            # ──────────────────────────────────────────────────
437            # 📊 Capital Cities
438            # 100 items
439            # Evaluations:
440            #   • accuracy
441            # Average Scores:
442            #   • accuracy: 0.850
443            ```
444
445            Detailed output including all individual item results:
446            ```python
447            detailed_report = result.format(include_item_results=True)
448            print(detailed_report)
449            # Output includes each item:
450            # 1. Item 1:
451            #    Input:    What is the capital of France?
452            #    Expected: Paris
453            #    Actual:   The capital of France is Paris.
454            #    Scores:
455            #      • accuracy: 1.000
456            #        💭 Correct answer found
457            # [... continues for all items ...]
458            ```
459
460            Saving formatted results to file for reporting:
461            ```python
462            with open("experiment_report.txt", "w") as f:
463                f.write(result.format(include_item_results=True))
464
465            # Or create summary report
466            summary = result.format()  # Aggregate view only
467            print(f"Experiment Summary:\n{summary}")
468            ```
469
470            Integration with logging systems:
471            ```python
472            import logging
473            logger = logging.getLogger("experiments")
474
475            # Log summary after experiment
476            logger.info(f"Experiment completed:\n{result.format()}")
477
478            # Log detailed results for failed experiments
479            if any(run_eval.value < threshold for run_eval in result.run_evaluations):
480                logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
481            ```
482        """
483        if not self.item_results:
484            return "No experiment results to display."
485
486        output = ""
487
488        # Individual results section
489        if include_item_results:
490            for i, result in enumerate(self.item_results):
491                output += f"\n{i + 1}. Item {i + 1}:\n"
492
493                # Extract and display input
494                item_input = None
495                if isinstance(result.item, dict):
496                    item_input = result.item.get("input")
497                elif hasattr(result.item, "input"):
498                    item_input = result.item.input
499
500                if item_input is not None:
501                    output += f"   Input:    {_format_value(item_input)}\n"
502
503                # Extract and display expected output
504                expected_output = None
505                if isinstance(result.item, dict):
506                    expected_output = result.item.get("expected_output")
507                elif hasattr(result.item, "expected_output"):
508                    expected_output = result.item.expected_output
509
510                if expected_output is not None:
511                    output += f"   Expected: {_format_value(expected_output)}\n"
512                output += f"   Actual:   {_format_value(result.output)}\n"
513
514                # Display evaluation scores
515                if result.evaluations:
516                    output += "   Scores:\n"
517                    for evaluation in result.evaluations:
518                        score = evaluation.value
519                        if isinstance(score, (int, float)):
520                            score = f"{score:.3f}"
521                        output += f"     • {evaluation.name}: {score}"
522                        if evaluation.comment:
523                            output += f"\n       💭 {evaluation.comment}"
524                        output += "\n"
525
526                # Display trace link if available
527                if result.trace_id:
528                    output += f"\n   Trace ID: {result.trace_id}\n"
529        else:
530            output += f"Individual Results: Hidden ({len(self.item_results)} items)\n"
531            output += "💡 Set include_item_results=True to view them\n"
532
533        # Experiment overview section
534        output += f"\n{'─' * 50}\n"
535        output += f"🧪 Experiment: {self.name}"
536        output += f"\n📋 Run name: {self.run_name}"
537        if self.description:
538            output += f" - {self.description}"
539
540        output += f"\n{len(self.item_results)} items"
541
542        # Collect unique evaluation names across all items
543        evaluation_names = set()
544        for result in self.item_results:
545            for evaluation in result.evaluations:
546                evaluation_names.add(evaluation.name)
547
548        if evaluation_names:
549            output += "\nEvaluations:"
550            for eval_name in evaluation_names:
551                output += f"\n  • {eval_name}"
552            output += "\n"
553
554        # Calculate and display average scores
555        if evaluation_names:
556            output += "\nAverage Scores:"
557            for eval_name in evaluation_names:
558                scores = []
559                for result in self.item_results:
560                    for evaluation in result.evaluations:
561                        if evaluation.name == eval_name and isinstance(
562                            evaluation.value, (int, float)
563                        ):
564                            scores.append(evaluation.value)
565
566                if scores:
567                    avg = sum(scores) / len(scores)
568                    output += f"\n  • {eval_name}: {avg:.3f}"
569            output += "\n"
570
571        # Display run-level evaluations
572        if self.run_evaluations:
573            output += "\nRun Evaluations:"
574            for run_eval in self.run_evaluations:
575                score = run_eval.value
576                if isinstance(score, (int, float)):
577                    score = f"{score:.3f}"
578                output += f"\n  • {run_eval.name}: {score}"
579                if run_eval.comment:
580                    output += f"\n    💭 {run_eval.comment}"
581            output += "\n"
582
583        # Add dataset run URL if available
584        if self.dataset_run_url:
585            output += f"\n🔗 Dataset Run:\n   {self.dataset_run_url}"
586
587        return output

Complete result structure for experiment execution.

This class encapsulates the complete results of running an experiment on a dataset, including individual item results, aggregate run-level evaluations, and metadata about the experiment execution.

Attributes:
  • name: The name of the experiment as specified during execution.
  • run_name: The name of the current experiment run.
  • description: Optional description of the experiment's purpose or methodology.
  • item_results: List of results from processing each individual dataset item, containing the original item, task output, evaluations, and trace information.
  • run_evaluations: List of aggregate evaluation results computed across all items, such as average scores, statistical summaries, or cross-item analyses.
  • dataset_run_id: Optional ID of the dataset run in Langfuse (when using Langfuse datasets).
  • dataset_run_url: Optional direct URL to view the experiment results in Langfuse UI.
Examples:

Basic usage with local dataset:

result = langfuse.run_experiment(
    name="Capital Cities Test",
    data=local_data,
    task=generate_capital,
    evaluators=[accuracy_check]
)

print(f"Processed {len(result.item_results)} items")
print(result.format())  # Human-readable summary

# Access individual results
for item_result in result.item_results:
    print(f"Input: {item_result.item}")
    print(f"Output: {item_result.output}")
    print(f"Scores: {item_result.evaluations}")

Usage with Langfuse datasets:

dataset = langfuse.get_dataset("qa-eval-set")
result = dataset.run_experiment(
    name="GPT-4 QA Evaluation",
    task=answer_question,
    evaluators=[relevance_check, accuracy_check]
)

# View in Langfuse UI
if result.dataset_run_url:
    print(f"View detailed results: {result.dataset_run_url}")

Formatted output:

# Get summary view
summary = result.format()
print(summary)

# Get detailed view with individual items
detailed = result.format(include_item_results=True)
with open("experiment_report.txt", "w") as f:
    f.write(detailed)
ExperimentResult( *, name: str, run_name: str, description: Optional[str], item_results: List[ExperimentItemResult], run_evaluations: List[Evaluation], dataset_run_id: Optional[str] = None, dataset_run_url: Optional[str] = None)
362    def __init__(
363        self,
364        *,
365        name: str,
366        run_name: str,
367        description: Optional[str],
368        item_results: List[ExperimentItemResult],
369        run_evaluations: List[Evaluation],
370        dataset_run_id: Optional[str] = None,
371        dataset_run_url: Optional[str] = None,
372    ):
373        """Initialize an ExperimentResult with the provided data.
374
375        Args:
376            name: The name of the experiment.
377            run_name: The current experiment run name.
378            description: Optional description of the experiment.
379            item_results: List of results from processing individual dataset items.
380            run_evaluations: List of aggregate evaluation results for the entire run.
381            dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
382            dataset_run_url: Optional URL to view results in Langfuse UI.
383        """
384        self.name = name
385        self.run_name = run_name
386        self.description = description
387        self.item_results = item_results
388        self.run_evaluations = run_evaluations
389        self.dataset_run_id = dataset_run_id
390        self.dataset_run_url = dataset_run_url

Initialize an ExperimentResult with the provided data.

Arguments:
  • name: The name of the experiment.
  • run_name: The current experiment run name.
  • description: Optional description of the experiment.
  • item_results: List of results from processing individual dataset items.
  • run_evaluations: List of aggregate evaluation results for the entire run.
  • dataset_run_id: Optional ID of the dataset run (for Langfuse datasets).
  • dataset_run_url: Optional URL to view results in Langfuse UI.
name
run_name
description
item_results
run_evaluations
dataset_run_id
dataset_run_url
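
The result object can also drive automated checks. The sketch below is illustrative only; the "accuracy" evaluator name and the 0.8 threshold are assumptions, not SDK defaults:

import sys

scores = [
    evaluation.value
    for item_result in result.item_results
    for evaluation in item_result.evaluations
    if evaluation.name == "accuracy" and isinstance(evaluation.value, (int, float))
]
average = sum(scores) / len(scores) if scores else 0.0
print(f"Average accuracy: {average:.3f} over {len(scores)} scored items")

if average < 0.8:
    print(result.format(include_item_results=True))
    sys.exit(1)  # fail the CI job on poor performance
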
def format(self, *, include_item_results: bool = False) -> str:
392    def format(self, *, include_item_results: bool = False) -> str:
393        r"""Format the experiment result for human-readable display.
394
395        Converts the experiment result into a nicely formatted string suitable for
396        console output, logging, or reporting. The output includes experiment overview,
397        aggregate statistics, and optionally individual item details.
398
399        This method provides a comprehensive view of experiment performance including:
400        - Experiment metadata (name, description, item count)
401        - List of evaluation metrics used across items
402        - Average scores computed across all processed items
403        - Run-level evaluation results (aggregate metrics)
404        - Links to view detailed results in Langfuse UI (when available)
405        - Individual item details (when requested)
406
407        Args:
408            include_item_results: Whether to include detailed results for each individual
409                item in the formatted output. When False (default), only shows aggregate
410                statistics and summary information. When True, includes input/output/scores
411                for every processed item, making the output significantly longer but more
412                detailed for debugging and analysis purposes.
413
414        Returns:
415            A formatted multi-line string containing:
416            - Experiment name and description (if provided)
417            - Total number of items successfully processed
418            - List of all evaluation metrics that were applied
419            - Average scores across all items for each numeric metric
420            - Run-level evaluation results with comments
421            - Dataset run URL for viewing in Langfuse UI (if applicable)
422            - Individual item details including inputs, outputs, and scores (if requested)
423
424        Examples:
425            Basic usage showing aggregate results only:
426            ```python
427            result = langfuse.run_experiment(
428                name="Capital Cities",
429                data=dataset,
430                task=generate_capital,
431                evaluators=[accuracy_evaluator]
432            )
433
434            print(result.format())
435            # Output:
436            # ──────────────────────────────────────────────────
437            # 📊 Capital Cities
438            # 100 items
439            # Evaluations:
440            #   • accuracy
441            # Average Scores:
442            #   • accuracy: 0.850
443            ```
444
445            Detailed output including all individual item results:
446            ```python
447            detailed_report = result.format(include_item_results=True)
448            print(detailed_report)
449            # Output includes each item:
450            # 1. Item 1:
451            #    Input:    What is the capital of France?
452            #    Expected: Paris
453            #    Actual:   The capital of France is Paris.
454            #    Scores:
455            #      • accuracy: 1.000
456            #        💭 Correct answer found
457            # [... continues for all items ...]
458            ```
459
460            Saving formatted results to file for reporting:
461            ```python
462            with open("experiment_report.txt", "w") as f:
463                f.write(result.format(include_item_results=True))
464
465            # Or create summary report
466            summary = result.format()  # Aggregate view only
467            print(f"Experiment Summary:\n{summary}")
468            ```
469
470            Integration with logging systems:
471            ```python
472            import logging
473            logger = logging.getLogger("experiments")
474
475            # Log summary after experiment
476            logger.info(f"Experiment completed:\n{result.format()}")
477
478            # Log detailed results for failed experiments
479            if any(run_eval.value < threshold for run_eval in result.run_evaluations):
480                logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
481            ```
482        """
483        if not self.item_results:
484            return "No experiment results to display."
485
486        output = ""
487
488        # Individual results section
489        if include_item_results:
490            for i, result in enumerate(self.item_results):
491                output += f"\n{i + 1}. Item {i + 1}:\n"
492
493                # Extract and display input
494                item_input = None
495                if isinstance(result.item, dict):
496                    item_input = result.item.get("input")
497                elif hasattr(result.item, "input"):
498                    item_input = result.item.input
499
500                if item_input is not None:
501                    output += f"   Input:    {_format_value(item_input)}\n"
502
503                # Extract and display expected output
504                expected_output = None
505                if isinstance(result.item, dict):
506                    expected_output = result.item.get("expected_output")
507                elif hasattr(result.item, "expected_output"):
508                    expected_output = result.item.expected_output
509
510                if expected_output is not None:
511                    output += f"   Expected: {_format_value(expected_output)}\n"
512                output += f"   Actual:   {_format_value(result.output)}\n"
513
514                # Display evaluation scores
515                if result.evaluations:
516                    output += "   Scores:\n"
517                    for evaluation in result.evaluations:
518                        score = evaluation.value
519                        if isinstance(score, (int, float)):
520                            score = f"{score:.3f}"
521                        output += f"     • {evaluation.name}: {score}"
522                        if evaluation.comment:
523                            output += f"\n       💭 {evaluation.comment}"
524                        output += "\n"
525
526                # Display trace link if available
527                if result.trace_id:
528                    output += f"\n   Trace ID: {result.trace_id}\n"
529        else:
530            output += f"Individual Results: Hidden ({len(self.item_results)} items)\n"
531            output += "💡 Set include_item_results=True to view them\n"
532
533        # Experiment overview section
534        output += f"\n{'─' * 50}\n"
535        output += f"🧪 Experiment: {self.name}"
536        output += f"\n📋 Run name: {self.run_name}"
537        if self.description:
538            output += f" - {self.description}"
539
540        output += f"\n{len(self.item_results)} items"
541
542        # Collect unique evaluation names across all items
543        evaluation_names = set()
544        for result in self.item_results:
545            for evaluation in result.evaluations:
546                evaluation_names.add(evaluation.name)
547
548        if evaluation_names:
549            output += "\nEvaluations:"
550            for eval_name in evaluation_names:
551                output += f"\n  • {eval_name}"
552            output += "\n"
553
554        # Calculate and display average scores
555        if evaluation_names:
556            output += "\nAverage Scores:"
557            for eval_name in evaluation_names:
558                scores = []
559                for result in self.item_results:
560                    for evaluation in result.evaluations:
561                        if evaluation.name == eval_name and isinstance(
562                            evaluation.value, (int, float)
563                        ):
564                            scores.append(evaluation.value)
565
566                if scores:
567                    avg = sum(scores) / len(scores)
568                    output += f"\n  • {eval_name}: {avg:.3f}"
569            output += "\n"
570
571        # Display run-level evaluations
572        if self.run_evaluations:
573            output += "\nRun Evaluations:"
574            for run_eval in self.run_evaluations:
575                score = run_eval.value
576                if isinstance(score, (int, float)):
577                    score = f"{score:.3f}"
578                output += f"\n  • {run_eval.name}: {score}"
579                if run_eval.comment:
580                    output += f"\n    💭 {run_eval.comment}"
581            output += "\n"
582
583        # Add dataset run URL if available
584        if self.dataset_run_url:
585            output += f"\n🔗 Dataset Run:\n   {self.dataset_run_url}"
586
587        return output

Format the experiment result for human-readable display.

Converts the experiment result into a nicely formatted string suitable for console output, logging, or reporting. The output includes experiment overview, aggregate statistics, and optionally individual item details.

This method provides a comprehensive view of experiment performance including:

  • Experiment metadata (name, description, item count)
  • List of evaluation metrics used across items
  • Average scores computed across all processed items
  • Run-level evaluation results (aggregate metrics)
  • Links to view detailed results in Langfuse UI (when available)
  • Individual item details (when requested)
Arguments:
  • include_item_results: Whether to include detailed results for each individual item in the formatted output. When False (default), only shows aggregate statistics and summary information. When True, includes input/output/scores for every processed item, making the output significantly longer but more detailed for debugging and analysis purposes.
Returns:

A formatted multi-line string containing:

  • Experiment name and description (if provided)
  • Total number of items successfully processed
  • List of all evaluation metrics that were applied
  • Average scores across all items for each numeric metric
  • Run-level evaluation results with comments
  • Dataset run URL for viewing in Langfuse UI (if applicable)
  • Individual item details including inputs, outputs, and scores (if requested)
Examples:

Basic usage showing aggregate results only:

result = langfuse.run_experiment(
    name="Capital Cities",
    data=dataset,
    task=generate_capital,
    evaluators=[accuracy_evaluator]
)

print(result.format())
# Output:
# ──────────────────────────────────────────────────
# 📊 Capital Cities
# 100 items
# Evaluations:
#   • accuracy
# Average Scores:
#   • accuracy: 0.850

Detailed output including all individual item results:

detailed_report = result.format(include_item_results=True)
print(detailed_report)
# Output includes each item:
# 1. Item 1:
#    Input:    What is the capital of France?
#    Expected: Paris
#    Actual:   The capital of France is Paris.
#    Scores:
#      • accuracy: 1.000
#        💭 Correct answer found
# [... continues for all items ...]

Saving formatted results to file for reporting:

with open("experiment_report.txt", "w") as f:
    f.write(result.format(include_item_results=True))

# Or create summary report
summary = result.format()  # Aggregate view only
print(f"Experiment Summary:\n{summary}")

Integration with logging systems:

import logging
logger = logging.getLogger("experiments")

# Log summary after experiment
logger.info(f"Experiment completed:\n{result.format()}")

# Log detailed results for failed experiments
if any(run_eval.value < threshold for run_eval in result.run_evaluations):
    logger.warning(f"Poor performance detected:\n{result.format(include_item_results=True)}")
class TaskFunction(typing.Protocol):
590class TaskFunction(Protocol):
591    """Protocol defining the interface for experiment task functions.
592
593    Task functions are the core processing functions that operate on each item
594    in an experiment dataset. They receive an experiment item as input and
595    produce some output that will be evaluated.
596
597    Task functions must:
598    - Accept 'item' as a keyword argument
599    - Return any type of output (will be passed to evaluators)
600    - Can be either synchronous or asynchronous
601    - Should handle their own errors gracefully (exceptions will be logged)
602    """
603
604    def __call__(
605        self,
606        *,
607        item: ExperimentItem,
608        **kwargs: Dict[str, Any],
609    ) -> Union[Any, Awaitable[Any]]:
610        """Execute the task on an experiment item.
611
612        This method defines the core processing logic for each item in your experiment.
613        The implementation should focus on the specific task you want to evaluate,
614        such as text generation, classification, summarization, etc.
615
616        Args:
617            item: The experiment item to process. Can be either:
618                - Dict with keys like 'input', 'expected_output', 'metadata'
619                - Langfuse DatasetItem object with .input, .expected_output attributes
620            **kwargs: Additional keyword arguments that may be passed by the framework
621
622        Returns:
623            Any: The output of processing the item. This output will be:
624            - Stored in the experiment results
625            - Passed to all item-level evaluators for assessment
626            - Traced automatically in Langfuse for observability
627
628            Can return either a direct value or an awaitable (async) result.
629
630        Examples:
631            Simple synchronous task:
632            ```python
633            def my_task(*, item, **kwargs):
634                prompt = f"Summarize: {item['input']}"
635                return my_llm_client.generate(prompt)
636            ```
637
638            Async task with error handling:
639            ```python
640            async def my_async_task(*, item, **kwargs):
641                try:
642                    response = await openai_client.chat.completions.create(
643                        model="gpt-4",
644                        messages=[{"role": "user", "content": item["input"]}]
645                    )
646                    return response.choices[0].message.content
647                except Exception as e:
648                    # Log error and return fallback
649                    print(f"Task failed for item {item}: {e}")
650                    return "Error: Could not process item"
651            ```
652
653            Task using dataset item attributes:
654            ```python
655            def classification_task(*, item, **kwargs):
656                # Works with both dict items and DatasetItem objects
657                text = item["input"] if isinstance(item, dict) else item.input
658                return classify_text(text)
659            ```
660        """
661        ...

Protocol defining the interface for experiment task functions.

Task functions are the core processing functions that operate on each item in an experiment dataset. They receive an experiment item as input and produce some output that will be evaluated.

Task functions (a minimal usage sketch follows this list):

  • must accept 'item' as a keyword argument
  • may return output of any type (it is passed to the evaluators)
  • can be either synchronous or asynchronous
  • should handle their own errors gracefully (exceptions will be logged)
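
A task satisfying this protocol is passed to run_experiment via the task parameter, as in the ExperimentResult examples above. A minimal sketch (summarize is a placeholder for your own model call; evaluators are omitted here for brevity and can be added as in the earlier examples):

def summarization_task(*, item, **kwargs):
    # Works for both dict items and Langfuse dataset items
    text = item["input"] if isinstance(item, dict) else item.input
    return summarize(text)  # placeholder for your LLM or pipeline call

result = langfuse.run_experiment(
    name="Summarization Baseline",
    data=[{"input": "Langfuse is an open-source LLM engineering platform."}],
    task=summarization_task,
)
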
class EvaluatorFunction(typing.Protocol):
664class EvaluatorFunction(Protocol):
665    """Protocol defining the interface for item-level evaluator functions.
666
667    Item-level evaluators assess the quality, correctness, or other properties
668    of individual task outputs. They receive the input, output, expected output,
669    and metadata for each item and return evaluation metrics.
670
671    Evaluators should:
672    - Accept input, output, expected_output, and metadata as keyword arguments
673    - Return Evaluation dict(s) with 'name', 'value', 'comment', 'metadata' fields
674    - Be deterministic when possible for reproducible results
675    - Handle edge cases gracefully (missing expected output, malformed data, etc.)
676    - Can be either synchronous or asynchronous
677    """
678
679    def __call__(
680        self,
681        *,
682        input: Any,
683        output: Any,
684        expected_output: Any,
685        metadata: Optional[Dict[str, Any]],
686        **kwargs: Dict[str, Any],
687    ) -> Union[
688        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
689    ]:
690        r"""Evaluate a task output for quality, correctness, or other metrics.
691
692        This method should implement specific evaluation logic such as accuracy checking,
693        similarity measurement, toxicity detection, fluency assessment, etc.
694
695        Args:
696            input: The original input that was passed to the task function.
697                This is typically the item['input'] or item.input value.
698            output: The output produced by the task function for this input.
699                This is the direct return value from your task function.
700            expected_output: The expected/ground truth output for comparison.
701                May be None if not available in the dataset. Evaluators should
702                handle this case appropriately.
703            metadata: Optional metadata from the experiment item that might
704                contain additional context for evaluation (categories, difficulty, etc.)
705            **kwargs: Additional keyword arguments that may be passed by the framework
706
707        Returns:
708            Evaluation results in one of these formats:
709            - Single Evaluation dict: {"name": "accuracy", "value": 0.85, "comment": "..."}
710            - List of Evaluation dicts: [{"name": "precision", ...}, {"name": "recall", ...}]
711            - Awaitable returning either of the above (for async evaluators)
712
713            Each Evaluation dict should contain:
714            - name (str): Unique identifier for this evaluation metric
715            - value (int|float|str|bool): The evaluation score or result
716            - comment (str, optional): Human-readable explanation of the result
717            - metadata (dict, optional): Additional structured data about the evaluation
718
719        Examples:
720            Simple accuracy evaluator:
721            ```python
722            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
723                if expected_output is None:
724                    return {"name": "accuracy", "value": None, "comment": "No expected output"}
725
726                is_correct = output.strip().lower() == expected_output.strip().lower()
727                return {
728                    "name": "accuracy",
729                    "value": 1.0 if is_correct else 0.0,
730                    "comment": "Exact match" if is_correct else "No match"
731                }
732            ```
733
734            Multi-metric evaluator:
735            ```python
736            def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
737                results = []
738
739                # Length check
740                results.append({
741                    "name": "output_length",
742                    "value": len(output),
743                    "comment": f"Output contains {len(output)} characters"
744                })
745
746                # Sentiment analysis
747                sentiment_score = analyze_sentiment(output)
748                results.append({
749                    "name": "sentiment",
750                    "value": sentiment_score,
751                    "comment": f"Sentiment score: {sentiment_score:.2f}"
752                })
753
754                return results
755            ```
756
757            Async evaluator using external API:
758            ```python
759            async def llm_judge_evaluator(*, input, output, expected_output=None, **kwargs):
760                prompt = f"Rate the quality of this response on a scale of 1-10:\n"
761                prompt += f"Question: {input}\nResponse: {output}"
762
763                response = await openai_client.chat.completions.create(
764                    model="gpt-4",
765                    messages=[{"role": "user", "content": prompt}]
766                )
767
768                try:
769                    score = float(response.choices[0].message.content.strip())
770                    return {
771                        "name": "llm_judge_quality",
772                        "value": score,
773                        "comment": f"LLM judge rated this {score}/10"
774                    }
775                except ValueError:
776                    return {
777                        "name": "llm_judge_quality",
778                        "value": None,
779                        "comment": "Could not parse LLM judge score"
780                    }
781            ```
782
783            Context-aware evaluator:
784            ```python
785            def context_evaluator(*, input, output, metadata=None, **kwargs):
786                # Use metadata for context-specific evaluation
787                difficulty = metadata.get("difficulty", "medium") if metadata else "medium"
788
789                # Adjust expectations based on difficulty
790                min_length = {"easy": 50, "medium": 100, "hard": 150}[difficulty]
791
792                meets_requirement = len(output) >= min_length
793                return {
794                    "name": f"meets_{difficulty}_requirement",
795                    "value": meets_requirement,
796                    "comment": f"Output {'meets' if meets_requirement else 'fails'} {difficulty} length requirement"
797                }
798            ```
799        """
800        ...

Protocol defining the interface for item-level evaluator functions.

Item-level evaluators assess the quality, correctness, or other properties of individual task outputs. They receive the input, output, expected output, and metadata for each item and return evaluation metrics.

Evaluators (a sketch using the object return form follows this list):

  • should accept input, output, expected_output, and metadata as keyword arguments
  • should return Evaluation object(s), or dict(s) with 'name', 'value', 'comment', 'metadata' fields
  • should be deterministic when possible for reproducible results
  • should handle edge cases gracefully (missing expected output, malformed data, etc.)
  • can be either synchronous or asynchronous
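
The return annotation above is written in terms of Evaluation, and the Evaluation class examples earlier in this module return instances directly, so the object form should work just as well as plain dicts; a brief sketch using the object form:

from langfuse import Evaluation

def exact_match_evaluator(*, input, output, expected_output=None, **kwargs):
    # Returns an Evaluation instance rather than a plain dict
    if expected_output is None:
        return Evaluation(name="exact_match", value=None, comment="No expected output")

    matched = str(output).strip() == str(expected_output).strip()
    return Evaluation(
        name="exact_match",
        value=1.0 if matched else 0.0,
        comment="Exact match" if matched else "Outputs differ"
    )
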
class RunEvaluatorFunction(typing.Protocol):
803class RunEvaluatorFunction(Protocol):
804    """Protocol defining the interface for run-level evaluator functions.
805
806    Run-level evaluators assess aggregate properties of the entire experiment run,
807    computing metrics that span across all items rather than individual outputs.
808    They receive the complete results from all processed items and can compute
809    statistics like averages, distributions, correlations, or other aggregate metrics.
810
811    Run evaluators should:
812    - Accept item_results as a keyword argument containing all item results
813    - Return Evaluation dict(s) with aggregate metrics
814    - Handle cases where some items may have failed processing
815    - Compute meaningful statistics across the dataset
816    - Can be either synchronous or asynchronous
817    """
818
819    def __call__(
820        self,
821        *,
822        item_results: List[ExperimentItemResult],
823        **kwargs: Dict[str, Any],
824    ) -> Union[
825        Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]
826    ]:
827        r"""Evaluate the entire experiment run with aggregate metrics.
828
829        This method should implement aggregate evaluation logic such as computing
830        averages, calculating distributions, finding correlations, detecting patterns
831        across items, or performing statistical analysis on the experiment results.
832
833        Args:
834            item_results: List of results from all successfully processed experiment items.
835                Each item result contains:
836                - item: The original experiment item
837                - output: The task function's output for this item
838                - evaluations: List of item-level evaluation results
839                - trace_id: Langfuse trace ID for this execution
840                - dataset_run_id: Dataset run ID (if using Langfuse datasets)
841
842                Note: This list only includes items that were successfully processed.
843                Failed items are excluded but logged separately.
844            **kwargs: Additional keyword arguments that may be passed by the framework
845
846        Returns:
847            Evaluation results in one of these formats:
848            - Single Evaluation dict: {"name": "avg_accuracy", "value": 0.78, "comment": "..."}
849            - List of Evaluation dicts: [{"name": "mean", ...}, {"name": "std_dev", ...}]
850            - Awaitable returning either of the above (for async evaluators)
851
852            Each Evaluation dict should contain:
853            - name (str): Unique identifier for this run-level metric
854            - value (int|float|str|bool): The aggregate evaluation result
855            - comment (str, optional): Human-readable explanation of the metric
856            - metadata (dict, optional): Additional structured data about the evaluation
857
858        Examples:
859            Average accuracy calculator:
860            ```python
861            def average_accuracy(*, item_results, **kwargs):
862                if not item_results:
863                    return {"name": "avg_accuracy", "value": 0.0, "comment": "No results"}
864
865                accuracy_values = []
866                for result in item_results:
867                    for evaluation in result.evaluations:
868                        if evaluation.name == "accuracy":
869                            accuracy_values.append(evaluation.value)
870
871                if not accuracy_values:
872                    return {"name": "avg_accuracy", "value": None, "comment": "No accuracy evaluations found"}
873
874                avg = sum(accuracy_values) / len(accuracy_values)
875                return {
876                    "name": "avg_accuracy",
877                    "value": avg,
878                    "comment": f"Average accuracy across {len(accuracy_values)} items: {avg:.2%}"
879                }
880            ```
881
882            Multiple aggregate metrics:
883            ```python
884            def statistical_summary(*, item_results, **kwargs):
885                if not item_results:
886                    return []
887
888                results = []
889
890                # Calculate output length statistics
891                lengths = [len(str(result.output)) for result in item_results]
892                results.extend([
893                    {"name": "avg_output_length", "value": sum(lengths) / len(lengths)},
894                    {"name": "min_output_length", "value": min(lengths)},
895                    {"name": "max_output_length", "value": max(lengths)}
896                ])
897
898                # Success rate
899                total_items = len(item_results)  # Only successful items are included
900                results.append({
901                    "name": "processing_success_rate",
902                    "value": 1.0,  # All items in item_results succeeded
903                    "comment": f"Successfully processed {total_items} items"
904                })
905
906                return results
907            ```
908
909            Async run evaluator with external analysis:
910            ```python
911            async def llm_batch_analysis(*, item_results, **kwargs):
912                # Prepare batch analysis prompt
913                outputs = [result.output for result in item_results]
914                prompt = f"Analyze these {len(outputs)} outputs for common themes:\n"
915                prompt += "\n".join(f"{i+1}. {output}" for i, output in enumerate(outputs))
916
917                response = await openai_client.chat.completions.create(
918                    model="gpt-4",
919                    messages=[{"role": "user", "content": prompt}]
920                )
921
922                return {
923                    "name": "thematic_analysis",
924                    "value": response.choices[0].message.content,
925                    "comment": f"LLM analysis of {len(outputs)} outputs"
926                }
927            ```
928
929            Performance distribution analysis:
930            ```python
931            def performance_distribution(*, item_results, **kwargs):
932                # Extract all evaluation scores
933                all_scores = []
934                score_by_metric = {}
935
936                for result in item_results:
937                    for evaluation in result.evaluations:
938                        metric_name = evaluation.name
939                        value = evaluation.value
940
941                        if isinstance(value, (int, float)):
942                            all_scores.append(value)
943                            if metric_name not in score_by_metric:
944                                score_by_metric[metric_name] = []
945                            score_by_metric[metric_name].append(value)
946
947                results = []
948
949                # Overall score distribution
950                if all_scores:
951                    import statistics
952                    results.append({
953                        "name": "score_std_dev",
954                        "value": statistics.stdev(all_scores) if len(all_scores) > 1 else 0,
955                        "comment": "Standard deviation across all numeric scores"
956                    })
957
958                # Per-metric statistics
959                for metric, scores in score_by_metric.items():
960                    if len(scores) > 1:
961                        results.append({
962                            "name": f"{metric}_variance",
963                            "value": statistics.variance(scores),
964                            "comment": f"Variance in {metric} across {len(scores)} items"
965                        })
966
967                return results
968            ```
969        """
970        ...

Protocol defining the interface for run-level evaluator functions.

Run-level evaluators assess aggregate properties of the entire experiment run, computing metrics that span across all items rather than individual outputs. They receive the complete results from all processed items and can compute statistics like averages, distributions, correlations, or other aggregate metrics.

Run evaluators should:

  • Accept item_results as a keyword argument containing all item results
  • Return Evaluation dict(s) with aggregate metrics
  • Handle cases where some items may have failed processing
  • Compute meaningful statistics across the dataset
  • Be either synchronous or asynchronous (see the sketch below)
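
For orientation, here is a minimal, self-contained sketch of a conforming run-level evaluator. It follows the access pattern used by the examples in the source above (`result.evaluations`, `evaluation.value`) and returns a plain dict; the `median_score` metric name and the function itself are illustrative, not part of the SDK.

```python
import statistics
from typing import Any, Dict, List


def median_score(*, item_results: List[Any], **kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Illustrative run-level evaluator: median of all numeric item-level values."""
    values = [
        evaluation.value
        for result in item_results
        for evaluation in result.evaluations
        if isinstance(evaluation.value, (int, float))
    ]
    if not values:
        # No numeric item-level scores were produced; report that explicitly.
        return {"name": "median_score", "value": None, "comment": "No numeric scores found"}
    return {
        "name": "median_score",
        "value": statistics.median(values),
        "comment": f"Median over {len(values)} item-level score values",
    }
```

Such a function is typically passed alongside item-level evaluators when starting an experiment run (for example via a `run_evaluators` argument, if the experiment runner in this module exposes one); it is called once with the results of all successfully processed items.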
def create_evaluator_from_autoevals(autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]]) -> EvaluatorFunction:
1018def create_evaluator_from_autoevals(
1019    autoevals_evaluator: Any, **kwargs: Optional[Dict[str, Any]]
1020) -> EvaluatorFunction:
1021    """Create a Langfuse evaluator from an autoevals evaluator.
1022
1023    Args:
1024        autoevals_evaluator: An autoevals evaluator instance
1025        **kwargs: Additional arguments passed to the evaluator
1026
1027    Returns:
1028        A Langfuse-compatible evaluator function
1029    """
1030
1031    def langfuse_evaluator(
1032        *,
1033        input: Any,
1034        output: Any,
1035        expected_output: Any,
1036        metadata: Optional[Dict[str, Any]],
1037        **langfuse_kwargs: Dict[str, Any],
1038    ) -> Evaluation:
1039        evaluation = autoevals_evaluator(
1040            input=input, output=output, expected=expected_output, **kwargs
1041        )
1042
1043        return Evaluation(
1044            name=evaluation.name, value=evaluation.score, metadata=evaluation.metadata
1045        )
1046
1047    return langfuse_evaluator

Create a Langfuse evaluator from an autoevals evaluator.

Arguments:
  • autoevals_evaluator: An autoevals evaluator instance
  • **kwargs: Additional arguments passed to the evaluator

Returns:

A Langfuse-compatible evaluator function
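
For a quick orientation, below is a hedged usage sketch. It assumes the third-party `autoevals` package is installed and exposes a `Levenshtein` string-similarity scorer (as in its public documentation); the wrapper is imported from the module documented on this page.

```python
# Usage sketch: wrap an autoevals scorer so it can be used as an item-level
# Langfuse evaluator. `Levenshtein` is an assumption about the autoevals API;
# substitute whichever autoevals scorer you actually use.
from autoevals import Levenshtein

from langfuse.experiment import create_evaluator_from_autoevals

levenshtein_evaluator = create_evaluator_from_autoevals(Levenshtein())

# The wrapped callable follows the item-level evaluator keyword interface and
# returns an Evaluation built from the autoevals result (name, score, metadata).
evaluation = levenshtein_evaluator(
    input="What is the capital of France?",
    output="Paris",
    expected_output="Paris",
    metadata=None,
)
```

The resulting function can then be included in the list of item-level evaluators for an experiment run, just like any hand-written evaluator.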