langfuse

Langfuse Python SDK

Installation

Important

The SDK was rewritten in v3 and released in June 2025. Refer to the v3 migration guide for instructions on updating your code.

pip install langfuse

Docs

Please see our docs for detailed information on this SDK.
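
A minimal usage sketch (adapted from the client docs below; assumes LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, and optionally LANGFUSE_BASE_URL are set as environment variables):

```python
from langfuse import Langfuse

langfuse = Langfuse()  # reads credentials and base URL from environment variables

with langfuse.start_as_current_span(name="process-query") as span:
    # Nested generation observation for an LLM call
    with langfuse.start_as_current_observation(
        name="generate-response", as_type="generation", model="gpt-4"
    ) as generation:
        generation.update(output="AI is a field of computer science...")
    span.update(output="done")
```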

 1""".. include:: ../README.md"""
 2
 3from langfuse.batch_evaluation import (
 4    BatchEvaluationResult,
 5    BatchEvaluationResumeToken,
 6    CompositeEvaluatorFunction,
 7    EvaluatorInputs,
 8    EvaluatorStats,
 9    MapperFunction,
10)
11from langfuse.experiment import Evaluation
12
13from ._client import client as _client_module
14from ._client.attributes import LangfuseOtelSpanAttributes
15from ._client.constants import ObservationTypeLiteral
16from ._client.get_client import get_client
17from ._client.observe import observe
18from ._client.propagation import propagate_attributes
19from ._client.span import (
20    LangfuseAgent,
21    LangfuseChain,
22    LangfuseEmbedding,
23    LangfuseEvaluator,
24    LangfuseEvent,
25    LangfuseGeneration,
26    LangfuseGuardrail,
27    LangfuseRetriever,
28    LangfuseSpan,
29    LangfuseTool,
30)
31
32Langfuse = _client_module.Langfuse
33
34__all__ = [
35    "Langfuse",
36    "get_client",
37    "observe",
38    "propagate_attributes",
39    "ObservationTypeLiteral",
40    "LangfuseSpan",
41    "LangfuseGeneration",
42    "LangfuseEvent",
43    "LangfuseOtelSpanAttributes",
44    "LangfuseAgent",
45    "LangfuseTool",
46    "LangfuseChain",
47    "LangfuseEmbedding",
48    "LangfuseEvaluator",
49    "LangfuseRetriever",
50    "LangfuseGuardrail",
51    "Evaluation",
52    "EvaluatorInputs",
53    "MapperFunction",
54    "CompositeEvaluatorFunction",
55    "EvaluatorStats",
56    "BatchEvaluationResumeToken",
57    "BatchEvaluationResult",
58    "experiment",
59    "api",
60]
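
For orientation, a short sketch of how the exports listed above are typically used together (the decorated function, its arguments, and the final flush call are illustrative; `@observe` as a decorator and `get_client` as the singleton accessor follow this SDK's documented usage):

```python
from langfuse import get_client, observe

@observe()  # wraps the function call in a Langfuse observation
def answer(question: str) -> str:
    return "42"

answer("What is the meaning of life?")

client = get_client()  # shared Langfuse client for the configured API keys
client.flush()         # send any batched spans before the process exits
```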
class Langfuse:
 129class Langfuse:
 130    """Main client for Langfuse tracing and platform features.
 131
 132    This class provides an interface for creating and managing traces, spans,
 133    and generations in Langfuse as well as interacting with the Langfuse API.
 134
 135    The client features a thread-safe singleton pattern for each unique public API key,
 136    ensuring consistent trace context propagation across your application. It implements
 137    efficient batching of spans with configurable flush settings and includes background
 138    thread management for media uploads and score ingestion.
 139
 140    Configuration is flexible through either direct parameters or environment variables,
 141    with graceful fallbacks and runtime configuration updates.
 142
 143    Attributes:
 144        api: Synchronous API client for Langfuse backend communication
 145        async_api: Asynchronous API client for Langfuse backend communication
 146        _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
 147
 148    Parameters:
 149        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
 150        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
 151        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
 152        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
 153        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
 154        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
 155        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
 156        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
 157        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
 158        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
 159        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
 160        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
 161        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
 162        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
 163        mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
 164        blocked_instrumentation_scopes (Optional[List[str]]): List of instrumentation scope names to block from being exported to Langfuse. Spans from these scopes will be filtered out before being sent to the API. Useful for filtering out spans from specific libraries or frameworks. For exported spans, you can see the instrumentation scope name in the span metadata in Langfuse (`metadata.scope.name`).
 165        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well.
 166        tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. Useful for keeping Langfuse tracing disconnected from other OpenTelemetry-span-emitting libraries. Note: to track active spans, the context is still shared between TracerProviders, which may lead to broken trace trees.
 167
 168    Example:
 169        ```python
 170        from langfuse import Langfuse
 171
 172        # Initialize the client (reads from env vars if not provided)
 173        langfuse = Langfuse(
 174            public_key="your-public-key",
 175            secret_key="your-secret-key",
 176            base_url="https://cloud.langfuse.com",  # Optional, default shown
 177        )
 178
 179        # Create a trace span
 180        with langfuse.start_as_current_span(name="process-query") as span:
 181            # Your application code here
 182
 183            # Create a nested generation span for an LLM call
 184            with span.start_as_current_generation(
 185                name="generate-response",
 186                model="gpt-4",
 187                input={"query": "Tell me about AI"},
 188                model_parameters={"temperature": 0.7, "max_tokens": 500}
 189            ) as generation:
 190                # Generate response here
 191                response = "AI is a field of computer science..."
 192
 193                generation.update(
 194                    output=response,
 195                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
 196                    cost_details={"total_cost": 0.0023}
 197                )
 198
 199                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
 200                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
 201        ```
 202    """
 203
 204    _resources: Optional[LangfuseResourceManager] = None
 205    _mask: Optional[MaskFunction] = None
 206    _otel_tracer: otel_trace_api.Tracer
 207
 208    def __init__(
 209        self,
 210        *,
 211        public_key: Optional[str] = None,
 212        secret_key: Optional[str] = None,
 213        base_url: Optional[str] = None,
 214        host: Optional[str] = None,
 215        timeout: Optional[int] = None,
 216        httpx_client: Optional[httpx.Client] = None,
 217        debug: bool = False,
 218        tracing_enabled: Optional[bool] = True,
 219        flush_at: Optional[int] = None,
 220        flush_interval: Optional[float] = None,
 221        environment: Optional[str] = None,
 222        release: Optional[str] = None,
 223        media_upload_thread_count: Optional[int] = None,
 224        sample_rate: Optional[float] = None,
 225        mask: Optional[MaskFunction] = None,
 226        blocked_instrumentation_scopes: Optional[List[str]] = None,
 227        additional_headers: Optional[Dict[str, str]] = None,
 228        tracer_provider: Optional[TracerProvider] = None,
 229    ):
 230        self._base_url = (
 231            base_url
 232            or os.environ.get(LANGFUSE_BASE_URL)
 233            or host
 234            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
 235        )
 236        self._environment = environment or cast(
 237            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
 238        )
 239        self._project_id: Optional[str] = None
 240        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
 241        if not 0.0 <= sample_rate <= 1.0:
 242            raise ValueError(
 243                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
 244            )
 245
 246        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
 247
 248        self._tracing_enabled = (
 249            tracing_enabled
 250            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
 251        )
 252        if not self._tracing_enabled:
 253            langfuse_logger.info(
 254                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
 255            )
 256
 257        debug = (
 258            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
 259        )
 260        if debug:
 261            logging.basicConfig(
 262                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 263            )
 264            langfuse_logger.setLevel(logging.DEBUG)
 265
 266        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
 267        if public_key is None:
 268            langfuse_logger.warning(
 269                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
 270                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
 271            )
 272            self._otel_tracer = otel_trace_api.NoOpTracer()
 273            return
 274
 275        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
 276        if secret_key is None:
 277            langfuse_logger.warning(
 278                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
 279                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
 280            )
 281            self._otel_tracer = otel_trace_api.NoOpTracer()
 282            return
 283
 284        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
 285            langfuse_logger.warning(
 286                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
 287            )
 288
 289        # Initialize api and tracer if requirements are met
 290        self._resources = LangfuseResourceManager(
 291            public_key=public_key,
 292            secret_key=secret_key,
 293            base_url=self._base_url,
 294            timeout=timeout,
 295            environment=self._environment,
 296            release=release,
 297            flush_at=flush_at,
 298            flush_interval=flush_interval,
 299            httpx_client=httpx_client,
 300            media_upload_thread_count=media_upload_thread_count,
 301            sample_rate=sample_rate,
 302            mask=mask,
 303            tracing_enabled=self._tracing_enabled,
 304            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
 305            additional_headers=additional_headers,
 306            tracer_provider=tracer_provider,
 307        )
 308        self._mask = self._resources.mask
 309
 310        self._otel_tracer = (
 311            self._resources.tracer
 312            if self._tracing_enabled and self._resources.tracer is not None
 313            else otel_trace_api.NoOpTracer()
 314        )
 315        self.api = self._resources.api
 316        self.async_api = self._resources.async_api
 317
 318    def start_span(
 319        self,
 320        *,
 321        trace_context: Optional[TraceContext] = None,
 322        name: str,
 323        input: Optional[Any] = None,
 324        output: Optional[Any] = None,
 325        metadata: Optional[Any] = None,
 326        version: Optional[str] = None,
 327        level: Optional[SpanLevel] = None,
 328        status_message: Optional[str] = None,
 329    ) -> LangfuseSpan:
 330        """Create a new span for tracing a unit of work.
 331
 332        This method creates a new span but does not set it as the current span in the
 333        context. To create and use a span within a context, use start_as_current_span().
 334
 335        The created span will be the child of the current span in the context.
 336
 337        Args:
 338            trace_context: Optional context for connecting to an existing trace
 339            name: Name of the span (e.g., function or operation name)
 340            input: Input data for the operation (can be any JSON-serializable object)
 341            output: Output data from the operation (can be any JSON-serializable object)
 342            metadata: Additional metadata to associate with the span
 343            version: Version identifier for the code or component
 344            level: Importance level of the span (info, warning, error)
 345            status_message: Optional status message for the span
 346
 347        Returns:
 348            A LangfuseSpan object that must be ended with .end() when the operation completes
 349
 350        Example:
 351            ```python
 352            span = langfuse.start_span(name="process-data")
 353            try:
 354                # Do work
 355                span.update(output="result")
 356            finally:
 357                span.end()
 358            ```
 359        """
 360        return self.start_observation(
 361            trace_context=trace_context,
 362            name=name,
 363            as_type="span",
 364            input=input,
 365            output=output,
 366            metadata=metadata,
 367            version=version,
 368            level=level,
 369            status_message=status_message,
 370        )
 371
 372    def start_as_current_span(
 373        self,
 374        *,
 375        trace_context: Optional[TraceContext] = None,
 376        name: str,
 377        input: Optional[Any] = None,
 378        output: Optional[Any] = None,
 379        metadata: Optional[Any] = None,
 380        version: Optional[str] = None,
 381        level: Optional[SpanLevel] = None,
 382        status_message: Optional[str] = None,
 383        end_on_exit: Optional[bool] = None,
 384    ) -> _AgnosticContextManager[LangfuseSpan]:
 385        """Create a new span and set it as the current span in a context manager.
 386
 387        This method creates a new span and sets it as the current span within a context
 388        manager. Use this method with a 'with' statement to automatically handle span
 389        lifecycle within a code block.
 390
 391        The created span will be the child of the current span in the context.
 392
 393        Args:
 394            trace_context: Optional context for connecting to an existing trace
 395            name: Name of the span (e.g., function or operation name)
 396            input: Input data for the operation (can be any JSON-serializable object)
 397            output: Output data from the operation (can be any JSON-serializable object)
 398            metadata: Additional metadata to associate with the span
 399            version: Version identifier for the code or component
 400            level: Importance level of the span (info, warning, error)
 401            status_message: Optional status message for the span
 402            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 403
 404        Returns:
 405            A context manager that yields a LangfuseSpan
 406
 407        Example:
 408            ```python
 409            with langfuse.start_as_current_span(name="process-query") as span:
 410                # Do work
 411                result = process_data()
 412                span.update(output=result)
 413
 414                # Create a child span automatically
 415                with span.start_as_current_span(name="sub-operation") as child_span:
 416                    # Do sub-operation work
 417                    child_span.update(output="sub-result")
 418            ```
 419        """
 420        return self.start_as_current_observation(
 421            trace_context=trace_context,
 422            name=name,
 423            as_type="span",
 424            input=input,
 425            output=output,
 426            metadata=metadata,
 427            version=version,
 428            level=level,
 429            status_message=status_message,
 430            end_on_exit=end_on_exit,
 431        )
 432
 433    @overload
 434    def start_observation(
 435        self,
 436        *,
 437        trace_context: Optional[TraceContext] = None,
 438        name: str,
 439        as_type: Literal["generation"],
 440        input: Optional[Any] = None,
 441        output: Optional[Any] = None,
 442        metadata: Optional[Any] = None,
 443        version: Optional[str] = None,
 444        level: Optional[SpanLevel] = None,
 445        status_message: Optional[str] = None,
 446        completion_start_time: Optional[datetime] = None,
 447        model: Optional[str] = None,
 448        model_parameters: Optional[Dict[str, MapValue]] = None,
 449        usage_details: Optional[Dict[str, int]] = None,
 450        cost_details: Optional[Dict[str, float]] = None,
 451        prompt: Optional[PromptClient] = None,
 452    ) -> LangfuseGeneration: ...
 453
 454    @overload
 455    def start_observation(
 456        self,
 457        *,
 458        trace_context: Optional[TraceContext] = None,
 459        name: str,
 460        as_type: Literal["span"] = "span",
 461        input: Optional[Any] = None,
 462        output: Optional[Any] = None,
 463        metadata: Optional[Any] = None,
 464        version: Optional[str] = None,
 465        level: Optional[SpanLevel] = None,
 466        status_message: Optional[str] = None,
 467    ) -> LangfuseSpan: ...
 468
 469    @overload
 470    def start_observation(
 471        self,
 472        *,
 473        trace_context: Optional[TraceContext] = None,
 474        name: str,
 475        as_type: Literal["agent"],
 476        input: Optional[Any] = None,
 477        output: Optional[Any] = None,
 478        metadata: Optional[Any] = None,
 479        version: Optional[str] = None,
 480        level: Optional[SpanLevel] = None,
 481        status_message: Optional[str] = None,
 482    ) -> LangfuseAgent: ...
 483
 484    @overload
 485    def start_observation(
 486        self,
 487        *,
 488        trace_context: Optional[TraceContext] = None,
 489        name: str,
 490        as_type: Literal["tool"],
 491        input: Optional[Any] = None,
 492        output: Optional[Any] = None,
 493        metadata: Optional[Any] = None,
 494        version: Optional[str] = None,
 495        level: Optional[SpanLevel] = None,
 496        status_message: Optional[str] = None,
 497    ) -> LangfuseTool: ...
 498
 499    @overload
 500    def start_observation(
 501        self,
 502        *,
 503        trace_context: Optional[TraceContext] = None,
 504        name: str,
 505        as_type: Literal["chain"],
 506        input: Optional[Any] = None,
 507        output: Optional[Any] = None,
 508        metadata: Optional[Any] = None,
 509        version: Optional[str] = None,
 510        level: Optional[SpanLevel] = None,
 511        status_message: Optional[str] = None,
 512    ) -> LangfuseChain: ...
 513
 514    @overload
 515    def start_observation(
 516        self,
 517        *,
 518        trace_context: Optional[TraceContext] = None,
 519        name: str,
 520        as_type: Literal["retriever"],
 521        input: Optional[Any] = None,
 522        output: Optional[Any] = None,
 523        metadata: Optional[Any] = None,
 524        version: Optional[str] = None,
 525        level: Optional[SpanLevel] = None,
 526        status_message: Optional[str] = None,
 527    ) -> LangfuseRetriever: ...
 528
 529    @overload
 530    def start_observation(
 531        self,
 532        *,
 533        trace_context: Optional[TraceContext] = None,
 534        name: str,
 535        as_type: Literal["evaluator"],
 536        input: Optional[Any] = None,
 537        output: Optional[Any] = None,
 538        metadata: Optional[Any] = None,
 539        version: Optional[str] = None,
 540        level: Optional[SpanLevel] = None,
 541        status_message: Optional[str] = None,
 542    ) -> LangfuseEvaluator: ...
 543
 544    @overload
 545    def start_observation(
 546        self,
 547        *,
 548        trace_context: Optional[TraceContext] = None,
 549        name: str,
 550        as_type: Literal["embedding"],
 551        input: Optional[Any] = None,
 552        output: Optional[Any] = None,
 553        metadata: Optional[Any] = None,
 554        version: Optional[str] = None,
 555        level: Optional[SpanLevel] = None,
 556        status_message: Optional[str] = None,
 557        completion_start_time: Optional[datetime] = None,
 558        model: Optional[str] = None,
 559        model_parameters: Optional[Dict[str, MapValue]] = None,
 560        usage_details: Optional[Dict[str, int]] = None,
 561        cost_details: Optional[Dict[str, float]] = None,
 562        prompt: Optional[PromptClient] = None,
 563    ) -> LangfuseEmbedding: ...
 564
 565    @overload
 566    def start_observation(
 567        self,
 568        *,
 569        trace_context: Optional[TraceContext] = None,
 570        name: str,
 571        as_type: Literal["guardrail"],
 572        input: Optional[Any] = None,
 573        output: Optional[Any] = None,
 574        metadata: Optional[Any] = None,
 575        version: Optional[str] = None,
 576        level: Optional[SpanLevel] = None,
 577        status_message: Optional[str] = None,
 578    ) -> LangfuseGuardrail: ...
 579
 580    def start_observation(
 581        self,
 582        *,
 583        trace_context: Optional[TraceContext] = None,
 584        name: str,
 585        as_type: ObservationTypeLiteralNoEvent = "span",
 586        input: Optional[Any] = None,
 587        output: Optional[Any] = None,
 588        metadata: Optional[Any] = None,
 589        version: Optional[str] = None,
 590        level: Optional[SpanLevel] = None,
 591        status_message: Optional[str] = None,
 592        completion_start_time: Optional[datetime] = None,
 593        model: Optional[str] = None,
 594        model_parameters: Optional[Dict[str, MapValue]] = None,
 595        usage_details: Optional[Dict[str, int]] = None,
 596        cost_details: Optional[Dict[str, float]] = None,
 597        prompt: Optional[PromptClient] = None,
 598    ) -> Union[
 599        LangfuseSpan,
 600        LangfuseGeneration,
 601        LangfuseAgent,
 602        LangfuseTool,
 603        LangfuseChain,
 604        LangfuseRetriever,
 605        LangfuseEvaluator,
 606        LangfuseEmbedding,
 607        LangfuseGuardrail,
 608    ]:
 609        """Create a new observation of the specified type.
 610
 611        This method creates a new observation but does not set it as the current span in the
 612        context. To create and use an observation within a context, use start_as_current_observation().
 613
 614        Args:
 615            trace_context: Optional context for connecting to an existing trace
 616            name: Name of the observation
 617            as_type: Type of observation to create (defaults to "span")
 618            input: Input data for the operation
 619            output: Output data from the operation
 620            metadata: Additional metadata to associate with the observation
 621            version: Version identifier for the code or component
 622            level: Importance level of the observation
 623            status_message: Optional status message for the observation
 624            completion_start_time: When the model started generating (for generation types)
 625            model: Name/identifier of the AI model used (for generation types)
 626            model_parameters: Parameters used for the model (for generation types)
 627            usage_details: Token usage information (for generation types)
 628            cost_details: Cost information (for generation types)
 629            prompt: Associated prompt template (for generation types)
 630
 631        Returns:
 632            An observation object of the appropriate type that must be ended with .end()
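
        Example (a minimal sketch; "web-search", the "tool" type, and run_search are illustrative):
            ```python
            tool = langfuse.start_observation(name="web-search", as_type="tool")
            try:
                results = run_search(query)  # your application code
                tool.update(output=results)
            finally:
                tool.end()
            ```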
 633        """
 634        if trace_context:
 635            trace_id = trace_context.get("trace_id", None)
 636            parent_span_id = trace_context.get("parent_span_id", None)
 637
 638            if trace_id:
 639                remote_parent_span = self._create_remote_parent_span(
 640                    trace_id=trace_id, parent_span_id=parent_span_id
 641                )
 642
 643                with otel_trace_api.use_span(
 644                    cast(otel_trace_api.Span, remote_parent_span)
 645                ):
 646                    otel_span = self._otel_tracer.start_span(name=name)
 647                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
 648
 649                    return self._create_observation_from_otel_span(
 650                        otel_span=otel_span,
 651                        as_type=as_type,
 652                        input=input,
 653                        output=output,
 654                        metadata=metadata,
 655                        version=version,
 656                        level=level,
 657                        status_message=status_message,
 658                        completion_start_time=completion_start_time,
 659                        model=model,
 660                        model_parameters=model_parameters,
 661                        usage_details=usage_details,
 662                        cost_details=cost_details,
 663                        prompt=prompt,
 664                    )
 665
 666        otel_span = self._otel_tracer.start_span(name=name)
 667
 668        return self._create_observation_from_otel_span(
 669            otel_span=otel_span,
 670            as_type=as_type,
 671            input=input,
 672            output=output,
 673            metadata=metadata,
 674            version=version,
 675            level=level,
 676            status_message=status_message,
 677            completion_start_time=completion_start_time,
 678            model=model,
 679            model_parameters=model_parameters,
 680            usage_details=usage_details,
 681            cost_details=cost_details,
 682            prompt=prompt,
 683        )
 684
 685    def _create_observation_from_otel_span(
 686        self,
 687        *,
 688        otel_span: otel_trace_api.Span,
 689        as_type: ObservationTypeLiteralNoEvent,
 690        input: Optional[Any] = None,
 691        output: Optional[Any] = None,
 692        metadata: Optional[Any] = None,
 693        version: Optional[str] = None,
 694        level: Optional[SpanLevel] = None,
 695        status_message: Optional[str] = None,
 696        completion_start_time: Optional[datetime] = None,
 697        model: Optional[str] = None,
 698        model_parameters: Optional[Dict[str, MapValue]] = None,
 699        usage_details: Optional[Dict[str, int]] = None,
 700        cost_details: Optional[Dict[str, float]] = None,
 701        prompt: Optional[PromptClient] = None,
 702    ) -> Union[
 703        LangfuseSpan,
 704        LangfuseGeneration,
 705        LangfuseAgent,
 706        LangfuseTool,
 707        LangfuseChain,
 708        LangfuseRetriever,
 709        LangfuseEvaluator,
 710        LangfuseEmbedding,
 711        LangfuseGuardrail,
 712    ]:
 713        """Create the appropriate observation type from an OTEL span."""
 714        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 715            observation_class = self._get_span_class(as_type)
 716            # Type ignore avoids adding overloads to the internal _get_span_class helper;
 717            # it may return LangfuseEvent, and the observation classes take different constructor args.
 718            return observation_class(  # type: ignore[return-value,call-arg]
 719                otel_span=otel_span,
 720                langfuse_client=self,
 721                environment=self._environment,
 722                input=input,
 723                output=output,
 724                metadata=metadata,
 725                version=version,
 726                level=level,
 727                status_message=status_message,
 728                completion_start_time=completion_start_time,
 729                model=model,
 730                model_parameters=model_parameters,
 731                usage_details=usage_details,
 732                cost_details=cost_details,
 733                prompt=prompt,
 734            )
 735        else:
 736            # For other types (e.g. span, guardrail), create appropriate class without generation properties
 737            observation_class = self._get_span_class(as_type)
 738            # Type ignore avoids adding overloads to the internal _get_span_class helper;
 739            # it may return LangfuseEvent, and the observation classes take different constructor args.
 740            return observation_class(  # type: ignore[return-value,call-arg]
 741                otel_span=otel_span,
 742                langfuse_client=self,
 743                environment=self._environment,
 744                input=input,
 745                output=output,
 746                metadata=metadata,
 747                version=version,
 748                level=level,
 749                status_message=status_message,
 750            )
 751            # span._observation_type = as_type
 752            # span._otel_span.set_attribute("langfuse.observation.type", as_type)
 753            # return span
 754
 755    def start_generation(
 756        self,
 757        *,
 758        trace_context: Optional[TraceContext] = None,
 759        name: str,
 760        input: Optional[Any] = None,
 761        output: Optional[Any] = None,
 762        metadata: Optional[Any] = None,
 763        version: Optional[str] = None,
 764        level: Optional[SpanLevel] = None,
 765        status_message: Optional[str] = None,
 766        completion_start_time: Optional[datetime] = None,
 767        model: Optional[str] = None,
 768        model_parameters: Optional[Dict[str, MapValue]] = None,
 769        usage_details: Optional[Dict[str, int]] = None,
 770        cost_details: Optional[Dict[str, float]] = None,
 771        prompt: Optional[PromptClient] = None,
 772    ) -> LangfuseGeneration:
 773        """Create a new generation span for model generations.
 774
 775        DEPRECATED: This method is deprecated and will be removed in a future version.
 776        Use start_observation(as_type='generation') instead.
 777
 778        This method creates a specialized span for tracking model generations.
 779        It includes additional fields specific to model generations such as model name,
 780        token usage, and cost details.
 781
 782        The created generation span will be the child of the current span in the context.
 783
 784        Args:
 785            trace_context: Optional context for connecting to an existing trace
 786            name: Name of the generation operation
 787            input: Input data for the model (e.g., prompts)
 788            output: Output from the model (e.g., completions)
 789            metadata: Additional metadata to associate with the generation
 790            version: Version identifier for the model or component
 791            level: Importance level of the generation (info, warning, error)
 792            status_message: Optional status message for the generation
 793            completion_start_time: When the model started generating the response
 794            model: Name/identifier of the AI model used (e.g., "gpt-4")
 795            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 796            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 797            cost_details: Cost information for the model call
 798            prompt: Associated prompt template from Langfuse prompt management
 799
 800        Returns:
 801            A LangfuseGeneration object that must be ended with .end() when complete
 802
 803        Example:
 804            ```python
 805            generation = langfuse.start_generation(
 806                name="answer-generation",
 807                model="gpt-4",
 808                input={"prompt": "Explain quantum computing"},
 809                model_parameters={"temperature": 0.7}
 810            )
 811            try:
 812                # Call model API
 813                response = llm.generate(...)
 814
 815                generation.update(
 816                    output=response.text,
 817                    usage_details={
 818                        "prompt_tokens": response.usage.prompt_tokens,
 819                        "completion_tokens": response.usage.completion_tokens
 820                    }
 821                )
 822            finally:
 823                generation.end()
 824            ```
 825        """
 826        warnings.warn(
 827            "start_generation is deprecated and will be removed in a future version. "
 828            "Use start_observation(as_type='generation') instead.",
 829            DeprecationWarning,
 830            stacklevel=2,
 831        )
 832        return self.start_observation(
 833            trace_context=trace_context,
 834            name=name,
 835            as_type="generation",
 836            input=input,
 837            output=output,
 838            metadata=metadata,
 839            version=version,
 840            level=level,
 841            status_message=status_message,
 842            completion_start_time=completion_start_time,
 843            model=model,
 844            model_parameters=model_parameters,
 845            usage_details=usage_details,
 846            cost_details=cost_details,
 847            prompt=prompt,
 848        )
 849
 850    def start_as_current_generation(
 851        self,
 852        *,
 853        trace_context: Optional[TraceContext] = None,
 854        name: str,
 855        input: Optional[Any] = None,
 856        output: Optional[Any] = None,
 857        metadata: Optional[Any] = None,
 858        version: Optional[str] = None,
 859        level: Optional[SpanLevel] = None,
 860        status_message: Optional[str] = None,
 861        completion_start_time: Optional[datetime] = None,
 862        model: Optional[str] = None,
 863        model_parameters: Optional[Dict[str, MapValue]] = None,
 864        usage_details: Optional[Dict[str, int]] = None,
 865        cost_details: Optional[Dict[str, float]] = None,
 866        prompt: Optional[PromptClient] = None,
 867        end_on_exit: Optional[bool] = None,
 868    ) -> _AgnosticContextManager[LangfuseGeneration]:
 869        """Create a new generation span and set it as the current span in a context manager.
 870
 871        DEPRECATED: This method is deprecated and will be removed in a future version.
 872        Use start_as_current_observation(as_type='generation') instead.
 873
 874        This method creates a specialized span for model generations and sets it as the
 875        current span within a context manager. Use this method with a 'with' statement to
 876        automatically handle the generation span lifecycle within a code block.
 877
 878        The created generation span will be the child of the current span in the context.
 879
 880        Args:
 881            trace_context: Optional context for connecting to an existing trace
 882            name: Name of the generation operation
 883            input: Input data for the model (e.g., prompts)
 884            output: Output from the model (e.g., completions)
 885            metadata: Additional metadata to associate with the generation
 886            version: Version identifier for the model or component
 887            level: Importance level of the generation (info, warning, error)
 888            status_message: Optional status message for the generation
 889            completion_start_time: When the model started generating the response
 890            model: Name/identifier of the AI model used (e.g., "gpt-4")
 891            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 892            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 893            cost_details: Cost information for the model call
 894            prompt: Associated prompt template from Langfuse prompt management
 895            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 896
 897        Returns:
 898            A context manager that yields a LangfuseGeneration
 899
 900        Example:
 901            ```python
 902            with langfuse.start_as_current_generation(
 903                name="answer-generation",
 904                model="gpt-4",
 905                input={"prompt": "Explain quantum computing"}
 906            ) as generation:
 907                # Call model API
 908                response = llm.generate(...)
 909
 910                # Update with results
 911                generation.update(
 912                    output=response.text,
 913                    usage_details={
 914                        "prompt_tokens": response.usage.prompt_tokens,
 915                        "completion_tokens": response.usage.completion_tokens
 916                    }
 917                )
 918            ```
 919        """
 920        warnings.warn(
 921            "start_as_current_generation is deprecated and will be removed in a future version. "
 922            "Use start_as_current_observation(as_type='generation') instead.",
 923            DeprecationWarning,
 924            stacklevel=2,
 925        )
 926        return self.start_as_current_observation(
 927            trace_context=trace_context,
 928            name=name,
 929            as_type="generation",
 930            input=input,
 931            output=output,
 932            metadata=metadata,
 933            version=version,
 934            level=level,
 935            status_message=status_message,
 936            completion_start_time=completion_start_time,
 937            model=model,
 938            model_parameters=model_parameters,
 939            usage_details=usage_details,
 940            cost_details=cost_details,
 941            prompt=prompt,
 942            end_on_exit=end_on_exit,
 943        )
 944
 945    @overload
 946    def start_as_current_observation(
 947        self,
 948        *,
 949        trace_context: Optional[TraceContext] = None,
 950        name: str,
 951        as_type: Literal["generation"],
 952        input: Optional[Any] = None,
 953        output: Optional[Any] = None,
 954        metadata: Optional[Any] = None,
 955        version: Optional[str] = None,
 956        level: Optional[SpanLevel] = None,
 957        status_message: Optional[str] = None,
 958        completion_start_time: Optional[datetime] = None,
 959        model: Optional[str] = None,
 960        model_parameters: Optional[Dict[str, MapValue]] = None,
 961        usage_details: Optional[Dict[str, int]] = None,
 962        cost_details: Optional[Dict[str, float]] = None,
 963        prompt: Optional[PromptClient] = None,
 964        end_on_exit: Optional[bool] = None,
 965    ) -> _AgnosticContextManager[LangfuseGeneration]: ...
 966
 967    @overload
 968    def start_as_current_observation(
 969        self,
 970        *,
 971        trace_context: Optional[TraceContext] = None,
 972        name: str,
 973        as_type: Literal["span"] = "span",
 974        input: Optional[Any] = None,
 975        output: Optional[Any] = None,
 976        metadata: Optional[Any] = None,
 977        version: Optional[str] = None,
 978        level: Optional[SpanLevel] = None,
 979        status_message: Optional[str] = None,
 980        end_on_exit: Optional[bool] = None,
 981    ) -> _AgnosticContextManager[LangfuseSpan]: ...
 982
 983    @overload
 984    def start_as_current_observation(
 985        self,
 986        *,
 987        trace_context: Optional[TraceContext] = None,
 988        name: str,
 989        as_type: Literal["agent"],
 990        input: Optional[Any] = None,
 991        output: Optional[Any] = None,
 992        metadata: Optional[Any] = None,
 993        version: Optional[str] = None,
 994        level: Optional[SpanLevel] = None,
 995        status_message: Optional[str] = None,
 996        end_on_exit: Optional[bool] = None,
 997    ) -> _AgnosticContextManager[LangfuseAgent]: ...
 998
 999    @overload
1000    def start_as_current_observation(
1001        self,
1002        *,
1003        trace_context: Optional[TraceContext] = None,
1004        name: str,
1005        as_type: Literal["tool"],
1006        input: Optional[Any] = None,
1007        output: Optional[Any] = None,
1008        metadata: Optional[Any] = None,
1009        version: Optional[str] = None,
1010        level: Optional[SpanLevel] = None,
1011        status_message: Optional[str] = None,
1012        end_on_exit: Optional[bool] = None,
1013    ) -> _AgnosticContextManager[LangfuseTool]: ...
1014
1015    @overload
1016    def start_as_current_observation(
1017        self,
1018        *,
1019        trace_context: Optional[TraceContext] = None,
1020        name: str,
1021        as_type: Literal["chain"],
1022        input: Optional[Any] = None,
1023        output: Optional[Any] = None,
1024        metadata: Optional[Any] = None,
1025        version: Optional[str] = None,
1026        level: Optional[SpanLevel] = None,
1027        status_message: Optional[str] = None,
1028        end_on_exit: Optional[bool] = None,
1029    ) -> _AgnosticContextManager[LangfuseChain]: ...
1030
1031    @overload
1032    def start_as_current_observation(
1033        self,
1034        *,
1035        trace_context: Optional[TraceContext] = None,
1036        name: str,
1037        as_type: Literal["retriever"],
1038        input: Optional[Any] = None,
1039        output: Optional[Any] = None,
1040        metadata: Optional[Any] = None,
1041        version: Optional[str] = None,
1042        level: Optional[SpanLevel] = None,
1043        status_message: Optional[str] = None,
1044        end_on_exit: Optional[bool] = None,
1045    ) -> _AgnosticContextManager[LangfuseRetriever]: ...
1046
1047    @overload
1048    def start_as_current_observation(
1049        self,
1050        *,
1051        trace_context: Optional[TraceContext] = None,
1052        name: str,
1053        as_type: Literal["evaluator"],
1054        input: Optional[Any] = None,
1055        output: Optional[Any] = None,
1056        metadata: Optional[Any] = None,
1057        version: Optional[str] = None,
1058        level: Optional[SpanLevel] = None,
1059        status_message: Optional[str] = None,
1060        end_on_exit: Optional[bool] = None,
1061    ) -> _AgnosticContextManager[LangfuseEvaluator]: ...
1062
1063    @overload
1064    def start_as_current_observation(
1065        self,
1066        *,
1067        trace_context: Optional[TraceContext] = None,
1068        name: str,
1069        as_type: Literal["embedding"],
1070        input: Optional[Any] = None,
1071        output: Optional[Any] = None,
1072        metadata: Optional[Any] = None,
1073        version: Optional[str] = None,
1074        level: Optional[SpanLevel] = None,
1075        status_message: Optional[str] = None,
1076        completion_start_time: Optional[datetime] = None,
1077        model: Optional[str] = None,
1078        model_parameters: Optional[Dict[str, MapValue]] = None,
1079        usage_details: Optional[Dict[str, int]] = None,
1080        cost_details: Optional[Dict[str, float]] = None,
1081        prompt: Optional[PromptClient] = None,
1082        end_on_exit: Optional[bool] = None,
1083    ) -> _AgnosticContextManager[LangfuseEmbedding]: ...
1084
1085    @overload
1086    def start_as_current_observation(
1087        self,
1088        *,
1089        trace_context: Optional[TraceContext] = None,
1090        name: str,
1091        as_type: Literal["guardrail"],
1092        input: Optional[Any] = None,
1093        output: Optional[Any] = None,
1094        metadata: Optional[Any] = None,
1095        version: Optional[str] = None,
1096        level: Optional[SpanLevel] = None,
1097        status_message: Optional[str] = None,
1098        end_on_exit: Optional[bool] = None,
1099    ) -> _AgnosticContextManager[LangfuseGuardrail]: ...
1100
1101    def start_as_current_observation(
1102        self,
1103        *,
1104        trace_context: Optional[TraceContext] = None,
1105        name: str,
1106        as_type: ObservationTypeLiteralNoEvent = "span",
1107        input: Optional[Any] = None,
1108        output: Optional[Any] = None,
1109        metadata: Optional[Any] = None,
1110        version: Optional[str] = None,
1111        level: Optional[SpanLevel] = None,
1112        status_message: Optional[str] = None,
1113        completion_start_time: Optional[datetime] = None,
1114        model: Optional[str] = None,
1115        model_parameters: Optional[Dict[str, MapValue]] = None,
1116        usage_details: Optional[Dict[str, int]] = None,
1117        cost_details: Optional[Dict[str, float]] = None,
1118        prompt: Optional[PromptClient] = None,
1119        end_on_exit: Optional[bool] = None,
1120    ) -> Union[
1121        _AgnosticContextManager[LangfuseGeneration],
1122        _AgnosticContextManager[LangfuseSpan],
1123        _AgnosticContextManager[LangfuseAgent],
1124        _AgnosticContextManager[LangfuseTool],
1125        _AgnosticContextManager[LangfuseChain],
1126        _AgnosticContextManager[LangfuseRetriever],
1127        _AgnosticContextManager[LangfuseEvaluator],
1128        _AgnosticContextManager[LangfuseEmbedding],
1129        _AgnosticContextManager[LangfuseGuardrail],
1130    ]:
1131        """Create a new observation and set it as the current span in a context manager.
1132
1133        This method creates a new observation of the specified type and sets it as the
1134        current span within a context manager. Use this method with a 'with' statement to
1135        automatically handle the observation lifecycle within a code block.
1136
1137        The created observation will be the child of the current span in the context.
1138
1139        Args:
1140            trace_context: Optional context for connecting to an existing trace
1141            name: Name of the observation (e.g., function or operation name)
1142            as_type: Type of observation to create (defaults to "span")
1143            input: Input data for the operation (can be any JSON-serializable object)
1144            output: Output data from the operation (can be any JSON-serializable object)
1145            metadata: Additional metadata to associate with the observation
1146            version: Version identifier for the code or component
1147            level: Importance level of the observation (info, warning, error)
1148            status_message: Optional status message for the observation
1149            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
1150
1151            The following parameters are only available when as_type is "generation" or "embedding":
1152            completion_start_time: When the model started generating the response
1153            model: Name/identifier of the AI model used (e.g., "gpt-4")
1154            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1155            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1156            cost_details: Cost information for the model call
1157            prompt: Associated prompt template from Langfuse prompt management
1158
1159        Returns:
1160            A context manager that yields the appropriate observation type based on as_type
1161
1162        Example:
1163            ```python
1164            # Create a span
1165            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
1166                # Do work
1167                result = process_data()
1168                span.update(output=result)
1169
1170                # Create a child span automatically
1171                with span.start_as_current_span(name="sub-operation") as child_span:
1172                    # Do sub-operation work
1173                    child_span.update(output="sub-result")
1174
1175            # Create a tool observation
1176            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
1177                # Do tool work
1178                results = search_web(query)
1179                tool.update(output=results)
1180
1181            # Create a generation observation
1182            with langfuse.start_as_current_observation(
1183                name="answer-generation",
1184                as_type="generation",
1185                model="gpt-4"
1186            ) as generation:
1187                # Generate answer
1188                response = llm.generate(...)
1189                generation.update(output=response)
1190            ```
1191        """
1192        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
1193            if trace_context:
1194                trace_id = trace_context.get("trace_id", None)
1195                parent_span_id = trace_context.get("parent_span_id", None)
1196
1197                if trace_id:
1198                    remote_parent_span = self._create_remote_parent_span(
1199                        trace_id=trace_id, parent_span_id=parent_span_id
1200                    )
1201
1202                    return cast(
1203                        Union[
1204                            _AgnosticContextManager[LangfuseGeneration],
1205                            _AgnosticContextManager[LangfuseEmbedding],
1206                        ],
1207                        self._create_span_with_parent_context(
1208                            as_type=as_type,
1209                            name=name,
1210                            remote_parent_span=remote_parent_span,
1211                            parent=None,
1212                            end_on_exit=end_on_exit,
1213                            input=input,
1214                            output=output,
1215                            metadata=metadata,
1216                            version=version,
1217                            level=level,
1218                            status_message=status_message,
1219                            completion_start_time=completion_start_time,
1220                            model=model,
1221                            model_parameters=model_parameters,
1222                            usage_details=usage_details,
1223                            cost_details=cost_details,
1224                            prompt=prompt,
1225                        ),
1226                    )
1227
1228            return cast(
1229                Union[
1230                    _AgnosticContextManager[LangfuseGeneration],
1231                    _AgnosticContextManager[LangfuseEmbedding],
1232                ],
1233                self._start_as_current_otel_span_with_processed_media(
1234                    as_type=as_type,
1235                    name=name,
1236                    end_on_exit=end_on_exit,
1237                    input=input,
1238                    output=output,
1239                    metadata=metadata,
1240                    version=version,
1241                    level=level,
1242                    status_message=status_message,
1243                    completion_start_time=completion_start_time,
1244                    model=model,
1245                    model_parameters=model_parameters,
1246                    usage_details=usage_details,
1247                    cost_details=cost_details,
1248                    prompt=prompt,
1249                ),
1250            )
1251
1252        if as_type in get_observation_types_list(ObservationTypeSpanLike):
1253            if trace_context:
1254                trace_id = trace_context.get("trace_id", None)
1255                parent_span_id = trace_context.get("parent_span_id", None)
1256
1257                if trace_id:
1258                    remote_parent_span = self._create_remote_parent_span(
1259                        trace_id=trace_id, parent_span_id=parent_span_id
1260                    )
1261
1262                    return cast(
1263                        Union[
1264                            _AgnosticContextManager[LangfuseSpan],
1265                            _AgnosticContextManager[LangfuseAgent],
1266                            _AgnosticContextManager[LangfuseTool],
1267                            _AgnosticContextManager[LangfuseChain],
1268                            _AgnosticContextManager[LangfuseRetriever],
1269                            _AgnosticContextManager[LangfuseEvaluator],
1270                            _AgnosticContextManager[LangfuseGuardrail],
1271                        ],
1272                        self._create_span_with_parent_context(
1273                            as_type=as_type,
1274                            name=name,
1275                            remote_parent_span=remote_parent_span,
1276                            parent=None,
1277                            end_on_exit=end_on_exit,
1278                            input=input,
1279                            output=output,
1280                            metadata=metadata,
1281                            version=version,
1282                            level=level,
1283                            status_message=status_message,
1284                        ),
1285                    )
1286
1287            return cast(
1288                Union[
1289                    _AgnosticContextManager[LangfuseSpan],
1290                    _AgnosticContextManager[LangfuseAgent],
1291                    _AgnosticContextManager[LangfuseTool],
1292                    _AgnosticContextManager[LangfuseChain],
1293                    _AgnosticContextManager[LangfuseRetriever],
1294                    _AgnosticContextManager[LangfuseEvaluator],
1295                    _AgnosticContextManager[LangfuseGuardrail],
1296                ],
1297                self._start_as_current_otel_span_with_processed_media(
1298                    as_type=as_type,
1299                    name=name,
1300                    end_on_exit=end_on_exit,
1301                    input=input,
1302                    output=output,
1303                    metadata=metadata,
1304                    version=version,
1305                    level=level,
1306                    status_message=status_message,
1307                ),
1308            )
1309
1310        # This should never be reached since all valid types are handled above
1311        langfuse_logger.warning(
1312            f"Unknown observation type: {as_type}, falling back to span"
1313        )
1314        return self._start_as_current_otel_span_with_processed_media(
1315            as_type="span",
1316            name=name,
1317            end_on_exit=end_on_exit,
1318            input=input,
1319            output=output,
1320            metadata=metadata,
1321            version=version,
1322            level=level,
1323            status_message=status_message,
1324        )
1325
1326    def _get_span_class(
1327        self,
1328        as_type: ObservationTypeLiteral,
1329    ) -> Union[
1330        Type[LangfuseAgent],
1331        Type[LangfuseTool],
1332        Type[LangfuseChain],
1333        Type[LangfuseRetriever],
1334        Type[LangfuseEvaluator],
1335        Type[LangfuseEmbedding],
1336        Type[LangfuseGuardrail],
1337        Type[LangfuseGeneration],
1338        Type[LangfuseEvent],
1339        Type[LangfuseSpan],
1340    ]:
1341        """Get the appropriate span class based on as_type."""
1342        normalized_type = as_type.lower()
1343
1344        if normalized_type == "agent":
1345            return LangfuseAgent
1346        elif normalized_type == "tool":
1347            return LangfuseTool
1348        elif normalized_type == "chain":
1349            return LangfuseChain
1350        elif normalized_type == "retriever":
1351            return LangfuseRetriever
1352        elif normalized_type == "evaluator":
1353            return LangfuseEvaluator
1354        elif normalized_type == "embedding":
1355            return LangfuseEmbedding
1356        elif normalized_type == "guardrail":
1357            return LangfuseGuardrail
1358        elif normalized_type == "generation":
1359            return LangfuseGeneration
1360        elif normalized_type == "event":
1361            return LangfuseEvent
1362        elif normalized_type == "span":
1363            return LangfuseSpan
1364        else:
1365            return LangfuseSpan
1366
1367    @_agnosticcontextmanager
1368    def _create_span_with_parent_context(
1369        self,
1370        *,
1371        name: str,
1372        parent: Optional[otel_trace_api.Span] = None,
1373        remote_parent_span: Optional[otel_trace_api.Span] = None,
1374        as_type: ObservationTypeLiteralNoEvent,
1375        end_on_exit: Optional[bool] = None,
1376        input: Optional[Any] = None,
1377        output: Optional[Any] = None,
1378        metadata: Optional[Any] = None,
1379        version: Optional[str] = None,
1380        level: Optional[SpanLevel] = None,
1381        status_message: Optional[str] = None,
1382        completion_start_time: Optional[datetime] = None,
1383        model: Optional[str] = None,
1384        model_parameters: Optional[Dict[str, MapValue]] = None,
1385        usage_details: Optional[Dict[str, int]] = None,
1386        cost_details: Optional[Dict[str, float]] = None,
1387        prompt: Optional[PromptClient] = None,
1388    ) -> Any:
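        # Re-enter the provided parent span context (local or remote) so the new
        # observation is created as its child; if the parent is remote, the child is
        # additionally marked as a root of its trace via the AS_ROOT attribute below.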
1389        parent_span = parent or cast(otel_trace_api.Span, remote_parent_span)
1390
1391        with otel_trace_api.use_span(parent_span):
1392            with self._start_as_current_otel_span_with_processed_media(
1393                name=name,
1394                as_type=as_type,
1395                end_on_exit=end_on_exit,
1396                input=input,
1397                output=output,
1398                metadata=metadata,
1399                version=version,
1400                level=level,
1401                status_message=status_message,
1402                completion_start_time=completion_start_time,
1403                model=model,
1404                model_parameters=model_parameters,
1405                usage_details=usage_details,
1406                cost_details=cost_details,
1407                prompt=prompt,
1408            ) as langfuse_span:
1409                if remote_parent_span is not None:
1410                    langfuse_span._otel_span.set_attribute(
1411                        LangfuseOtelSpanAttributes.AS_ROOT, True
1412                    )
1413
1414                yield langfuse_span
1415
1416    @_agnosticcontextmanager
1417    def _start_as_current_otel_span_with_processed_media(
1418        self,
1419        *,
1420        name: str,
1421        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
1422        end_on_exit: Optional[bool] = None,
1423        input: Optional[Any] = None,
1424        output: Optional[Any] = None,
1425        metadata: Optional[Any] = None,
1426        version: Optional[str] = None,
1427        level: Optional[SpanLevel] = None,
1428        status_message: Optional[str] = None,
1429        completion_start_time: Optional[datetime] = None,
1430        model: Optional[str] = None,
1431        model_parameters: Optional[Dict[str, MapValue]] = None,
1432        usage_details: Optional[Dict[str, int]] = None,
1433        cost_details: Optional[Dict[str, float]] = None,
1434        prompt: Optional[PromptClient] = None,
1435    ) -> Any:
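        # Start an OTEL span as the current span and wrap it in the Langfuse observation
        # class matching `as_type`; generation-like types (generation, embedding) also
        # receive model, usage and cost attributes below.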
1436        with self._otel_tracer.start_as_current_span(
1437            name=name,
1438            end_on_exit=end_on_exit if end_on_exit is not None else True,
1439        ) as otel_span:
1440            span_class = self._get_span_class(
1441                as_type or "generation"
1442            )  # fall back to "generation" when as_type is not provided
1443            common_args = {
1444                "otel_span": otel_span,
1445                "langfuse_client": self,
1446                "environment": self._environment,
1447                "input": input,
1448                "output": output,
1449                "metadata": metadata,
1450                "version": version,
1451                "level": level,
1452                "status_message": status_message,
1453            }
1454
1455            if span_class in [
1456                LangfuseGeneration,
1457                LangfuseEmbedding,
1458            ]:
1459                common_args.update(
1460                    {
1461                        "completion_start_time": completion_start_time,
1462                        "model": model,
1463                        "model_parameters": model_parameters,
1464                        "usage_details": usage_details,
1465                        "cost_details": cost_details,
1466                        "prompt": prompt,
1467                    }
1468                )
1469            # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed
1470
1471            yield span_class(**common_args)  # type: ignore[arg-type]
1472
1473    def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]:
1474        current_span = otel_trace_api.get_current_span()
1475
1476        if current_span is otel_trace_api.INVALID_SPAN:
1477            langfuse_logger.warning(
1478                "Context error: No active span in current context. Operations that depend on an active span will be skipped. "
1479                "Ensure spans are created with start_as_current_span() or that you're operating within an active span context."
1480            )
1481            return None
1482
1483        return current_span
1484
1485    def update_current_generation(
1486        self,
1487        *,
1488        name: Optional[str] = None,
1489        input: Optional[Any] = None,
1490        output: Optional[Any] = None,
1491        metadata: Optional[Any] = None,
1492        version: Optional[str] = None,
1493        level: Optional[SpanLevel] = None,
1494        status_message: Optional[str] = None,
1495        completion_start_time: Optional[datetime] = None,
1496        model: Optional[str] = None,
1497        model_parameters: Optional[Dict[str, MapValue]] = None,
1498        usage_details: Optional[Dict[str, int]] = None,
1499        cost_details: Optional[Dict[str, float]] = None,
1500        prompt: Optional[PromptClient] = None,
1501    ) -> None:
1502        """Update the current active generation span with new information.
1503
1504        This method updates the current generation span in the active context with
1505        additional information. It's useful for adding output, usage stats, or other
1506        details that become available during or after model generation.
1507
1508        Args:
1509            name: The generation name
1510            input: Updated input data for the model
1511            output: Output from the model (e.g., completions)
1512            metadata: Additional metadata to associate with the generation
1513            version: Version identifier for the model or component
1514            level: Importance level of the generation (info, warning, error)
1515            status_message: Optional status message for the generation
1516            completion_start_time: When the model started generating the response
1517            model: Name/identifier of the AI model used (e.g., "gpt-4")
1518            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1519            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1520            cost_details: Cost information for the model call
1521            prompt: Associated prompt template from Langfuse prompt management
1522
1523        Example:
1524            ```python
1525            with langfuse.start_as_current_generation(name="answer-query") as generation:
1526                # Initial setup and API call
1527                response = llm.generate(...)
1528
1529                # Update with results that weren't available at creation time
1530                langfuse.update_current_generation(
1531                    output=response.text,
1532                    usage_details={
1533                        "prompt_tokens": response.usage.prompt_tokens,
1534                        "completion_tokens": response.usage.completion_tokens
1535                    }
1536                )
1537            ```
1538        """
1539        if not self._tracing_enabled:
1540            langfuse_logger.debug(
1541                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1542            )
1543            return
1544
1545        current_otel_span = self._get_current_otel_span()
1546
1547        if current_otel_span is not None:
1548            generation = LangfuseGeneration(
1549                otel_span=current_otel_span, langfuse_client=self
1550            )
1551
1552            if name:
1553                current_otel_span.update_name(name)
1554
1555            generation.update(
1556                input=input,
1557                output=output,
1558                metadata=metadata,
1559                version=version,
1560                level=level,
1561                status_message=status_message,
1562                completion_start_time=completion_start_time,
1563                model=model,
1564                model_parameters=model_parameters,
1565                usage_details=usage_details,
1566                cost_details=cost_details,
1567                prompt=prompt,
1568            )
1569
1570    def update_current_span(
1571        self,
1572        *,
1573        name: Optional[str] = None,
1574        input: Optional[Any] = None,
1575        output: Optional[Any] = None,
1576        metadata: Optional[Any] = None,
1577        version: Optional[str] = None,
1578        level: Optional[SpanLevel] = None,
1579        status_message: Optional[str] = None,
1580    ) -> None:
1581        """Update the current active span with new information.
1582
1583        This method updates the current span in the active context with
1584        additional information. It's useful for adding outputs or metadata
1585        that become available during execution.
1586
1587        Args:
1588            name: The span name
1589            input: Updated input data for the operation
1590            output: Output data from the operation
1591            metadata: Additional metadata to associate with the span
1592            version: Version identifier for the code or component
1593            level: Importance level of the span (info, warning, error)
1594            status_message: Optional status message for the span
1595
1596        Example:
1597            ```python
1598            with langfuse.start_as_current_span(name="process-data") as span:
1599                # Initial processing
1600                result = process_first_part()
1601
1602                # Update with intermediate results
1603                langfuse.update_current_span(metadata={"intermediate_result": result})
1604
1605                # Continue processing
1606                final_result = process_second_part(result)
1607
1608                # Final update
1609                langfuse.update_current_span(output=final_result)
1610            ```
1611        """
1612        if not self._tracing_enabled:
1613            langfuse_logger.debug(
1614                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1615            )
1616            return
1617
1618        current_otel_span = self._get_current_otel_span()
1619
1620        if current_otel_span is not None:
1621            span = LangfuseSpan(
1622                otel_span=current_otel_span,
1623                langfuse_client=self,
1624                environment=self._environment,
1625            )
1626
1627            if name:
1628                current_otel_span.update_name(name)
1629
1630            span.update(
1631                input=input,
1632                output=output,
1633                metadata=metadata,
1634                version=version,
1635                level=level,
1636                status_message=status_message,
1637            )
1638
1639    def update_current_trace(
1640        self,
1641        *,
1642        name: Optional[str] = None,
1643        user_id: Optional[str] = None,
1644        session_id: Optional[str] = None,
1645        version: Optional[str] = None,
1646        input: Optional[Any] = None,
1647        output: Optional[Any] = None,
1648        metadata: Optional[Any] = None,
1649        tags: Optional[List[str]] = None,
1650        public: Optional[bool] = None,
1651    ) -> None:
1652        """Update the current trace with additional information.
1653
1654        Args:
1655            name: Updated name for the Langfuse trace
1656            user_id: ID of the user who initiated the Langfuse trace
1657            session_id: Session identifier for grouping related Langfuse traces
1658            version: Version identifier for the application or service
1659            input: Input data for the overall Langfuse trace
1660            output: Output data from the overall Langfuse trace
1661            metadata: Additional metadata to associate with the Langfuse trace
1662            tags: List of tags to categorize the Langfuse trace
1663            public: Whether the Langfuse trace should be publicly accessible
1664
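        Example:
            A typical pattern (values are illustrative) is to update the trace from
            within an active span context:

            ```python
            with langfuse.start_as_current_span(name="handle-request"):
                langfuse.update_current_trace(
                    user_id="user_123",
                    session_id="session_abc",
                    tags=["production"]
                )
            ```
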
1665        See Also:
1666            :func:`langfuse.propagate_attributes`: Recommended replacement
1667        """
1668        if not self._tracing_enabled:
1669            langfuse_logger.debug(
1670                "Operation skipped: update_current_trace - Tracing is disabled or client is in no-op mode."
1671            )
1672            return
1673
1674        current_otel_span = self._get_current_otel_span()
1675
1676        if current_otel_span is not None and current_otel_span.is_recording():
1677            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1678                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1679            )
1680            # We need to preserve the class to keep the correct observation type
1681            span_class = self._get_span_class(existing_observation_type)
1682            span = span_class(
1683                otel_span=current_otel_span,
1684                langfuse_client=self,
1685                environment=self._environment,
1686            )
1687
1688            span.update_trace(
1689                name=name,
1690                user_id=user_id,
1691                session_id=session_id,
1692                version=version,
1693                input=input,
1694                output=output,
1695                metadata=metadata,
1696                tags=tags,
1697                public=public,
1698            )
1699
1700    def create_event(
1701        self,
1702        *,
1703        trace_context: Optional[TraceContext] = None,
1704        name: str,
1705        input: Optional[Any] = None,
1706        output: Optional[Any] = None,
1707        metadata: Optional[Any] = None,
1708        version: Optional[str] = None,
1709        level: Optional[SpanLevel] = None,
1710        status_message: Optional[str] = None,
1711    ) -> LangfuseEvent:
1712        """Create a new Langfuse observation of type 'EVENT'.
1713
1714        The created Langfuse Event observation will be the child of the current span in the context, or of the remote parent span if a trace_context is provided.
1715
1716        Args:
1717            trace_context: Optional context for connecting to an existing trace
1718            name: Name of the event (e.g., function or operation name)
1719            input: Input data for the operation (can be any JSON-serializable object)
1720            output: Output data from the operation (can be any JSON-serializable object)
1721            metadata: Additional metadata to associate with the event
1722            version: Version identifier for the code or component
1723            level: Importance level of the event (info, warning, error)
1724            status_message: Optional status message for the event
1725
1726        Returns:
1727            The Langfuse Event object
1728
1729        Example:
1730            ```python
1731            event = langfuse.create_event(name="process-event")
1732            ```
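
            Events can also carry payloads and metadata (values below are illustrative):
            ```python
            langfuse.create_event(
                name="cache-lookup",
                input={"key": "user:123"},
                output={"hit": True},
                metadata={"store": "redis"}
            )
            ```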
1733        """
1734        timestamp = time_ns()
1735
1736        if trace_context:
1737            trace_id = trace_context.get("trace_id", None)
1738            parent_span_id = trace_context.get("parent_span_id", None)
1739
1740            if trace_id:
1741                remote_parent_span = self._create_remote_parent_span(
1742                    trace_id=trace_id, parent_span_id=parent_span_id
1743                )
1744
1745                with otel_trace_api.use_span(
1746                    cast(otel_trace_api.Span, remote_parent_span)
1747                ):
1748                    otel_span = self._otel_tracer.start_span(
1749                        name=name, start_time=timestamp
1750                    )
1751                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1752
1753                    return cast(
1754                        LangfuseEvent,
1755                        LangfuseEvent(
1756                            otel_span=otel_span,
1757                            langfuse_client=self,
1758                            environment=self._environment,
1759                            input=input,
1760                            output=output,
1761                            metadata=metadata,
1762                            version=version,
1763                            level=level,
1764                            status_message=status_message,
1765                        ).end(end_time=timestamp),
1766                    )
1767
1768        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1769
1770        return cast(
1771            LangfuseEvent,
1772            LangfuseEvent(
1773                otel_span=otel_span,
1774                langfuse_client=self,
1775                environment=self._environment,
1776                input=input,
1777                output=output,
1778                metadata=metadata,
1779                version=version,
1780                level=level,
1781                status_message=status_message,
1782            ).end(end_time=timestamp),
1783        )
1784
1785    def _create_remote_parent_span(
1786        self, *, trace_id: str, parent_span_id: Optional[str]
1787    ) -> Any:
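        # Build a non-recording span carrying the given trace/span IDs so that new
        # observations started under it are attached to the existing trace.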
1788        if not self._is_valid_trace_id(trace_id):
1789            langfuse_logger.warning(
1790                f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID."
1791            )
1792
1793        if parent_span_id and not self._is_valid_span_id(parent_span_id):
1794            langfuse_logger.warning(
1795                f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID."
1796            )
1797
1798        int_trace_id = int(trace_id, 16)
1799        int_parent_span_id = (
1800            int(parent_span_id, 16)
1801            if parent_span_id
1802            else RandomIdGenerator().generate_span_id()
1803        )
1804
1805        span_context = otel_trace_api.SpanContext(
1806            trace_id=int_trace_id,
1807            span_id=int_parent_span_id,
1808            trace_flags=otel_trace_api.TraceFlags(0x01),  # mark span as sampled
1809            is_remote=False,
1810        )
1811
1812        return otel_trace_api.NonRecordingSpan(span_context)
1813
1814    def _is_valid_trace_id(self, trace_id: str) -> bool:
1815        pattern = r"^[0-9a-f]{32}$"
1816
1817        return bool(re.match(pattern, trace_id))
1818
1819    def _is_valid_span_id(self, span_id: str) -> bool:
1820        pattern = r"^[0-9a-f]{16}$"
1821
1822        return bool(re.match(pattern, span_id))
1823
1824    def _create_observation_id(self, *, seed: Optional[str] = None) -> str:
1825        """Create a unique observation ID for use with Langfuse.
1826
1827        This method generates a unique observation ID (span ID in OpenTelemetry terms)
1828        for use with various Langfuse APIs. It can either generate a random ID or
1829        create a deterministic ID based on a seed string.
1830
1831        Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes.
1832        This method ensures the generated ID meets this requirement. If you need to
1833        correlate an external ID with a Langfuse observation ID, use the external ID as
1834        the seed to get a valid, deterministic observation ID.
1835
1836        Args:
1837            seed: Optional string to use as a seed for deterministic ID generation.
1838                 If provided, the same seed will always produce the same ID.
1839                 If not provided, a random ID will be generated.
1840
1841        Returns:
1842            A 16-character lowercase hexadecimal string representing the observation ID.
1843
1844        Example:
1845            ```python
1846            # Generate a random observation ID
1847            obs_id = langfuse.create_observation_id()
1848
1849            # Generate a deterministic ID based on a seed
1850            user_obs_id = langfuse.create_observation_id(seed="user-123-feedback")
1851
1852            # Correlate an external item ID with a Langfuse observation ID
1853            item_id = "item-789012"
1854            correlated_obs_id = langfuse.create_observation_id(seed=item_id)
1855
1856            # Use the ID with Langfuse APIs
1857            langfuse.create_score(
1858                name="relevance",
1859                value=0.95,
1860                trace_id=trace_id,
1861                observation_id=obs_id
1862            )
1863            ```
1864        """
1865        if not seed:
1866            span_id_int = RandomIdGenerator().generate_span_id()
1867
1868            return self._format_otel_span_id(span_id_int)
1869
1870        return sha256(seed.encode("utf-8")).digest()[:8].hex()
1871
1872    @staticmethod
1873    def create_trace_id(*, seed: Optional[str] = None) -> str:
1874        """Create a unique trace ID for use with Langfuse.
1875
1876        This method generates a unique trace ID for use with various Langfuse APIs.
1877        It can either generate a random ID or create a deterministic ID based on
1878        a seed string.
1879
1880        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1881        This method ensures the generated ID meets this requirement. If you need to
1882        correlate an external ID with a Langfuse trace ID, use the external ID as the
1883        seed to get a valid, deterministic Langfuse trace ID.
1884
1885        Args:
1886            seed: Optional string to use as a seed for deterministic ID generation.
1887                 If provided, the same seed will always produce the same ID.
1888                 If not provided, a random ID will be generated.
1889
1890        Returns:
1891            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1892
1893        Example:
1894            ```python
1895            # Generate a random trace ID
1896            trace_id = langfuse.create_trace_id()
1897
1898            # Generate a deterministic ID based on a seed
1899            session_trace_id = langfuse.create_trace_id(seed="session-456")
1900
1901            # Correlate an external ID with a Langfuse trace ID
1902            external_id = "external-system-123456"
1903            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1904
1905            # Use the ID with trace context
1906            with langfuse.start_as_current_span(
1907                name="process-request",
1908                trace_context={"trace_id": trace_id}
1909            ) as span:
1910                # Operation will be part of the specific trace
1911                pass
1912            ```
1913        """
1914        if not seed:
1915            trace_id_int = RandomIdGenerator().generate_trace_id()
1916
1917            return Langfuse._format_otel_trace_id(trace_id_int)
1918
1919        return sha256(seed.encode("utf-8")).digest()[:16].hex()
1920
1921    def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str:
1922        span_context = otel_span.get_span_context()
1923
1924        return self._format_otel_trace_id(span_context.trace_id)
1925
1926    def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str:
1927        span_context = otel_span.get_span_context()
1928
1929        return self._format_otel_span_id(span_context.span_id)
1930
1931    @staticmethod
1932    def _format_otel_span_id(span_id_int: int) -> str:
1933        """Format an integer span ID to a 16-character lowercase hex string.
1934
1935        Internal method to convert an OpenTelemetry integer span ID to the standard
1936        W3C Trace Context format (16-character lowercase hex string).
1937
1938        Args:
1939            span_id_int: 64-bit integer representing a span ID
1940
1941        Returns:
1942            A 16-character lowercase hexadecimal string
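
        Example:
            ```python
            Langfuse._format_otel_span_id(4660)  # returns "0000000000001234"
            ```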
1943        """
1944        return format(span_id_int, "016x")
1945
1946    @staticmethod
1947    def _format_otel_trace_id(trace_id_int: int) -> str:
1948        """Format an integer trace ID to a 32-character lowercase hex string.
1949
1950        Internal method to convert an OpenTelemetry integer trace ID to the standard
1951        W3C Trace Context format (32-character lowercase hex string).
1952
1953        Args:
1954            trace_id_int: 128-bit integer representing a trace ID
1955
1956        Returns:
1957            A 32-character lowercase hexadecimal string
1958        """
1959        return format(trace_id_int, "032x")
1960
1961    @overload
1962    def create_score(
1963        self,
1964        *,
1965        name: str,
1966        value: float,
1967        session_id: Optional[str] = None,
1968        dataset_run_id: Optional[str] = None,
1969        trace_id: Optional[str] = None,
1970        observation_id: Optional[str] = None,
1971        score_id: Optional[str] = None,
1972        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1973        comment: Optional[str] = None,
1974        config_id: Optional[str] = None,
1975        metadata: Optional[Any] = None,
1976        timestamp: Optional[datetime] = None,
1977    ) -> None: ...
1978
1979    @overload
1980    def create_score(
1981        self,
1982        *,
1983        name: str,
1984        value: str,
1985        session_id: Optional[str] = None,
1986        dataset_run_id: Optional[str] = None,
1987        trace_id: Optional[str] = None,
1988        score_id: Optional[str] = None,
1989        observation_id: Optional[str] = None,
1990        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
1991        comment: Optional[str] = None,
1992        config_id: Optional[str] = None,
1993        metadata: Optional[Any] = None,
1994        timestamp: Optional[datetime] = None,
1995    ) -> None: ...
1996
1997    def create_score(
1998        self,
1999        *,
2000        name: str,
2001        value: Union[float, str],
2002        session_id: Optional[str] = None,
2003        dataset_run_id: Optional[str] = None,
2004        trace_id: Optional[str] = None,
2005        observation_id: Optional[str] = None,
2006        score_id: Optional[str] = None,
2007        data_type: Optional[ScoreDataType] = None,
2008        comment: Optional[str] = None,
2009        config_id: Optional[str] = None,
2010        metadata: Optional[Any] = None,
2011        timestamp: Optional[datetime] = None,
2012    ) -> None:
2013        """Create a score for a specific trace or observation.
2014
2015        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
2016        used to track quality metrics, user feedback, or automated evaluations.
2017
2018        Args:
2019            name: Name of the score (e.g., "relevance", "accuracy")
2020            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2021            session_id: ID of the Langfuse session to associate the score with
2022            dataset_run_id: ID of the Langfuse dataset run to associate the score with
2023            trace_id: ID of the Langfuse trace to associate the score with
2024            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
2025            score_id: Optional custom ID for the score (auto-generated if not provided)
2026            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2027            comment: Optional comment or explanation for the score
2028            config_id: Optional ID of a score config defined in Langfuse
2029            metadata: Optional metadata to be attached to the score
2030            timestamp: Optional timestamp for the score (defaults to current UTC time)
2031
2032        Example:
2033            ```python
2034            # Create a numeric score for accuracy
2035            langfuse.create_score(
2036                name="accuracy",
2037                value=0.92,
2038                trace_id="abcdef1234567890abcdef1234567890",
2039                data_type="NUMERIC",
2040                comment="High accuracy with minor irrelevant details"
2041            )
2042
2043            # Create a categorical score for sentiment
2044            langfuse.create_score(
2045                name="sentiment",
2046                value="positive",
2047                trace_id="abcdef1234567890abcdef1234567890",
2048                observation_id="abcdef1234567890",
2049                data_type="CATEGORICAL"
2050            )
2051            ```
2052        """
2053        if not self._tracing_enabled:
2054            return
2055
2056        score_id = score_id or self._create_observation_id()
2057
2058        try:
2059            new_body = ScoreBody(
2060                id=score_id,
2061                sessionId=session_id,
2062                datasetRunId=dataset_run_id,
2063                traceId=trace_id,
2064                observationId=observation_id,
2065                name=name,
2066                value=value,
2067                dataType=data_type,  # type: ignore
2068                comment=comment,
2069                configId=config_id,
2070                environment=self._environment,
2071                metadata=metadata,
2072            )
2073
2074            event = {
2075                "id": self.create_trace_id(),
2076                "type": "score-create",
2077                "timestamp": timestamp or _get_timestamp(),
2078                "body": new_body,
2079            }
2080
2081            if self._resources is not None:
2082                # Force the score to be sampled if it was created for a legacy trace ID, i.e. not a 32-hexchar ID
2083                force_sample = (
2084                    not self._is_valid_trace_id(trace_id) if trace_id else True
2085                )
2086
2087                self._resources.add_score_task(
2088                    event,
2089                    force_sample=force_sample,
2090                )
2091
2092        except Exception as e:
2093            langfuse_logger.exception(
2094                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
2095            )
2096
2097    @overload
2098    def score_current_span(
2099        self,
2100        *,
2101        name: str,
2102        value: float,
2103        score_id: Optional[str] = None,
2104        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
2105        comment: Optional[str] = None,
2106        config_id: Optional[str] = None,
2107    ) -> None: ...
2108
2109    @overload
2110    def score_current_span(
2111        self,
2112        *,
2113        name: str,
2114        value: str,
2115        score_id: Optional[str] = None,
2116        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
2117        comment: Optional[str] = None,
2118        config_id: Optional[str] = None,
2119    ) -> None: ...
2120
2121    def score_current_span(
2122        self,
2123        *,
2124        name: str,
2125        value: Union[float, str],
2126        score_id: Optional[str] = None,
2127        data_type: Optional[ScoreDataType] = None,
2128        comment: Optional[str] = None,
2129        config_id: Optional[str] = None,
2130    ) -> None:
2131        """Create a score for the current active span.
2132
2133        This method scores the currently active span in the context. It's a convenient
2134        way to score the current operation without needing to know its trace and span IDs.
2135
2136        Args:
2137            name: Name of the score (e.g., "relevance", "accuracy")
2138            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2139            score_id: Optional custom ID for the score (auto-generated if not provided)
2140            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2141            comment: Optional comment or explanation for the score
2142            config_id: Optional ID of a score config defined in Langfuse
2143
2144        Example:
2145            ```python
2146            with langfuse.start_as_current_generation(name="answer-query") as generation:
2147                # Generate answer
2148                response = generate_answer(...)
2149                generation.update(output=response)
2150
2151                # Score the generation
2152                langfuse.score_current_span(
2153                    name="relevance",
2154                    value=0.85,
2155                    data_type="NUMERIC",
2156                    comment="Mostly relevant but contains some tangential information"
2157                )
2158            ```
2159        """
2160        current_span = self._get_current_otel_span()
2161
2162        if current_span is not None:
2163            trace_id = self._get_otel_trace_id(current_span)
2164            observation_id = self._get_otel_span_id(current_span)
2165
2166            langfuse_logger.info(
2167                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
2168            )
2169
2170            self.create_score(
2171                trace_id=trace_id,
2172                observation_id=observation_id,
2173                name=name,
2174                value=cast(str, value),
2175                score_id=score_id,
2176                data_type=cast(Literal["CATEGORICAL"], data_type),
2177                comment=comment,
2178                config_id=config_id,
2179            )
2180
2181    @overload
2182    def score_current_trace(
2183        self,
2184        *,
2185        name: str,
2186        value: float,
2187        score_id: Optional[str] = None,
2188        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
2189        comment: Optional[str] = None,
2190        config_id: Optional[str] = None,
2191    ) -> None: ...
2192
2193    @overload
2194    def score_current_trace(
2195        self,
2196        *,
2197        name: str,
2198        value: str,
2199        score_id: Optional[str] = None,
2200        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
2201        comment: Optional[str] = None,
2202        config_id: Optional[str] = None,
2203    ) -> None: ...
2204
2205    def score_current_trace(
2206        self,
2207        *,
2208        name: str,
2209        value: Union[float, str],
2210        score_id: Optional[str] = None,
2211        data_type: Optional[ScoreDataType] = None,
2212        comment: Optional[str] = None,
2213        config_id: Optional[str] = None,
2214    ) -> None:
2215        """Create a score for the current trace.
2216
2217        This method scores the trace of the currently active span. Unlike score_current_span,
2218        this method associates the score with the entire trace rather than a specific span.
2219        It's useful for scoring overall performance or quality of the entire operation.
2220
2221        Args:
2222            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2223            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2224            score_id: Optional custom ID for the score (auto-generated if not provided)
2225            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2226            comment: Optional comment or explanation for the score
2227            config_id: Optional ID of a score config defined in Langfuse
2228
2229        Example:
2230            ```python
2231            with langfuse.start_as_current_span(name="process-user-request") as span:
2232                # Process request
2233                result = process_complete_request()
2234                span.update(output=result)
2235
2236                # Score the overall trace
2237                langfuse.score_current_trace(
2238                    name="overall_quality",
2239                    value=0.95,
2240                    data_type="NUMERIC",
2241                    comment="High quality end-to-end response"
2242                )
2243            ```
2244        """
2245        current_span = self._get_current_otel_span()
2246
2247        if current_span is not None:
2248            trace_id = self._get_otel_trace_id(current_span)
2249
2250            langfuse_logger.info(
2251                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2252            )
2253
2254            self.create_score(
2255                trace_id=trace_id,
2256                name=name,
2257                value=cast(str, value),
2258                score_id=score_id,
2259                data_type=cast(Literal["CATEGORICAL"], data_type),
2260                comment=comment,
2261                config_id=config_id,
2262            )
2263
2264    def flush(self) -> None:
2265        """Force flush all pending spans and events to the Langfuse API.
2266
2267        This method manually flushes any pending spans, scores, and other events to the
2268        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2269        before proceeding, without waiting for the automatic flush interval.
2270
2271        Example:
2272            ```python
2273            # Record some spans and scores
2274            with langfuse.start_as_current_span(name="operation") as span:
2275                # Do work...
2276                pass
2277
2278            # Ensure all data is sent to Langfuse before proceeding
2279            langfuse.flush()
2280
2281            # Continue with other work
2282            ```
2283        """
2284        if self._resources is not None:
2285            self._resources.flush()
2286
2287    def shutdown(self) -> None:
2288        """Shut down the Langfuse client and flush all pending data.
2289
2290        This method cleanly shuts down the Langfuse client, ensuring all pending data
2291        is flushed to the API and all background threads are properly terminated.
2292
2293        It's important to call this method when your application is shutting down to
2294        prevent data loss and resource leaks. For most applications, using the client
2295        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2296
2297        Example:
2298            ```python
2299            # Initialize Langfuse
2300            langfuse = Langfuse(public_key="...", secret_key="...")
2301
2302            # Use Langfuse throughout your application
2303            # ...
2304
2305            # When application is shutting down
2306            langfuse.shutdown()
2307            ```
2308        """
2309        if self._resources is not None:
2310            self._resources.shutdown()
2311
2312    def get_current_trace_id(self) -> Optional[str]:
2313        """Get the trace ID of the current active span.
2314
2315        This method retrieves the trace ID from the currently active span in the context.
2316        It can be used to get the trace ID for referencing in logs, external systems,
2317        or for creating related operations.
2318
2319        Returns:
2320            The current trace ID as a 32-character lowercase hexadecimal string,
2321            or None if there is no active span.
2322
2323        Example:
2324            ```python
2325            with langfuse.start_as_current_span(name="process-request") as span:
2326                # Get the current trace ID for reference
2327                trace_id = langfuse.get_current_trace_id()
2328
2329                # Use it for external correlation
2330                log.info(f"Processing request with trace_id: {trace_id}")
2331
2332                # Or pass to another system
2333                external_system.process(data, trace_id=trace_id)
2334            ```
2335        """
2336        if not self._tracing_enabled:
2337            langfuse_logger.debug(
2338                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2339            )
2340            return None
2341
2342        current_otel_span = self._get_current_otel_span()
2343
2344        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None
2345
2346    def get_current_observation_id(self) -> Optional[str]:
2347        """Get the observation ID (span ID) of the current active span.
2348
2349        This method retrieves the observation ID from the currently active span in the context.
2350        It can be used to get the observation ID for referencing in logs, external systems,
2351        or for creating scores or other related operations.
2352
2353        Returns:
2354            The current observation ID as a 16-character lowercase hexadecimal string,
2355            or None if there is no active span.
2356
2357        Example:
2358            ```python
2359            with langfuse.start_as_current_span(name="process-user-query") as span:
2360                # Get the current observation ID
2361                observation_id = langfuse.get_current_observation_id()
2362
2363                # Store it for later reference
2364                cache.set(f"query_{query_id}_observation", observation_id)
2365
2366                # Process the query...
2367            ```
2368        """
2369        if not self._tracing_enabled:
2370            langfuse_logger.debug(
2371                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2372            )
2373            return None
2374
2375        current_otel_span = self._get_current_otel_span()
2376
2377        return self._get_otel_span_id(current_otel_span) if current_otel_span else None
2378
2379    def _get_project_id(self) -> Optional[str]:
2380        """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for api keys."""
2381        if not self._project_id:
2382            proj = self.api.projects.get()
2383            if not proj.data or not proj.data[0].id:
2384                return None
2385
2386            self._project_id = proj.data[0].id
2387
2388        return self._project_id
2389
2390    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2391        """Get the URL to view a trace in the Langfuse UI.
2392
2393        This method generates a URL that links directly to a trace in the Langfuse UI.
2394        It's useful for providing links in logs, notifications, or debugging tools.
2395
2396        Args:
2397            trace_id: Optional trace ID to generate a URL for. If not provided,
2398                     the trace ID of the current active span will be used.
2399
2400        Returns:
2401            A URL string pointing to the trace in the Langfuse UI,
2402            or None if the project ID couldn't be retrieved or no trace ID is available.
2403
2404        Example:
2405            ```python
2406            # Get URL for the current trace
2407            with langfuse.start_as_current_span(name="process-request") as span:
2408                trace_url = langfuse.get_trace_url()
2409                log.info(f"Processing trace: {trace_url}")
2410
2411            # Get URL for a specific trace
2412            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2413            send_notification(f"Review needed for trace: {specific_trace_url}")
2414            ```
2415        """
2416        project_id = self._get_project_id()
2417        final_trace_id = trace_id or self.get_current_trace_id()
2418
2419        return (
2420            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2421            if project_id and final_trace_id
2422            else None
2423        )
2424
2425    def get_dataset(
2426        self, name: str, *, fetch_items_page_size: Optional[int] = 50
2427    ) -> "DatasetClient":
2428        """Fetch a dataset by its name.
2429
2430        Args:
2431            name (str): The name of the dataset to fetch.
2432            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2433
2434        Returns:
2435            DatasetClient: The dataset with the given name.
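
        Example:
            A minimal sketch; the dataset name is illustrative and items are exposed
            via `dataset.items`:

            ```python
            dataset = langfuse.get_dataset("my-eval-dataset")

            for item in dataset.items:
                print(item.input, item.expected_output)
            ```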
2436        """
2437        try:
2438            langfuse_logger.debug(f"Getting dataset {name}")
2439            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2440
2441            dataset_items = []
2442            page = 1
2443
2444            while True:
2445                new_items = self.api.dataset_items.list(
2446                    dataset_name=self._url_encode(name, is_url_param=True),
2447                    page=page,
2448                    limit=fetch_items_page_size,
2449                )
2450                dataset_items.extend(new_items.data)
2451
2452                if new_items.meta.total_pages <= page:
2453                    break
2454
2455                page += 1
2456
2457            items = [DatasetItemClient(i, langfuse=self) for i in dataset_items]
2458
2459            return DatasetClient(dataset, items=items)
2460
2461        except Error as e:
2462            handle_fern_exception(e)
2463            raise e
2464
2465    def run_experiment(
2466        self,
2467        *,
2468        name: str,
2469        run_name: Optional[str] = None,
2470        description: Optional[str] = None,
2471        data: ExperimentData,
2472        task: TaskFunction,
2473        evaluators: List[EvaluatorFunction] = [],
2474        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2475        run_evaluators: List[RunEvaluatorFunction] = [],
2476        max_concurrency: int = 50,
2477        metadata: Optional[Dict[str, str]] = None,
2478    ) -> ExperimentResult:
2479        """Run an experiment on a dataset with automatic tracing and evaluation.
2480
2481        This method executes a task function on each item in the provided dataset,
2482        automatically traces all executions with Langfuse for observability, runs
2483        item-level and run-level evaluators on the outputs, and returns comprehensive
2484        results with evaluation metrics.
2485
2486        The experiment system provides:
2487        - Automatic tracing of all task executions
2488        - Concurrent processing with configurable limits
2489        - Comprehensive error handling that isolates failures
2490        - Integration with Langfuse datasets for experiment tracking
2491        - Flexible evaluation framework supporting both sync and async evaluators
2492
2493        Args:
2494            name: Human-readable name for the experiment. Used for identification
2495                in the Langfuse UI.
2496            run_name: Optional exact name for the experiment run. If provided, it is used
2497                verbatim as the dataset run name when `data` contains Langfuse dataset items.
2498                If not provided, it defaults to the experiment name with an ISO timestamp appended.
2499            description: Optional description explaining the experiment's purpose,
2500                methodology, or expected outcomes.
2501            data: Array of data items to process. Can be either:
2502                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2503                - List of Langfuse DatasetItem objects from dataset.items
2504            task: Function that processes each data item and returns output.
2505                Must accept 'item' as keyword argument and can return sync or async results.
2506                The task function signature should be: task(*, item, **kwargs) -> Any
2507            evaluators: List of functions to evaluate each item's output individually.
2508                Each evaluator receives input, output, expected_output, and metadata.
2509                Can return single Evaluation dict or list of Evaluation dicts.
2510            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2511                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2512                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2513                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2514            run_evaluators: List of functions to evaluate the entire experiment run.
2515                Each run evaluator receives all item_results and can compute aggregate metrics.
2516                Useful for calculating averages, distributions, or cross-item comparisons.
2517            max_concurrency: Maximum number of concurrent task executions (default: 50).
2518                Controls the number of items processed simultaneously. Adjust based on
2519                API rate limits and system resources.
2520            metadata: Optional metadata dictionary to attach to all experiment traces.
2521                This metadata will be included in every trace created during the experiment.
2522                If `data` consists of Langfuse dataset items, the metadata is also attached to the dataset run.
2523
2524        Returns:
2525            ExperimentResult containing:
2526            - run_name: The experiment run name. This equals the dataset run name if the experiment ran on a Langfuse dataset.
2527            - item_results: List of results for each processed item with outputs and evaluations
2528            - run_evaluations: List of aggregate evaluation results for the entire run
2529            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2530            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2531
2532        Raises:
2533            ValueError: If required parameters are missing or invalid
2534            Exception: If experiment setup fails (individual item failures are handled gracefully)
2535
2536        Examples:
2537            Basic experiment with local data:
2538            ```python
2539            def summarize_text(*, item, **kwargs):
2540                return f"Summary: {item['input'][:50]}..."
2541
2542            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2543                return {
2544                    "name": "output_length",
2545                    "value": len(output),
2546                    "comment": f"Output contains {len(output)} characters"
2547                }
2548
2549            result = langfuse.run_experiment(
2550                name="Text Summarization Test",
2551                description="Evaluate summarization quality and length",
2552                data=[
2553                    {"input": "Long article text...", "expected_output": "Expected summary"},
2554                    {"input": "Another article...", "expected_output": "Another summary"}
2555                ],
2556                task=summarize_text,
2557                evaluators=[length_evaluator]
2558            )
2559
2560            print(f"Processed {len(result.item_results)} items")
2561            for item_result in result.item_results:
2562                print(f"Input: {item_result.item['input']}")
2563                print(f"Output: {item_result.output}")
2564                print(f"Evaluations: {item_result.evaluations}")
2565            ```
2566
2567            Advanced experiment with async task and multiple evaluators:
2568            ```python
2569            async def llm_task(*, item, **kwargs):
2570                # Simulate async LLM call
2571                response = await openai_client.chat.completions.create(
2572                    model="gpt-4",
2573                    messages=[{"role": "user", "content": item["input"]}]
2574                )
2575                return response.choices[0].message.content
2576
2577            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2578                if expected_output and expected_output.lower() in output.lower():
2579                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2580                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2581
2582            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2583                # Simulate toxicity check
2584                toxicity_score = check_toxicity(output)  # Your toxicity checker
2585                return {
2586                    "name": "toxicity",
2587                    "value": toxicity_score,
2588                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2589                }
2590
2591            def average_accuracy(*, item_results, **kwargs):
2592                accuracies = [
2593                    eval.value for result in item_results
2594                    for eval in result.evaluations
2595                    if eval.name == "accuracy"
2596                ]
2597                return {
2598                    "name": "average_accuracy",
2599                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2600                    "comment": f"Average accuracy across {len(accuracies)} items"
2601                }
2602
2603            result = langfuse.run_experiment(
2604                name="LLM Safety and Accuracy Test",
2605                description="Evaluate model accuracy and safety across diverse prompts",
2606                data=test_dataset,  # Your dataset items
2607                task=llm_task,
2608                evaluators=[accuracy_evaluator, toxicity_evaluator],
2609                run_evaluators=[average_accuracy],
2610                max_concurrency=5,  # Limit concurrent API calls
2611                metadata={"model": "gpt-4", "temperature": 0.7}
2612            )
2613            ```
2614
2615            Using with Langfuse datasets:
2616            ```python
2617            # Get dataset from Langfuse
2618            dataset = langfuse.get_dataset("my-eval-dataset")
2619
2620            result = dataset.run_experiment(
2621                name="Production Model Evaluation",
2622                description="Monthly evaluation of production model performance",
2623                task=my_production_task,
2624                evaluators=[accuracy_evaluator, latency_evaluator]
2625            )
2626
2627            # Results automatically linked to dataset in Langfuse UI
2628            print(f"View results: {result.dataset_run_url}")
2629            ```
2630
2631        Note:
2632            - Task and evaluator functions can be either synchronous or asynchronous
2633            - Individual item failures are logged but don't stop the experiment
2634            - All executions are automatically traced and visible in Langfuse UI
2635            - When using Langfuse datasets, results are automatically linked for easy comparison
2636            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2637            - Async execution is handled automatically with smart event loop detection
2638        """
2639        return cast(
2640            ExperimentResult,
2641            run_async_safely(
2642                self._run_experiment_async(
2643                    name=name,
2644                    run_name=self._create_experiment_run_name(
2645                        name=name, run_name=run_name
2646                    ),
2647                    description=description,
2648                    data=data,
2649                    task=task,
2650                    evaluators=evaluators or [],
2651                    composite_evaluator=composite_evaluator,
2652                    run_evaluators=run_evaluators or [],
2653                    max_concurrency=max_concurrency,
2654                    metadata=metadata,
2655                ),
2656            ),
2657        )
2658
2659    async def _run_experiment_async(
2660        self,
2661        *,
2662        name: str,
2663        run_name: str,
2664        description: Optional[str],
2665        data: ExperimentData,
2666        task: TaskFunction,
2667        evaluators: List[EvaluatorFunction],
2668        composite_evaluator: Optional[CompositeEvaluatorFunction],
2669        run_evaluators: List[RunEvaluatorFunction],
2670        max_concurrency: int,
2671        metadata: Optional[Dict[str, Any]] = None,
2672    ) -> ExperimentResult:
2673        langfuse_logger.debug(
2674            f"Starting experiment '{name}' run '{run_name}' with {len(data)} items"
2675        )
2676
2677        # Set up concurrency control
2678        semaphore = asyncio.Semaphore(max_concurrency)
2679
2680        # Process all items
2681        async def process_item(item: ExperimentItem) -> ExperimentItemResult:
2682            async with semaphore:
2683                return await self._process_experiment_item(
2684                    item,
2685                    task,
2686                    evaluators,
2687                    composite_evaluator,
2688                    name,
2689                    run_name,
2690                    description,
2691                    metadata,
2692                )
2693
2694        # Run all items concurrently
2695        tasks = [process_item(item) for item in data]
2696        item_results = await asyncio.gather(*tasks, return_exceptions=True)
2697
2698        # Filter out any exceptions and log errors
2699        valid_results: List[ExperimentItemResult] = []
2700        for i, result in enumerate(item_results):
2701            if isinstance(result, Exception):
2702                langfuse_logger.error(f"Item {i} failed: {result}")
2703            elif isinstance(result, ExperimentItemResult):
2704                valid_results.append(result)  # type: ignore
2705
2706        # Run experiment-level evaluators
2707        run_evaluations: List[Evaluation] = []
2708        for run_evaluator in run_evaluators:
2709            try:
2710                evaluations = await _run_evaluator(
2711                    run_evaluator, item_results=valid_results
2712                )
2713                run_evaluations.extend(evaluations)
2714            except Exception as e:
2715                langfuse_logger.error(f"Run evaluator failed: {e}")
2716
2717        # Generate dataset run URL if applicable
2718        dataset_run_id = valid_results[0].dataset_run_id if valid_results else None
2719        dataset_run_url = None
2720        if dataset_run_id and data:
2721            try:
2722                # Check if the first item has dataset_id (for DatasetItem objects)
2723                first_item = data[0]
2724                dataset_id = None
2725
2726                if hasattr(first_item, "dataset_id"):
2727                    dataset_id = getattr(first_item, "dataset_id", None)
2728
2729                if dataset_id:
2730                    project_id = self._get_project_id()
2731
2732                    if project_id:
2733                        dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}"
2734
2735            except Exception:
2736                pass  # URL generation is optional
2737
2738        # Store run-level evaluations as scores
2739        for evaluation in run_evaluations:
2740            try:
2741                if dataset_run_id:
2742                    self.create_score(
2743                        dataset_run_id=dataset_run_id,
2744                        name=evaluation.name or "<unknown>",
2745                        value=evaluation.value,  # type: ignore
2746                        comment=evaluation.comment,
2747                        metadata=evaluation.metadata,
2748                        data_type=evaluation.data_type,  # type: ignore
2749                        config_id=evaluation.config_id,
2750                    )
2751
2752            except Exception as e:
2753                langfuse_logger.error(f"Failed to store run evaluation: {e}")
2754
2755        # Flush scores and traces
2756        self.flush()
2757
2758        return ExperimentResult(
2759            name=name,
2760            run_name=run_name,
2761            description=description,
2762            item_results=valid_results,
2763            run_evaluations=run_evaluations,
2764            dataset_run_id=dataset_run_id,
2765            dataset_run_url=dataset_run_url,
2766        )
2767
2768    async def _process_experiment_item(
2769        self,
2770        item: ExperimentItem,
2771        task: Callable,
2772        evaluators: List[Callable],
2773        composite_evaluator: Optional[CompositeEvaluatorFunction],
2774        experiment_name: str,
2775        experiment_run_name: str,
2776        experiment_description: Optional[str],
2777        experiment_metadata: Optional[Dict[str, Any]] = None,
2778    ) -> ExperimentItemResult:
2779        span_name = "experiment-item-run"
2780
2781        with self.start_as_current_span(name=span_name) as span:
2782            try:
2783                input_data = (
2784                    item.get("input")
2785                    if isinstance(item, dict)
2786                    else getattr(item, "input", None)
2787                )
2788
2789                if input_data is None:
2790                    raise ValueError("Experiment Item is missing input. Skipping item.")
2791
2792                expected_output = (
2793                    item.get("expected_output")
2794                    if isinstance(item, dict)
2795                    else getattr(item, "expected_output", None)
2796                )
2797
2798                item_metadata = (
2799                    item.get("metadata")
2800                    if isinstance(item, dict)
2801                    else getattr(item, "metadata", None)
2802                )
2803
2804                final_observation_metadata = {
2805                    "experiment_name": experiment_name,
2806                    "experiment_run_name": experiment_run_name,
2807                    **(experiment_metadata or {}),
2808                }
2809
2810                trace_id = span.trace_id
2811                dataset_id = None
2812                dataset_item_id = None
2813                dataset_run_id = None
2814
2815                # Link to dataset run if this is a dataset item
2816                if hasattr(item, "id") and hasattr(item, "dataset_id"):
2817                    try:
2818                        # Use sync API to avoid event loop issues when run_async_safely
2819                        # creates multiple event loops across different threads
2820                        dataset_run_item = await asyncio.to_thread(
2821                            self.api.dataset_run_items.create,
2822                            request=CreateDatasetRunItemRequest(
2823                                runName=experiment_run_name,
2824                                runDescription=experiment_description,
2825                                metadata=experiment_metadata,
2826                                datasetItemId=item.id,  # type: ignore
2827                                traceId=trace_id,
2828                                observationId=span.id,
2829                            ),
2830                        )
2831
2832                        dataset_run_id = dataset_run_item.dataset_run_id
2833
2834                    except Exception as e:
2835                        langfuse_logger.error(f"Failed to create dataset run item: {e}")
2836
2837                if (
2838                    not isinstance(item, dict)
2839                    and hasattr(item, "dataset_id")
2840                    and hasattr(item, "id")
2841                ):
2842                    dataset_id = item.dataset_id
2843                    dataset_item_id = item.id
2844
2845                    final_observation_metadata.update(
2846                        {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id}
2847                    )
2848
2849                if isinstance(item_metadata, dict):
2850                    final_observation_metadata.update(item_metadata)
2851
2852                experiment_id = dataset_run_id or self._create_observation_id()
2853                experiment_item_id = (
2854                    dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16]
2855                )
2856                span._otel_span.set_attributes(
2857                    {
2858                        k: v
2859                        for k, v in {
2860                            LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT,
2861                            LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description,
2862                            LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize(
2863                                expected_output
2864                            ),
2865                        }.items()
2866                        if v is not None
2867                    }
2868                )
2869
2870                with _propagate_attributes(
2871                    experiment=PropagatedExperimentAttributes(
2872                        experiment_id=experiment_id,
2873                        experiment_name=experiment_run_name,
2874                        experiment_metadata=_serialize(experiment_metadata),
2875                        experiment_dataset_id=dataset_id,
2876                        experiment_item_id=experiment_item_id,
2877                        experiment_item_metadata=_serialize(item_metadata),
2878                        experiment_item_root_observation_id=span.id,
2879                    )
2880                ):
2881                    output = await _run_task(task, item)
2882
2883                span.update(
2884                    input=input_data,
2885                    output=output,
2886                    metadata=final_observation_metadata,
2887                )
2888
2889            except Exception as e:
2890                span.update(
2891                    output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
2892                )
2893                raise e
2894
2895        # Run evaluators
2896        evaluations = []
2897
2898        for evaluator in evaluators:
2899            try:
2900                eval_metadata: Optional[Dict[str, Any]] = None
2901
2902                if isinstance(item, dict):
2903                    eval_metadata = item.get("metadata")
2904                elif hasattr(item, "metadata"):
2905                    eval_metadata = item.metadata
2906
2907                eval_results = await _run_evaluator(
2908                    evaluator,
2909                    input=input_data,
2910                    output=output,
2911                    expected_output=expected_output,
2912                    metadata=eval_metadata,
2913                )
2914                evaluations.extend(eval_results)
2915
2916                # Store evaluations as scores
2917                for evaluation in eval_results:
2918                    self.create_score(
2919                        trace_id=trace_id,
2920                        observation_id=span.id,
2921                        name=evaluation.name,
2922                        value=evaluation.value,  # type: ignore
2923                        comment=evaluation.comment,
2924                        metadata=evaluation.metadata,
2925                        config_id=evaluation.config_id,
2926                        data_type=evaluation.data_type,  # type: ignore
2927                    )
2928
2929            except Exception as e:
2930                langfuse_logger.error(f"Evaluator failed: {e}")
2931
2932        # Run composite evaluator if provided and we have evaluations
2933        if composite_evaluator and evaluations:
2934            try:
2935                composite_eval_metadata: Optional[Dict[str, Any]] = None
2936                if isinstance(item, dict):
2937                    composite_eval_metadata = item.get("metadata")
2938                elif hasattr(item, "metadata"):
2939                    composite_eval_metadata = item.metadata
2940
2941                result = composite_evaluator(
2942                    input=input_data,
2943                    output=output,
2944                    expected_output=expected_output,
2945                    metadata=composite_eval_metadata,
2946                    evaluations=evaluations,
2947                )
2948
2949                # Handle async composite evaluators
2950                if asyncio.iscoroutine(result):
2951                    result = await result
2952
2953                # Normalize to list
2954                composite_evals: List[Evaluation] = []
2955                if isinstance(result, (dict, Evaluation)):
2956                    composite_evals = [result]  # type: ignore
2957                elif isinstance(result, list):
2958                    composite_evals = result  # type: ignore
2959
2960                # Store composite evaluations as scores and add to evaluations list
2961                for composite_evaluation in composite_evals:
2962                    self.create_score(
2963                        trace_id=trace_id,
2964                        observation_id=span.id,
2965                        name=composite_evaluation.name,
2966                        value=composite_evaluation.value,  # type: ignore
2967                        comment=composite_evaluation.comment,
2968                        metadata=composite_evaluation.metadata,
2969                        config_id=composite_evaluation.config_id,
2970                        data_type=composite_evaluation.data_type,  # type: ignore
2971                    )
2972                    evaluations.append(composite_evaluation)
2973
2974            except Exception as e:
2975                langfuse_logger.error(f"Composite evaluator failed: {e}")
2976
2977        return ExperimentItemResult(
2978            item=item,
2979            output=output,
2980            evaluations=evaluations,
2981            trace_id=trace_id,
2982            dataset_run_id=dataset_run_id,
2983        )
2984
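As the call site in `_process_experiment_item` above shows, a `composite_evaluator` passed to `run_experiment` is invoked with the keyword arguments `input`, `output`, `expected_output`, `metadata`, and `evaluations`, and its result is normalized to a list of evaluations. A minimal sketch that combines item-level scores with an illustrative weighting (the weights and evaluator names are assumptions):

```python
from langfuse import Evaluation

def weighted_composite(*, input, output, expected_output, metadata, evaluations):
    # Illustrative weights keyed by evaluator name; adjust to your own metrics.
    weights = {"accuracy": 0.7, "output_length": 0.3}
    total = sum(
        e.value * weights.get(e.name, 0.0)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(
        name="weighted_quality",
        value=total,
        comment=f"Weighted combination of {len(evaluations)} item-level evaluations",
    )
```

Pass it as `composite_evaluator=weighted_composite` to `run_experiment`; the resulting score is stored alongside the item-level evaluations.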
2985    def _create_experiment_run_name(
2986        self, *, name: Optional[str] = None, run_name: Optional[str] = None
2987    ) -> str:
2988        if run_name:
2989            return run_name
2990
2991        iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z")
2992
2993        return f"{name} - {iso_timestamp}"
2994
2995    def run_batched_evaluation(
2996        self,
2997        *,
2998        scope: Literal["traces", "observations"],
2999        mapper: MapperFunction,
3000        filter: Optional[str] = None,
3001        fetch_batch_size: int = 50,
3002        max_items: Optional[int] = None,
3003        max_retries: int = 3,
3004        evaluators: List[EvaluatorFunction],
3005        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
3006        max_concurrency: int = 50,
3007        metadata: Optional[Dict[str, Any]] = None,
3008        resume_from: Optional[BatchEvaluationResumeToken] = None,
3009        verbose: bool = False,
3010    ) -> BatchEvaluationResult:
3011        """Fetch traces or observations and run evaluations on each item.
3012
3013        This method provides a powerful way to evaluate existing data in Langfuse at scale.
3014        It fetches items based on filters, transforms them using a mapper function, runs
3015        evaluators on each item, and creates scores that are linked back to the original
3016        entities. This is ideal for:
3017
3018        - Running evaluations on production traces after deployment
3019        - Backtesting new evaluation metrics on historical data
3020        - Batch scoring of observations for quality monitoring
3021        - Periodic evaluation runs on recent data
3022
3023        The method uses a streaming/pipeline approach to process items in batches, making
3024        it memory-efficient for large datasets. It includes comprehensive error handling,
3025        retry logic, and resume capability for long-running evaluations.
3026
3027        Args:
3028            scope: The type of items to evaluate. Must be one of:
3029                - "traces": Evaluate complete traces with all their observations
3030                - "observations": Evaluate individual observations (spans, generations, events)
3031            mapper: Function that transforms API response objects into evaluator inputs.
3032                Receives a trace/observation object and returns an EvaluatorInputs
3033                instance with input, output, expected_output, and metadata fields.
3034                Can be sync or async.
3035            evaluators: List of evaluation functions to run on each item. Each evaluator
3036                receives the mapped inputs and returns Evaluation object(s). Evaluator
3037                failures are logged but don't stop the batch evaluation.
3038            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
3039                - '{"tags": ["production"]}'
3040                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
3041                Default: None (fetches all items).
3042            fetch_batch_size: Number of items to fetch per API call and hold in memory.
3043                Larger values may be faster but use more memory. Default: 50.
3044            max_items: Maximum total number of items to process. If None, processes all
3045                items matching the filter. Useful for testing or limiting evaluation runs.
3046                Default: None (process all).
3047            max_concurrency: Maximum number of items to evaluate concurrently. Controls
3048                parallelism and resource usage. Default: 50.
3049            composite_evaluator: Optional function that creates a composite score from
3050                item-level evaluations. Receives the original item and its evaluations,
3051                returns a single Evaluation. Useful for weighted averages or combined metrics.
3052                Default: None.
3053            metadata: Optional metadata dict to add to all created scores. Useful for
3054                tracking evaluation runs, versions, or other context. Default: None.
3055            max_retries: Maximum number of retry attempts for failed batch fetches.
3056                Uses exponential backoff (1s, 2s, 4s). Default: 3.
3057            verbose: If True, logs progress information to console. Useful for monitoring
3058                long-running evaluations. Default: False.
3059            resume_from: Optional resume token from a previous incomplete run. Allows
3060                continuing evaluation after interruption or failure. Default: None.
3061
3062
3063        Returns:
3064            BatchEvaluationResult containing:
3065                - total_items_fetched: Number of items fetched from API
3066                - total_items_processed: Number of items successfully evaluated
3067                - total_items_failed: Number of items that failed evaluation
3068                - total_scores_created: Scores created by item-level evaluators
3069                - total_composite_scores_created: Scores created by composite evaluator
3070                - total_evaluations_failed: Individual evaluator failures
3071                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3072                - resume_token: Token for resuming if incomplete (None if completed)
3073                - completed: True if all items processed
3074                - duration_seconds: Total execution time
3075                - failed_item_ids: IDs of items that failed
3076                - error_summary: Error types and counts
3077                - has_more_items: True if max_items reached but more exist
3078
3079        Raises:
3080            ValueError: If invalid scope is provided.
3081
3082        Examples:
3083            Basic trace evaluation:
3084            ```python
3085            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3086
3087            client = Langfuse()
3088
3089            # Define mapper to extract fields from traces
3090            def trace_mapper(trace):
3091                return EvaluatorInputs(
3092                    input=trace.input,
3093                    output=trace.output,
3094                    expected_output=None,
3095                    metadata={"trace_id": trace.id}
3096                )
3097
3098            # Define evaluator
3099            def length_evaluator(*, input, output, expected_output, metadata):
3100                return Evaluation(
3101                    name="output_length",
3102                    value=len(output) if output else 0
3103                )
3104
3105            # Run batch evaluation
3106            result = client.run_batched_evaluation(
3107                scope="traces",
3108                mapper=trace_mapper,
3109                evaluators=[length_evaluator],
3110                filter='{"tags": ["production"]}',
3111                max_items=1000,
3112                verbose=True
3113            )
3114
3115            print(f"Processed {result.total_items_processed} traces")
3116            print(f"Created {result.total_scores_created} scores")
3117            ```
3118
3119            Evaluation with composite scorer:
3120            ```python
3121            def accuracy_evaluator(*, input, output, expected_output, metadata):
3122                # ... evaluation logic
3123                return Evaluation(name="accuracy", value=0.85)
3124
3125            def relevance_evaluator(*, input, output, expected_output, metadata):
3126                # ... evaluation logic
3127                return Evaluation(name="relevance", value=0.92)
3128
3129            def composite_evaluator(*, item, evaluations):
3130                # Weighted average of evaluations
3131                weights = {"accuracy": 0.6, "relevance": 0.4}
3132                total = sum(
3133                    e.value * weights.get(e.name, 0)
3134                    for e in evaluations
3135                    if isinstance(e.value, (int, float))
3136                )
3137                return Evaluation(
3138                    name="composite_score",
3139                    value=total,
3140                    comment=f"Weighted average of {len(evaluations)} metrics"
3141                )
3142
3143            result = client.run_batched_evaluation(
3144                scope="traces",
3145                mapper=trace_mapper,
3146                evaluators=[accuracy_evaluator, relevance_evaluator],
3147                composite_evaluator=composite_evaluator,
3148                filter='{"user_id": "important_user"}',
3149                verbose=True
3150            )
3151            ```
3152
3153            Handling incomplete runs with resume:
3154            ```python
3155            # Initial run that may fail or timeout
3156            result = client.run_batched_evaluation(
3157                scope="observations",
3158                mapper=obs_mapper,
3159                evaluators=[my_evaluator],
3160                max_items=10000,
3161                verbose=True
3162            )
3163
3164            # Check if incomplete
3165            if not result.completed and result.resume_token:
3166                print(f"Processed {result.resume_token.items_processed} items before interruption")
3167
3168                # Resume from where it left off
3169                result = client.run_batched_evaluation(
3170                    scope="observations",
3171                    mapper=obs_mapper,
3172                    evaluators=[my_evaluator],
3173                    resume_from=result.resume_token,
3174                    verbose=True
3175                )
3176
3177            print(f"Total items processed: {result.total_items_processed}")
3178            ```
3179
3180            Monitoring evaluator performance:
3181            ```python
3182            result = client.run_batched_evaluation(...)
3183
3184            for stats in result.evaluator_stats:
3185                success_rate = stats.successful_runs / stats.total_runs
3186                print(f"{stats.name}:")
3187                print(f"  Success rate: {success_rate:.1%}")
3188                print(f"  Scores created: {stats.total_scores_created}")
3189
3190                if stats.failed_runs > 0:
3191                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3192            ```
3193
3194        Note:
3195            - Evaluator failures are logged but don't stop the batch evaluation
3196            - Individual item failures are tracked but don't stop processing
3197            - Fetch failures are retried with exponential backoff
3198            - All scores are automatically flushed to Langfuse at the end
3199            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3200        """
3201        runner = BatchEvaluationRunner(self)
3202
3203        return cast(
3204            BatchEvaluationResult,
3205            run_async_safely(
3206                runner.run_async(
3207                    scope=scope,
3208                    mapper=mapper,
3209                    evaluators=evaluators,
3210                    filter=filter,
3211                    fetch_batch_size=fetch_batch_size,
3212                    max_items=max_items,
3213                    max_concurrency=max_concurrency,
3214                    composite_evaluator=composite_evaluator,
3215                    metadata=metadata,
3216                    max_retries=max_retries,
3217                    verbose=verbose,
3218                    resume_from=resume_from,
3219                )
3220            ),
3221        )
3222
3223    def auth_check(self) -> bool:
3224        """Check if the provided credentials (public and secret key) are valid.
3225
3226        Raises:
3227            Exception: If no projects were found for the provided credentials.
3228
3229        Note:
3230            This method is blocking. Using it in production code is discouraged.
3231        """
3232        try:
3233            projects = self.api.projects.get()
3234            langfuse_logger.debug(
3235                f"Auth check successful, found {len(projects.data)} projects"
3236            )
3237            if len(projects.data) == 0:
3238                raise Exception(
3239                    "Auth check failed, no project found for the keys provided."
3240                )
3241            return True
3242
3243        except AttributeError as e:
3244            langfuse_logger.warning(
3245                f"Auth check failed: Client not properly initialized. Error: {e}"
3246            )
3247            return False
3248
3249        except Error as e:
3250            handle_fern_exception(e)
3251            raise e
3252
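A quick startup check might look like this (a minimal sketch; the call blocks on an API request, so avoid running it per request):

```python
from langfuse import Langfuse

langfuse = Langfuse()

if not langfuse.auth_check():
    raise RuntimeError("Langfuse credentials are invalid or the client is misconfigured.")
```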
3253    def create_dataset(
3254        self,
3255        *,
3256        name: str,
3257        description: Optional[str] = None,
3258        metadata: Optional[Any] = None,
3259        input_schema: Optional[Any] = None,
3260        expected_output_schema: Optional[Any] = None,
3261    ) -> Dataset:
3262        """Create a dataset with the given name on Langfuse.
3263
3264        Args:
3265            name: Name of the dataset to create.
3266            description: Description of the dataset. Defaults to None.
3267            metadata: Additional metadata. Defaults to None.
3268            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3269            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3270
3271        Returns:
3272            Dataset: The created dataset as returned by the Langfuse API.
3273        """
3274        try:
3275            body = CreateDatasetRequest(
3276                name=name,
3277                description=description,
3278                metadata=metadata,
3279                inputSchema=input_schema,
3280                expectedOutputSchema=expected_output_schema,
3281            )
3282            langfuse_logger.debug(f"Creating dataset {body}")
3283
3284            return self.api.datasets.create(request=body)
3285
3286        except Error as e:
3287            handle_fern_exception(e)
3288            raise e
3289
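The docstring above explains `input_schema` and `expected_output_schema` but shows no example. A minimal sketch of a schema-validated dataset (the schemas and names are illustrative, assuming an initialized `langfuse` client):

```python
langfuse.create_dataset(
    name="capital_cities",
    description="Country-to-capital QA pairs",
    metadata={"owner": "eval-team"},
    # Items added later must validate against these JSON Schemas
    input_schema={
        "type": "object",
        "properties": {"country": {"type": "string"}},
        "required": ["country"],
    },
    expected_output_schema={"type": "string"},
)
```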
3290    def create_dataset_item(
3291        self,
3292        *,
3293        dataset_name: str,
3294        input: Optional[Any] = None,
3295        expected_output: Optional[Any] = None,
3296        metadata: Optional[Any] = None,
3297        source_trace_id: Optional[str] = None,
3298        source_observation_id: Optional[str] = None,
3299        status: Optional[DatasetStatus] = None,
3300        id: Optional[str] = None,
3301    ) -> DatasetItem:
3302        """Create a dataset item.
3303
3304        Upserts if an item with id already exists.
3305
3306        Args:
3307            dataset_name: Name of the dataset in which the dataset item should be created.
3308            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3309            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3310            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3311            source_trace_id: Id of the source trace. Defaults to None.
3312            source_observation_id: Id of the source observation. Defaults to None.
3313            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3314            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3315
3316        Returns:
3317            DatasetItem: The created dataset item as returned by the Langfuse API.
3318
3319        Example:
3320            ```python
3321            from langfuse import Langfuse
3322
3323            langfuse = Langfuse()
3324
3325            # Uploading items to the Langfuse dataset named "capital_cities"
3326            langfuse.create_dataset_item(
3327                dataset_name="capital_cities",
3328                input={"input": {"country": "Italy"}},
3329                expected_output={"expected_output": "Rome"},
3330                metadata={"foo": "bar"}
3331            )
3332            ```
3333        """
3334        try:
3335            body = CreateDatasetItemRequest(
3336                datasetName=dataset_name,
3337                input=input,
3338                expectedOutput=expected_output,
3339                metadata=metadata,
3340                sourceTraceId=source_trace_id,
3341                sourceObservationId=source_observation_id,
3342                status=status,
3343                id=id,
3344            )
3345            langfuse_logger.debug(f"Creating dataset item {body}")
3346            return self.api.dataset_items.create(request=body)
3347        except Error as e:
3348            handle_fern_exception(e)
3349            raise e
3350
3351    def resolve_media_references(
3352        self,
3353        *,
3354        obj: Any,
3355        resolve_with: Literal["base64_data_uri"],
3356        max_depth: int = 10,
3357        content_fetch_timeout_seconds: int = 5,
3358    ) -> Any:
3359        """Replace media reference strings in an object with base64 data URIs.
3360
3361        This method recursively traverses an object (up to max_depth) looking for media reference strings
3362        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3363        the provided Langfuse client and replaces the reference string with a base64 data URI.
3364
3365        If fetching media content fails for a reference string, a warning is logged and the reference
3366        string is left unchanged.
3367
3368        Args:
3369            obj: The object to process. Can be a primitive value, array, or nested object.
3370                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3371            resolve_with: The representation of the media content to replace the media reference string with.
3372                Currently only "base64_data_uri" is supported.
3373            max_depth: int: The maximum depth to traverse the object. Default is 10.
3374            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3375
3376        Returns:
3377            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3378            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3379
3380        Example:
3381            obj = {
3382                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3383                "nested": {
3384                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3385                }
3386            }
3387
3388            result = langfuse.resolve_media_references(obj=obj, resolve_with="base64_data_uri")
3389
3390            # Result:
3391            # {
3392            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3393            #     "nested": {
3394            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3395            #     }
3396            # }
3397        """
3398        return LangfuseMedia.resolve_media_references(
3399            langfuse_client=self,
3400            obj=obj,
3401            resolve_with=resolve_with,
3402            max_depth=max_depth,
3403            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3404        )
3405
3406    @overload
3407    def get_prompt(
3408        self,
3409        name: str,
3410        *,
3411        version: Optional[int] = None,
3412        label: Optional[str] = None,
3413        type: Literal["chat"],
3414        cache_ttl_seconds: Optional[int] = None,
3415        fallback: Optional[List[ChatMessageDict]] = None,
3416        max_retries: Optional[int] = None,
3417        fetch_timeout_seconds: Optional[int] = None,
3418    ) -> ChatPromptClient: ...
3419
3420    @overload
3421    def get_prompt(
3422        self,
3423        name: str,
3424        *,
3425        version: Optional[int] = None,
3426        label: Optional[str] = None,
3427        type: Literal["text"] = "text",
3428        cache_ttl_seconds: Optional[int] = None,
3429        fallback: Optional[str] = None,
3430        max_retries: Optional[int] = None,
3431        fetch_timeout_seconds: Optional[int] = None,
3432    ) -> TextPromptClient: ...
3433
3434    def get_prompt(
3435        self,
3436        name: str,
3437        *,
3438        version: Optional[int] = None,
3439        label: Optional[str] = None,
3440        type: Literal["chat", "text"] = "text",
3441        cache_ttl_seconds: Optional[int] = None,
3442        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3443        max_retries: Optional[int] = None,
3444        fetch_timeout_seconds: Optional[int] = None,
3445    ) -> PromptClient:
3446        """Get a prompt.
3447
3448        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3449        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3450        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3451        return the expired prompt as a fallback.
3452
3453        Args:
3454            name (str): The name of the prompt to retrieve.
3455
3456        Keyword Args:
3457            version (Optional[int]): The version of the prompt to retrieve. If neither version nor label is specified, the prompt with the `production` label is returned. Specify either version or label, not both.
3458            label: Optional[str]: The label of the prompt to retrieve. If neither version nor label is specified, the prompt with the `production` label is returned. Specify either version or label, not both.
3459            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3460            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3461            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3462            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3463            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3464            fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the SDK-wide default timeout of 5 seconds.
3465
3466        Returns:
3467            The prompt object retrieved from the cache, or fetched from the server if it is not cached or has expired. The return type is:
3468            - TextPromptClient, if type argument is 'text'.
3469            - ChatPromptClient, if type argument is 'chat'.
3470
3471        Raises:
3472            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3473            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3474        """
3475        if self._resources is None:
3476            raise Error(
3477                "SDK is not correctly initialized. Check the init logs for more details."
3478            )
3479        if version is not None and label is not None:
3480            raise ValueError("Cannot specify both version and label at the same time.")
3481
3482        if not name:
3483            raise ValueError("Prompt name cannot be empty.")
3484
3485        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3486        bounded_max_retries = self._get_bounded_max_retries(
3487            max_retries, default_max_retries=2, max_retries_upper_bound=4
3488        )
3489
3490        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3491        cached_prompt = self._resources.prompt_cache.get(cache_key)
3492
3493        if cached_prompt is None or cache_ttl_seconds == 0:
3494            langfuse_logger.debug(
3495                f"Prompt '{cache_key}' not found in cache or caching disabled."
3496            )
3497            try:
3498                return self._fetch_prompt_and_update_cache(
3499                    name,
3500                    version=version,
3501                    label=label,
3502                    ttl_seconds=cache_ttl_seconds,
3503                    max_retries=bounded_max_retries,
3504                    fetch_timeout_seconds=fetch_timeout_seconds,
3505                )
3506            except Exception as e:
3507                if fallback:
3508                    langfuse_logger.warning(
3509                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3510                    )
3511
3512                    fallback_client_args: Dict[str, Any] = {
3513                        "name": name,
3514                        "prompt": fallback,
3515                        "type": type,
3516                        "version": version or 0,
3517                        "config": {},
3518                        "labels": [label] if label else [],
3519                        "tags": [],
3520                    }
3521
3522                    if type == "text":
3523                        return TextPromptClient(
3524                            prompt=Prompt_Text(**fallback_client_args),
3525                            is_fallback=True,
3526                        )
3527
3528                    if type == "chat":
3529                        return ChatPromptClient(
3530                            prompt=Prompt_Chat(**fallback_client_args),
3531                            is_fallback=True,
3532                        )
3533
3534                raise e
3535
3536        if cached_prompt.is_expired():
3537            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3538            try:
3539                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3540                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3541
3542                def refresh_task() -> None:
3543                    self._fetch_prompt_and_update_cache(
3544                        name,
3545                        version=version,
3546                        label=label,
3547                        ttl_seconds=cache_ttl_seconds,
3548                        max_retries=bounded_max_retries,
3549                        fetch_timeout_seconds=fetch_timeout_seconds,
3550                    )
3551
3552                self._resources.prompt_cache.add_refresh_prompt_task(
3553                    cache_key,
3554                    refresh_task,
3555                )
3556                langfuse_logger.debug(
3557                    f"Returning stale prompt '{cache_key}' from cache."
3558                )
3559                # return stale prompt
3560                return cached_prompt.value
3561
3562            except Exception as e:
3563                langfuse_logger.warning(
3564                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3565                )
3566                # creation of refresh prompt task failed, return stale prompt
3567                return cached_prompt.value
3568
3569        return cached_prompt.value
3570
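`get_prompt` has no usage example in its docstring; a typical call that combines a label with a fallback for the first, uncached fetch might look like this (a sketch; the prompt name and template are illustrative):

```python
prompt = langfuse.get_prompt(
    "movie-critic",
    label="production",      # specify either label or version, not both
    cache_ttl_seconds=120,
    fallback="Summarize the movie {{title}} in one sentence.",
)

# Substitute the double-curly-brace variables before sending to the model
compiled = prompt.compile(title="Inception")
```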
3571    def _fetch_prompt_and_update_cache(
3572        self,
3573        name: str,
3574        *,
3575        version: Optional[int] = None,
3576        label: Optional[str] = None,
3577        ttl_seconds: Optional[int] = None,
3578        max_retries: int,
3579        fetch_timeout_seconds: Optional[int],
3580    ) -> PromptClient:
3581        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3582        langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...")
3583
3584        try:
3585
3586            @backoff.on_exception(
3587                backoff.constant, Exception, max_tries=max_retries + 1, logger=None
3588            )
3589            def fetch_prompts() -> Any:
3590                return self.api.prompts.get(
3591                    self._url_encode(name),
3592                    version=version,
3593                    label=label,
3594                    request_options={
3595                        "timeout_in_seconds": fetch_timeout_seconds,
3596                    }
3597                    if fetch_timeout_seconds is not None
3598                    else None,
3599                )
3600
3601            prompt_response = fetch_prompts()
3602
3603            prompt: PromptClient
3604            if prompt_response.type == "chat":
3605                prompt = ChatPromptClient(prompt_response)
3606            else:
3607                prompt = TextPromptClient(prompt_response)
3608
3609            if self._resources is not None:
3610                self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds)
3611
3612            return prompt
3613
3614        except NotFoundError as not_found_error:
3615            langfuse_logger.warning(
3616                f"Prompt '{cache_key}' not found during refresh, evicting from cache."
3617            )
3618            if self._resources is not None:
3619                self._resources.prompt_cache.delete(cache_key)
3620            raise not_found_error
3621
3622        except Exception as e:
3623            langfuse_logger.error(
3624                f"Error while fetching prompt '{cache_key}': {str(e)}"
3625            )
3626            raise e
3627
3628    def _get_bounded_max_retries(
3629        self,
3630        max_retries: Optional[int],
3631        *,
3632        default_max_retries: int = 2,
3633        max_retries_upper_bound: int = 4,
3634    ) -> int:
3635        if max_retries is None:
3636            return default_max_retries
3637
3638        bounded_max_retries = min(
3639            max(max_retries, 0),
3640            max_retries_upper_bound,
3641        )
3642
3643        return bounded_max_retries
3644
3645    @overload
3646    def create_prompt(
3647        self,
3648        *,
3649        name: str,
3650        prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]],
3651        labels: List[str] = [],
3652        tags: Optional[List[str]] = None,
3653        type: Optional[Literal["chat"]],
3654        config: Optional[Any] = None,
3655        commit_message: Optional[str] = None,
3656    ) -> ChatPromptClient: ...
3657
3658    @overload
3659    def create_prompt(
3660        self,
3661        *,
3662        name: str,
3663        prompt: str,
3664        labels: List[str] = [],
3665        tags: Optional[List[str]] = None,
3666        type: Optional[Literal["text"]] = "text",
3667        config: Optional[Any] = None,
3668        commit_message: Optional[str] = None,
3669    ) -> TextPromptClient: ...
3670
3671    def create_prompt(
3672        self,
3673        *,
3674        name: str,
3675        prompt: Union[
3676            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3677        ],
3678        labels: List[str] = [],
3679        tags: Optional[List[str]] = None,
3680        type: Optional[Literal["chat", "text"]] = "text",
3681        config: Optional[Any] = None,
3682        commit_message: Optional[str] = None,
3683    ) -> PromptClient:
3684        """Create a new prompt in Langfuse.
3685
3686        Keyword Args:
3687            name : The name of the prompt to be created.
3688            prompt : The content of the prompt to be created.
3689            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3690            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3691            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3692            config: Additional structured data to be saved with the prompt. Defaults to None.
3693            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3694            commit_message: Optional string describing the change.
3695
3696        Returns:
3697            TextPromptClient: The prompt if type argument is 'text'.
3698            ChatPromptClient: The prompt if type argument is 'chat'.
3699        """
3700        try:
3701            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3702
3703            if type == "chat":
3704                if not isinstance(prompt, list):
3705                    raise ValueError(
3706                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3707                    )
3708                request: Union[CreatePromptRequest_Chat, CreatePromptRequest_Text] = (
3709                    CreatePromptRequest_Chat(
3710                        name=name,
3711                        prompt=cast(Any, prompt),
3712                        labels=labels,
3713                        tags=tags,
3714                        config=config or {},
3715                        commitMessage=commit_message,
3716                        type="chat",
3717                    )
3718                )
3719                server_prompt = self.api.prompts.create(request=request)
3720
3721                if self._resources is not None:
3722                    self._resources.prompt_cache.invalidate(name)
3723
3724                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3725
3726            if not isinstance(prompt, str):
3727                raise ValueError("For 'text' type, 'prompt' must be a string.")
3728
3729            request = CreatePromptRequest_Text(
3730                name=name,
3731                prompt=prompt,
3732                labels=labels,
3733                tags=tags,
3734                config=config or {},
3735                commitMessage=commit_message,
3736                type="text",
3737            )
3738
3739            server_prompt = self.api.prompts.create(request=request)
3740
3741            if self._resources is not None:
3742                self._resources.prompt_cache.invalidate(name)
3743
3744            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3745
3746        except Error as e:
3747            handle_fern_exception(e)
3748            raise e
3749
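A sketch of creating a chat prompt that is served by default (message contents and config are illustrative, assuming an initialized `langfuse` client):

```python
langfuse.create_prompt(
    name="movie-critic-chat",
    type="chat",
    prompt=[
        {"role": "system", "content": "You are a concise movie critic."},
        {"role": "user", "content": "Review {{title}} in two sentences."},
    ],
    labels=["production"],  # the 'production' label makes this the default-served version
    config={"model": "gpt-4", "temperature": 0.7},
    commit_message="Initial version",
)
```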
3750    def update_prompt(
3751        self,
3752        *,
3753        name: str,
3754        version: int,
3755        new_labels: List[str] = [],
3756    ) -> Any:
3757        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
3758
3759        Args:
3760            name (str): The name of the prompt to update.
3761            version (int): The version number of the prompt to update.
3762            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3763
3764        Returns:
3765            Prompt: The updated prompt from the Langfuse API.
3766
3767        """
3768        updated_prompt = self.api.prompt_version.update(
3769            name=self._url_encode(name),
3770            version=version,
3771            new_labels=new_labels,
3772        )
3773
3774        if self._resources is not None:
3775            self._resources.prompt_cache.invalidate(name)
3776
3777        return updated_prompt
3778
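For example, promoting an existing version to be the default-served one (a sketch; name and version are illustrative):

```python
langfuse.update_prompt(
    name="movie-critic-chat",
    version=3,
    new_labels=["production"],  # 'latest' is reserved and managed by Langfuse
)
```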
3779    def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str:
3780        # httpx ≥ 0.28 does its own WHATWG-compliant quoting (e.g. encodes bare
3781        # "%", "?", "#", "|", … in query/path parts). Re-quoting here would
3782        # double-encode, so we skip when the value is about to be sent straight
3783        # to httpx (`is_url_param=True`) and the installed version is ≥ 0.28.
3784        if is_url_param and Version(httpx.__version__) >= Version("0.28.0"):
3785            return url
3786
3787        # urllib.parse.quote does not escape slashes "/" by default;
3788        # we add safe="" to force escaping of slashes as well.
3789        # This is necessary for prompts in prompt folders.
3790        return urllib.parse.quote(url, safe="")
3791
3792    def clear_prompt_cache(self) -> None:
3793        """Clear the entire prompt cache, removing all cached prompts.
3794
3795        This method is useful when you want to force a complete refresh of all
3796        cached prompts, for example after major updates or when you need to
3797        ensure the latest versions are fetched from the server.
3798        """
3799        if self._resources is not None:
3800            self._resources.prompt_cache.clear()

Main client for Langfuse tracing and platform features.

This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.

The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.

Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.

Attributes:
  • api: Synchronous API client for Langfuse backend communication
  • async_api: Asynchronous API client for Langfuse backend communication
  • _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
  • public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
  • secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
  • base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
  • host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
  • timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
  • httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
  • debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
  • tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
  • flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
  • flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
  • environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
  • release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
  • media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
  • sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
  • mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
  • blocked_instrumentation_scopes (Optional[List[str]]): List of instrumentation scope names to block from being exported to Langfuse. Spans from these scopes will be filtered out before being sent to the API. Useful for filtering out spans from specific libraries or frameworks. For exported spans, you can see the instrumentation scope name in the span metadata in Langfuse (metadata.scope.name)
  • additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well.
  • tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. Setting this is useful for keeping Langfuse tracing disconnected from other OpenTelemetry-span-emitting libraries. Note: to track active spans, the context is still shared between TracerProviders, which may lead to broken trace trees.
Example:
from langfuse import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    host="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_span(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
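
As an alternative to passing keys explicitly, the client can be configured entirely through environment variables; the sketch below assumes LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY and (optionally) LANGFUSE_BASE_URL are exported, and flushes buffered spans before the process exits.

```python
from langfuse import Langfuse

# Assumes LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY and (optionally)
# LANGFUSE_BASE_URL are set in the environment.
langfuse = Langfuse(
    flush_at=64,         # send batches of up to 64 spans
    flush_interval=2.0,  # or every 2 seconds, whichever comes first
    sample_rate=0.25,    # sample 25% of traces
)

with langfuse.start_as_current_span(name="startup-check") as span:
    span.update(output="ok")

# Flush buffered spans before the process exits (useful in short-lived scripts).
langfuse.flush()
```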
Langfuse( *, public_key: Optional[str] = None, secret_key: Optional[str] = None, base_url: Optional[str] = None, host: Optional[str] = None, timeout: Optional[int] = None, httpx_client: Optional[httpx.Client] = None, debug: bool = False, tracing_enabled: Optional[bool] = True, flush_at: Optional[int] = None, flush_interval: Optional[float] = None, environment: Optional[str] = None, release: Optional[str] = None, media_upload_thread_count: Optional[int] = None, sample_rate: Optional[float] = None, mask: Optional[langfuse.types.MaskFunction] = None, blocked_instrumentation_scopes: Optional[List[str]] = None, additional_headers: Optional[Dict[str, str]] = None, tracer_provider: Optional[opentelemetry.sdk.trace.TracerProvider] = None)
208    def __init__(
209        self,
210        *,
211        public_key: Optional[str] = None,
212        secret_key: Optional[str] = None,
213        base_url: Optional[str] = None,
214        host: Optional[str] = None,
215        timeout: Optional[int] = None,
216        httpx_client: Optional[httpx.Client] = None,
217        debug: bool = False,
218        tracing_enabled: Optional[bool] = True,
219        flush_at: Optional[int] = None,
220        flush_interval: Optional[float] = None,
221        environment: Optional[str] = None,
222        release: Optional[str] = None,
223        media_upload_thread_count: Optional[int] = None,
224        sample_rate: Optional[float] = None,
225        mask: Optional[MaskFunction] = None,
226        blocked_instrumentation_scopes: Optional[List[str]] = None,
227        additional_headers: Optional[Dict[str, str]] = None,
228        tracer_provider: Optional[TracerProvider] = None,
229    ):
230        self._base_url = (
231            base_url
232            or os.environ.get(LANGFUSE_BASE_URL)
233            or host
234            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
235        )
236        self._environment = environment or cast(
237            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
238        )
239        self._project_id: Optional[str] = None
240        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
241        if not 0.0 <= sample_rate <= 1.0:
242            raise ValueError(
243                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
244            )
245
246        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
247
248        self._tracing_enabled = (
249            tracing_enabled
250            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
251        )
252        if not self._tracing_enabled:
253            langfuse_logger.info(
254                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
255            )
256
257        debug = (
258            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
259        )
260        if debug:
261            logging.basicConfig(
262                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
263            )
264            langfuse_logger.setLevel(logging.DEBUG)
265
266        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
267        if public_key is None:
268            langfuse_logger.warning(
269                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
270                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
271            )
272            self._otel_tracer = otel_trace_api.NoOpTracer()
273            return
274
275        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
276        if secret_key is None:
277            langfuse_logger.warning(
278                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
279                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
280            )
281            self._otel_tracer = otel_trace_api.NoOpTracer()
282            return
283
284        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
285            langfuse_logger.warning(
286                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
287            )
288
289        # Initialize api and tracer if requirements are met
290        self._resources = LangfuseResourceManager(
291            public_key=public_key,
292            secret_key=secret_key,
293            base_url=self._base_url,
294            timeout=timeout,
295            environment=self._environment,
296            release=release,
297            flush_at=flush_at,
298            flush_interval=flush_interval,
299            httpx_client=httpx_client,
300            media_upload_thread_count=media_upload_thread_count,
301            sample_rate=sample_rate,
302            mask=mask,
303            tracing_enabled=self._tracing_enabled,
304            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
305            additional_headers=additional_headers,
306            tracer_provider=tracer_provider,
307        )
308        self._mask = self._resources.mask
309
310        self._otel_tracer = (
311            self._resources.tracer
312            if self._tracing_enabled and self._resources.tracer is not None
313            else otel_trace_api.NoOpTracer()
314        )
315        self.api = self._resources.api
316        self.async_api = self._resources.async_api
api
async_api
def start_span( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseSpan:
318    def start_span(
319        self,
320        *,
321        trace_context: Optional[TraceContext] = None,
322        name: str,
323        input: Optional[Any] = None,
324        output: Optional[Any] = None,
325        metadata: Optional[Any] = None,
326        version: Optional[str] = None,
327        level: Optional[SpanLevel] = None,
328        status_message: Optional[str] = None,
329    ) -> LangfuseSpan:
330        """Create a new span for tracing a unit of work.
331
332        This method creates a new span but does not set it as the current span in the
333        context. To create and use a span within a context, use start_as_current_span().
334
335        The created span will be the child of the current span in the context.
336
337        Args:
338            trace_context: Optional context for connecting to an existing trace
339            name: Name of the span (e.g., function or operation name)
340            input: Input data for the operation (can be any JSON-serializable object)
341            output: Output data from the operation (can be any JSON-serializable object)
342            metadata: Additional metadata to associate with the span
343            version: Version identifier for the code or component
344            level: Importance level of the span (info, warning, error)
345            status_message: Optional status message for the span
346
347        Returns:
348            A LangfuseSpan object that must be ended with .end() when the operation completes
349
350        Example:
351            ```python
352            span = langfuse.start_span(name="process-data")
353            try:
354                # Do work
355                span.update(output="result")
356            finally:
357                span.end()
358            ```
359        """
360        return self.start_observation(
361            trace_context=trace_context,
362            name=name,
363            as_type="span",
364            input=input,
365            output=output,
366            metadata=metadata,
367            version=version,
368            level=level,
369            status_message=status_message,
370        )

Create a new span for tracing a unit of work.

This method creates a new span but does not set it as the current span in the context. To create and use a span within a context, use start_as_current_span().

The created span will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
Returns:

A LangfuseSpan object that must be ended with .end() when the operation completes

Example:
span = langfuse.start_span(name="process-data")
try:
    # Do work
    span.update(output="result")
finally:
    span.end()
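
A slightly fuller sketch of the manual lifecycle, assuming the hypothetical process_data helper from the example above and that span.update() accepts level and status_message like the creation parameters:

```python
span = langfuse.start_span(
    name="process-data",
    input={"batch_id": 42},  # hypothetical input payload
)
try:
    result = process_data()  # hypothetical helper, as in the example above
    span.update(output=result)
except Exception as e:
    # Record the failure on the span before re-raising.
    span.update(level="ERROR", status_message=str(e))
    raise
finally:
    span.end()
```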
def start_as_current_span( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, end_on_exit: Optional[bool] = None) -> opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan]:
372    def start_as_current_span(
373        self,
374        *,
375        trace_context: Optional[TraceContext] = None,
376        name: str,
377        input: Optional[Any] = None,
378        output: Optional[Any] = None,
379        metadata: Optional[Any] = None,
380        version: Optional[str] = None,
381        level: Optional[SpanLevel] = None,
382        status_message: Optional[str] = None,
383        end_on_exit: Optional[bool] = None,
384    ) -> _AgnosticContextManager[LangfuseSpan]:
385        """Create a new span and set it as the current span in a context manager.
386
387        This method creates a new span and sets it as the current span within a context
388        manager. Use this method with a 'with' statement to automatically handle span
389        lifecycle within a code block.
390
391        The created span will be the child of the current span in the context.
392
393        Args:
394            trace_context: Optional context for connecting to an existing trace
395            name: Name of the span (e.g., function or operation name)
396            input: Input data for the operation (can be any JSON-serializable object)
397            output: Output data from the operation (can be any JSON-serializable object)
398            metadata: Additional metadata to associate with the span
399            version: Version identifier for the code or component
400            level: Importance level of the span (info, warning, error)
401            status_message: Optional status message for the span
402            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
403
404        Returns:
405            A context manager that yields a LangfuseSpan
406
407        Example:
408            ```python
409            with langfuse.start_as_current_span(name="process-query") as span:
410                # Do work
411                result = process_data()
412                span.update(output=result)
413
414                # Create a child span automatically
415                with span.start_as_current_span(name="sub-operation") as child_span:
416                    # Do sub-operation work
417                    child_span.update(output="sub-result")
418            ```
419        """
420        return self.start_as_current_observation(
421            trace_context=trace_context,
422            name=name,
423            as_type="span",
424            input=input,
425            output=output,
426            metadata=metadata,
427            version=version,
428            level=level,
429            status_message=status_message,
430            end_on_exit=end_on_exit,
431        )

Create a new span and set it as the current span in a context manager.

This method creates a new span and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle span lifecycle within a code block.

The created span will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
Returns:

A context manager that yields a LangfuseSpan

Example:
with langfuse.start_as_current_span(name="process-query") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_span(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")
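
A span can also be attached to an existing trace via trace_context; a minimal sketch, assuming trace_id and parent_span_id values propagated from an upstream service:

```python
# Hypothetical identifiers received from an upstream service.
incoming_trace_id = "0123456789abcdef0123456789abcdef"
incoming_span_id = "0123456789abcdef"

with langfuse.start_as_current_span(
    name="handle-request",
    trace_context={
        "trace_id": incoming_trace_id,
        "parent_span_id": incoming_span_id,
    },
) as span:
    span.update(output="handled")
```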
def start_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> Union[LangfuseSpan, LangfuseGeneration, LangfuseAgent, LangfuseTool, LangfuseChain, LangfuseRetriever, LangfuseEvaluator, LangfuseEmbedding, LangfuseGuardrail]:
580    def start_observation(
581        self,
582        *,
583        trace_context: Optional[TraceContext] = None,
584        name: str,
585        as_type: ObservationTypeLiteralNoEvent = "span",
586        input: Optional[Any] = None,
587        output: Optional[Any] = None,
588        metadata: Optional[Any] = None,
589        version: Optional[str] = None,
590        level: Optional[SpanLevel] = None,
591        status_message: Optional[str] = None,
592        completion_start_time: Optional[datetime] = None,
593        model: Optional[str] = None,
594        model_parameters: Optional[Dict[str, MapValue]] = None,
595        usage_details: Optional[Dict[str, int]] = None,
596        cost_details: Optional[Dict[str, float]] = None,
597        prompt: Optional[PromptClient] = None,
598    ) -> Union[
599        LangfuseSpan,
600        LangfuseGeneration,
601        LangfuseAgent,
602        LangfuseTool,
603        LangfuseChain,
604        LangfuseRetriever,
605        LangfuseEvaluator,
606        LangfuseEmbedding,
607        LangfuseGuardrail,
608    ]:
609        """Create a new observation of the specified type.
610
611        This method creates a new observation but does not set it as the current span in the
612        context. To create and use an observation within a context, use start_as_current_observation().
613
614        Args:
615            trace_context: Optional context for connecting to an existing trace
616            name: Name of the observation
617            as_type: Type of observation to create (defaults to "span")
618            input: Input data for the operation
619            output: Output data from the operation
620            metadata: Additional metadata to associate with the observation
621            version: Version identifier for the code or component
622            level: Importance level of the observation
623            status_message: Optional status message for the observation
624            completion_start_time: When the model started generating (for generation types)
625            model: Name/identifier of the AI model used (for generation types)
626            model_parameters: Parameters used for the model (for generation types)
627            usage_details: Token usage information (for generation types)
628            cost_details: Cost information (for generation types)
629            prompt: Associated prompt template (for generation types)
630
631        Returns:
632            An observation object of the appropriate type that must be ended with .end()
633        """
634        if trace_context:
635            trace_id = trace_context.get("trace_id", None)
636            parent_span_id = trace_context.get("parent_span_id", None)
637
638            if trace_id:
639                remote_parent_span = self._create_remote_parent_span(
640                    trace_id=trace_id, parent_span_id=parent_span_id
641                )
642
643                with otel_trace_api.use_span(
644                    cast(otel_trace_api.Span, remote_parent_span)
645                ):
646                    otel_span = self._otel_tracer.start_span(name=name)
647                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
648
649                    return self._create_observation_from_otel_span(
650                        otel_span=otel_span,
651                        as_type=as_type,
652                        input=input,
653                        output=output,
654                        metadata=metadata,
655                        version=version,
656                        level=level,
657                        status_message=status_message,
658                        completion_start_time=completion_start_time,
659                        model=model,
660                        model_parameters=model_parameters,
661                        usage_details=usage_details,
662                        cost_details=cost_details,
663                        prompt=prompt,
664                    )
665
666        otel_span = self._otel_tracer.start_span(name=name)
667
668        return self._create_observation_from_otel_span(
669            otel_span=otel_span,
670            as_type=as_type,
671            input=input,
672            output=output,
673            metadata=metadata,
674            version=version,
675            level=level,
676            status_message=status_message,
677            completion_start_time=completion_start_time,
678            model=model,
679            model_parameters=model_parameters,
680            usage_details=usage_details,
681            cost_details=cost_details,
682            prompt=prompt,
683        )

Create a new observation of the specified type.

This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation
  • status_message: Optional status message for the observation
  • completion_start_time: When the model started generating (for generation types)
  • model: Name/identifier of the AI model used (for generation types)
  • model_parameters: Parameters used for the model (for generation types)
  • usage_details: Token usage information (for generation types)
  • cost_details: Cost information (for generation types)
  • prompt: Associated prompt template (for generation types)
Returns:

An observation object of the appropriate type that must be ended with .end()

def start_generation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> LangfuseGeneration:
755    def start_generation(
756        self,
757        *,
758        trace_context: Optional[TraceContext] = None,
759        name: str,
760        input: Optional[Any] = None,
761        output: Optional[Any] = None,
762        metadata: Optional[Any] = None,
763        version: Optional[str] = None,
764        level: Optional[SpanLevel] = None,
765        status_message: Optional[str] = None,
766        completion_start_time: Optional[datetime] = None,
767        model: Optional[str] = None,
768        model_parameters: Optional[Dict[str, MapValue]] = None,
769        usage_details: Optional[Dict[str, int]] = None,
770        cost_details: Optional[Dict[str, float]] = None,
771        prompt: Optional[PromptClient] = None,
772    ) -> LangfuseGeneration:
773        """Create a new generation span for model generations.
774
775        DEPRECATED: This method is deprecated and will be removed in a future version.
776        Use start_observation(as_type='generation') instead.
777
778        This method creates a specialized span for tracking model generations.
779        It includes additional fields specific to model generations such as model name,
780        token usage, and cost details.
781
782        The created generation span will be the child of the current span in the context.
783
784        Args:
785            trace_context: Optional context for connecting to an existing trace
786            name: Name of the generation operation
787            input: Input data for the model (e.g., prompts)
788            output: Output from the model (e.g., completions)
789            metadata: Additional metadata to associate with the generation
790            version: Version identifier for the model or component
791            level: Importance level of the generation (info, warning, error)
792            status_message: Optional status message for the generation
793            completion_start_time: When the model started generating the response
794            model: Name/identifier of the AI model used (e.g., "gpt-4")
795            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
796            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
797            cost_details: Cost information for the model call
798            prompt: Associated prompt template from Langfuse prompt management
799
800        Returns:
801            A LangfuseGeneration object that must be ended with .end() when complete
802
803        Example:
804            ```python
805            generation = langfuse.start_generation(
806                name="answer-generation",
807                model="gpt-4",
808                input={"prompt": "Explain quantum computing"},
809                model_parameters={"temperature": 0.7}
810            )
811            try:
812                # Call model API
813                response = llm.generate(...)
814
815                generation.update(
816                    output=response.text,
817                    usage_details={
818                        "prompt_tokens": response.usage.prompt_tokens,
819                        "completion_tokens": response.usage.completion_tokens
820                    }
821                )
822            finally:
823                generation.end()
824            ```
825        """
826        warnings.warn(
827            "start_generation is deprecated and will be removed in a future version. "
828            "Use start_observation(as_type='generation') instead.",
829            DeprecationWarning,
830            stacklevel=2,
831        )
832        return self.start_observation(
833            trace_context=trace_context,
834            name=name,
835            as_type="generation",
836            input=input,
837            output=output,
838            metadata=metadata,
839            version=version,
840            level=level,
841            status_message=status_message,
842            completion_start_time=completion_start_time,
843            model=model,
844            model_parameters=model_parameters,
845            usage_details=usage_details,
846            cost_details=cost_details,
847            prompt=prompt,
848        )

Create a new generation span for model generations.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_observation(as_type='generation') instead.

This method creates a specialized span for tracking model generations. It includes additional fields specific to model generations such as model name, token usage, and cost details.

The created generation span will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the generation operation
  • input: Input data for the model (e.g., prompts)
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (info, warning, error)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A LangfuseGeneration object that must be ended with .end() when complete

Example:
generation = langfuse.start_generation(
    name="answer-generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"},
    model_parameters={"temperature": 0.7}
)
try:
    # Call model API
    response = llm.generate(...)

    generation.update(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
finally:
    generation.end()
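
Since this method is deprecated, the same flow can be written with start_observation(as_type="generation"); a sketch under the same assumptions as the example above (hypothetical llm client):

```python
generation = langfuse.start_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"},
    model_parameters={"temperature": 0.7},
)
try:
    response = llm.generate(...)  # hypothetical model call, as above
    generation.update(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens,
        },
    )
finally:
    generation.end()
```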
def start_as_current_generation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, end_on_exit: Optional[bool] = None) -> opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration]:
850    def start_as_current_generation(
851        self,
852        *,
853        trace_context: Optional[TraceContext] = None,
854        name: str,
855        input: Optional[Any] = None,
856        output: Optional[Any] = None,
857        metadata: Optional[Any] = None,
858        version: Optional[str] = None,
859        level: Optional[SpanLevel] = None,
860        status_message: Optional[str] = None,
861        completion_start_time: Optional[datetime] = None,
862        model: Optional[str] = None,
863        model_parameters: Optional[Dict[str, MapValue]] = None,
864        usage_details: Optional[Dict[str, int]] = None,
865        cost_details: Optional[Dict[str, float]] = None,
866        prompt: Optional[PromptClient] = None,
867        end_on_exit: Optional[bool] = None,
868    ) -> _AgnosticContextManager[LangfuseGeneration]:
869        """Create a new generation span and set it as the current span in a context manager.
870
871        DEPRECATED: This method is deprecated and will be removed in a future version.
872        Use start_as_current_observation(as_type='generation') instead.
873
874        This method creates a specialized span for model generations and sets it as the
875        current span within a context manager. Use this method with a 'with' statement to
876        automatically handle the generation span lifecycle within a code block.
877
878        The created generation span will be the child of the current span in the context.
879
880        Args:
881            trace_context: Optional context for connecting to an existing trace
882            name: Name of the generation operation
883            input: Input data for the model (e.g., prompts)
884            output: Output from the model (e.g., completions)
885            metadata: Additional metadata to associate with the generation
886            version: Version identifier for the model or component
887            level: Importance level of the generation (info, warning, error)
888            status_message: Optional status message for the generation
889            completion_start_time: When the model started generating the response
890            model: Name/identifier of the AI model used (e.g., "gpt-4")
891            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
892            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
893            cost_details: Cost information for the model call
894            prompt: Associated prompt template from Langfuse prompt management
895            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
896
897        Returns:
898            A context manager that yields a LangfuseGeneration
899
900        Example:
901            ```python
902            with langfuse.start_as_current_generation(
903                name="answer-generation",
904                model="gpt-4",
905                input={"prompt": "Explain quantum computing"}
906            ) as generation:
907                # Call model API
908                response = llm.generate(...)
909
910                # Update with results
911                generation.update(
912                    output=response.text,
913                    usage_details={
914                        "prompt_tokens": response.usage.prompt_tokens,
915                        "completion_tokens": response.usage.completion_tokens
916                    }
917                )
918            ```
919        """
920        warnings.warn(
921            "start_as_current_generation is deprecated and will be removed in a future version. "
922            "Use start_as_current_observation(as_type='generation') instead.",
923            DeprecationWarning,
924            stacklevel=2,
925        )
926        return self.start_as_current_observation(
927            trace_context=trace_context,
928            name=name,
929            as_type="generation",
930            input=input,
931            output=output,
932            metadata=metadata,
933            version=version,
934            level=level,
935            status_message=status_message,
936            completion_start_time=completion_start_time,
937            model=model,
938            model_parameters=model_parameters,
939            usage_details=usage_details,
940            cost_details=cost_details,
941            prompt=prompt,
942            end_on_exit=end_on_exit,
943        )

Create a new generation span and set it as the current span in a context manager.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_as_current_observation(as_type='generation') instead.

This method creates a specialized span for model generations and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the generation span lifecycle within a code block.

The created generation span will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the generation operation
  • input: Input data for the model (e.g., prompts)
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (info, warning, error)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
Returns:

A context manager that yields a LangfuseGeneration

Example:
with langfuse.start_as_current_generation(
    name="answer-generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"}
) as generation:
    # Call model API
    response = llm.generate(...)

    # Update with results
    generation.update(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
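
The prompt parameter links the observation to Langfuse prompt management; a sketch using the non-deprecated start_as_current_observation, assuming a text prompt named "qa-answer" exists and is fetched with get_prompt (hypothetical prompt name and llm client):

```python
prompt = langfuse.get_prompt("qa-answer")  # assumes this prompt exists in Langfuse

with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4",
    prompt=prompt,  # ties the generation to the exact prompt version used
    input={"prompt": prompt.compile(question="What is Langfuse?")},
) as generation:
    response = llm.generate(...)  # hypothetical model call, as above
    generation.update(output=response.text)
```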
def start_as_current_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, end_on_exit: Optional[bool] = None) -> Union[opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration], opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan], opentelemetry.util._decorator._AgnosticContextManager[LangfuseAgent], opentelemetry.util._decorator._AgnosticContextManager[LangfuseTool], opentelemetry.util._decorator._AgnosticContextManager[LangfuseChain], opentelemetry.util._decorator._AgnosticContextManager[LangfuseRetriever], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEvaluator], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEmbedding], opentelemetry.util._decorator._AgnosticContextManager[LangfuseGuardrail]]:
1101    def start_as_current_observation(
1102        self,
1103        *,
1104        trace_context: Optional[TraceContext] = None,
1105        name: str,
1106        as_type: ObservationTypeLiteralNoEvent = "span",
1107        input: Optional[Any] = None,
1108        output: Optional[Any] = None,
1109        metadata: Optional[Any] = None,
1110        version: Optional[str] = None,
1111        level: Optional[SpanLevel] = None,
1112        status_message: Optional[str] = None,
1113        completion_start_time: Optional[datetime] = None,
1114        model: Optional[str] = None,
1115        model_parameters: Optional[Dict[str, MapValue]] = None,
1116        usage_details: Optional[Dict[str, int]] = None,
1117        cost_details: Optional[Dict[str, float]] = None,
1118        prompt: Optional[PromptClient] = None,
1119        end_on_exit: Optional[bool] = None,
1120    ) -> Union[
1121        _AgnosticContextManager[LangfuseGeneration],
1122        _AgnosticContextManager[LangfuseSpan],
1123        _AgnosticContextManager[LangfuseAgent],
1124        _AgnosticContextManager[LangfuseTool],
1125        _AgnosticContextManager[LangfuseChain],
1126        _AgnosticContextManager[LangfuseRetriever],
1127        _AgnosticContextManager[LangfuseEvaluator],
1128        _AgnosticContextManager[LangfuseEmbedding],
1129        _AgnosticContextManager[LangfuseGuardrail],
1130    ]:
1131        """Create a new observation and set it as the current span in a context manager.
1132
1133        This method creates a new observation of the specified type and sets it as the
1134        current span within a context manager. Use this method with a 'with' statement to
1135        automatically handle the observation lifecycle within a code block.
1136
1137        The created observation will be the child of the current span in the context.
1138
1139        Args:
1140            trace_context: Optional context for connecting to an existing trace
1141            name: Name of the observation (e.g., function or operation name)
1142            as_type: Type of observation to create (defaults to "span")
1143            input: Input data for the operation (can be any JSON-serializable object)
1144            output: Output data from the operation (can be any JSON-serializable object)
1145            metadata: Additional metadata to associate with the observation
1146            version: Version identifier for the code or component
1147            level: Importance level of the observation (info, warning, error)
1148            status_message: Optional status message for the observation
1149            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
1150
1151            The following parameters are available when as_type is: "generation" or "embedding".
1152            completion_start_time: When the model started generating the response
1153            model: Name/identifier of the AI model used (e.g., "gpt-4")
1154            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1155            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1156            cost_details: Cost information for the model call
1157            prompt: Associated prompt template from Langfuse prompt management
1158
1159        Returns:
1160            A context manager that yields the appropriate observation type based on as_type
1161
1162        Example:
1163            ```python
1164            # Create a span
1165            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
1166                # Do work
1167                result = process_data()
1168                span.update(output=result)
1169
1170                # Create a child span automatically
1171                with span.start_as_current_span(name="sub-operation") as child_span:
1172                    # Do sub-operation work
1173                    child_span.update(output="sub-result")
1174
1175            # Create a tool observation
1176            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
1177                # Do tool work
1178                results = search_web(query)
1179                tool.update(output=results)
1180
1181            # Create a generation observation
1182            with langfuse.start_as_current_observation(
1183                name="answer-generation",
1184                as_type="generation",
1185                model="gpt-4"
1186            ) as generation:
1187                # Generate answer
1188                response = llm.generate(...)
1189                generation.update(output=response)
1190            ```
1191        """
1192        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
1193            if trace_context:
1194                trace_id = trace_context.get("trace_id", None)
1195                parent_span_id = trace_context.get("parent_span_id", None)
1196
1197                if trace_id:
1198                    remote_parent_span = self._create_remote_parent_span(
1199                        trace_id=trace_id, parent_span_id=parent_span_id
1200                    )
1201
1202                    return cast(
1203                        Union[
1204                            _AgnosticContextManager[LangfuseGeneration],
1205                            _AgnosticContextManager[LangfuseEmbedding],
1206                        ],
1207                        self._create_span_with_parent_context(
1208                            as_type=as_type,
1209                            name=name,
1210                            remote_parent_span=remote_parent_span,
1211                            parent=None,
1212                            end_on_exit=end_on_exit,
1213                            input=input,
1214                            output=output,
1215                            metadata=metadata,
1216                            version=version,
1217                            level=level,
1218                            status_message=status_message,
1219                            completion_start_time=completion_start_time,
1220                            model=model,
1221                            model_parameters=model_parameters,
1222                            usage_details=usage_details,
1223                            cost_details=cost_details,
1224                            prompt=prompt,
1225                        ),
1226                    )
1227
1228            return cast(
1229                Union[
1230                    _AgnosticContextManager[LangfuseGeneration],
1231                    _AgnosticContextManager[LangfuseEmbedding],
1232                ],
1233                self._start_as_current_otel_span_with_processed_media(
1234                    as_type=as_type,
1235                    name=name,
1236                    end_on_exit=end_on_exit,
1237                    input=input,
1238                    output=output,
1239                    metadata=metadata,
1240                    version=version,
1241                    level=level,
1242                    status_message=status_message,
1243                    completion_start_time=completion_start_time,
1244                    model=model,
1245                    model_parameters=model_parameters,
1246                    usage_details=usage_details,
1247                    cost_details=cost_details,
1248                    prompt=prompt,
1249                ),
1250            )
1251
1252        if as_type in get_observation_types_list(ObservationTypeSpanLike):
1253            if trace_context:
1254                trace_id = trace_context.get("trace_id", None)
1255                parent_span_id = trace_context.get("parent_span_id", None)
1256
1257                if trace_id:
1258                    remote_parent_span = self._create_remote_parent_span(
1259                        trace_id=trace_id, parent_span_id=parent_span_id
1260                    )
1261
1262                    return cast(
1263                        Union[
1264                            _AgnosticContextManager[LangfuseSpan],
1265                            _AgnosticContextManager[LangfuseAgent],
1266                            _AgnosticContextManager[LangfuseTool],
1267                            _AgnosticContextManager[LangfuseChain],
1268                            _AgnosticContextManager[LangfuseRetriever],
1269                            _AgnosticContextManager[LangfuseEvaluator],
1270                            _AgnosticContextManager[LangfuseGuardrail],
1271                        ],
1272                        self._create_span_with_parent_context(
1273                            as_type=as_type,
1274                            name=name,
1275                            remote_parent_span=remote_parent_span,
1276                            parent=None,
1277                            end_on_exit=end_on_exit,
1278                            input=input,
1279                            output=output,
1280                            metadata=metadata,
1281                            version=version,
1282                            level=level,
1283                            status_message=status_message,
1284                        ),
1285                    )
1286
1287            return cast(
1288                Union[
1289                    _AgnosticContextManager[LangfuseSpan],
1290                    _AgnosticContextManager[LangfuseAgent],
1291                    _AgnosticContextManager[LangfuseTool],
1292                    _AgnosticContextManager[LangfuseChain],
1293                    _AgnosticContextManager[LangfuseRetriever],
1294                    _AgnosticContextManager[LangfuseEvaluator],
1295                    _AgnosticContextManager[LangfuseGuardrail],
1296                ],
1297                self._start_as_current_otel_span_with_processed_media(
1298                    as_type=as_type,
1299                    name=name,
1300                    end_on_exit=end_on_exit,
1301                    input=input,
1302                    output=output,
1303                    metadata=metadata,
1304                    version=version,
1305                    level=level,
1306                    status_message=status_message,
1307                ),
1308            )
1309
1310        # This should never be reached since all valid types are handled above
1311        langfuse_logger.warning(
1312            f"Unknown observation type: {as_type}, falling back to span"
1313        )
1314        return self._start_as_current_otel_span_with_processed_media(
1315            as_type="span",
1316            name=name,
1317            end_on_exit=end_on_exit,
1318            input=input,
1319            output=output,
1320            metadata=metadata,
1321            version=version,
1322            level=level,
1323            status_message=status_message,
1324        )

Create a new observation and set it as the current span in a context manager.

This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.

The created observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation (e.g., function or operation name)
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation (info, warning, error)
  • status_message: Optional status message for the observation
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
  • The following parameters apply only when as_type is "generation" or "embedding":
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A context manager that yields the appropriate observation type based on as_type

Example:
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_span(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
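
The generation-specific parameters also apply to as_type="embedding"; a sketch, assuming a hypothetical embed_texts helper that returns vectors and a token count:

```python
texts = ["first document", "second document"]

with langfuse.start_as_current_observation(
    name="embed-documents",
    as_type="embedding",
    model="text-embedding-3-small",
    input={"texts": texts},
) as embedding:
    vectors, tokens_used = embed_texts(texts)  # hypothetical helper
    embedding.update(
        output={"count": len(vectors), "dimensions": len(vectors[0])},
        usage_details={"input_tokens": tokens_used},
    )
```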
def update_current_generation( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> None:
1485    def update_current_generation(
1486        self,
1487        *,
1488        name: Optional[str] = None,
1489        input: Optional[Any] = None,
1490        output: Optional[Any] = None,
1491        metadata: Optional[Any] = None,
1492        version: Optional[str] = None,
1493        level: Optional[SpanLevel] = None,
1494        status_message: Optional[str] = None,
1495        completion_start_time: Optional[datetime] = None,
1496        model: Optional[str] = None,
1497        model_parameters: Optional[Dict[str, MapValue]] = None,
1498        usage_details: Optional[Dict[str, int]] = None,
1499        cost_details: Optional[Dict[str, float]] = None,
1500        prompt: Optional[PromptClient] = None,
1501    ) -> None:
1502        """Update the current active generation span with new information.
1503
1504        This method updates the current generation span in the active context with
1505        additional information. It's useful for adding output, usage stats, or other
1506        details that become available during or after model generation.
1507
1508        Args:
1509            name: The generation name
1510            input: Updated input data for the model
1511            output: Output from the model (e.g., completions)
1512            metadata: Additional metadata to associate with the generation
1513            version: Version identifier for the model or component
1514            level: Importance level of the generation (info, warning, error)
1515            status_message: Optional status message for the generation
1516            completion_start_time: When the model started generating the response
1517            model: Name/identifier of the AI model used (e.g., "gpt-4")
1518            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1519            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1520            cost_details: Cost information for the model call
1521            prompt: Associated prompt template from Langfuse prompt management
1522
1523        Example:
1524            ```python
1525            with langfuse.start_as_current_generation(name="answer-query") as generation:
1526                # Initial setup and API call
1527                response = llm.generate(...)
1528
1529                # Update with results that weren't available at creation time
1530                langfuse.update_current_generation(
1531                    output=response.text,
1532                    usage_details={
1533                        "prompt_tokens": response.usage.prompt_tokens,
1534                        "completion_tokens": response.usage.completion_tokens
1535                    }
1536                )
1537            ```
1538        """
1539        if not self._tracing_enabled:
1540            langfuse_logger.debug(
1541                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1542            )
1543            return
1544
1545        current_otel_span = self._get_current_otel_span()
1546
1547        if current_otel_span is not None:
1548            generation = LangfuseGeneration(
1549                otel_span=current_otel_span, langfuse_client=self
1550            )
1551
1552            if name:
1553                current_otel_span.update_name(name)
1554
1555            generation.update(
1556                input=input,
1557                output=output,
1558                metadata=metadata,
1559                version=version,
1560                level=level,
1561                status_message=status_message,
1562                completion_start_time=completion_start_time,
1563                model=model,
1564                model_parameters=model_parameters,
1565                usage_details=usage_details,
1566                cost_details=cost_details,
1567                prompt=prompt,
1568            )

Update the current active generation span with new information.

This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.

Arguments:
  • name: The generation name
  • input: Updated input data for the model
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
def update_current_span( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> None:
1570    def update_current_span(
1571        self,
1572        *,
1573        name: Optional[str] = None,
1574        input: Optional[Any] = None,
1575        output: Optional[Any] = None,
1576        metadata: Optional[Any] = None,
1577        version: Optional[str] = None,
1578        level: Optional[SpanLevel] = None,
1579        status_message: Optional[str] = None,
1580    ) -> None:
1581        """Update the current active span with new information.
1582
1583        This method updates the current span in the active context with
1584        additional information. It's useful for adding outputs or metadata
1585        that become available during execution.
1586
1587        Args:
1588            name: The span name
1589            input: Updated input data for the operation
1590            output: Output data from the operation
1591            metadata: Additional metadata to associate with the span
1592            version: Version identifier for the code or component
1593            level: Importance level of the span (info, warning, error)
1594            status_message: Optional status message for the span
1595
1596        Example:
1597            ```python
1598            with langfuse.start_as_current_span(name="process-data") as span:
1599                # Initial processing
1600                result = process_first_part()
1601
1602                # Update with intermediate results
1603                langfuse.update_current_span(metadata={"intermediate_result": result})
1604
1605                # Continue processing
1606                final_result = process_second_part(result)
1607
1608                # Final update
1609                langfuse.update_current_span(output=final_result)
1610            ```
1611        """
1612        if not self._tracing_enabled:
1613            langfuse_logger.debug(
1614                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1615            )
1616            return
1617
1618        current_otel_span = self._get_current_otel_span()
1619
1620        if current_otel_span is not None:
1621            span = LangfuseSpan(
1622                otel_span=current_otel_span,
1623                langfuse_client=self,
1624                environment=self._environment,
1625            )
1626
1627            if name:
1628                current_otel_span.update_name(name)
1629
1630            span.update(
1631                input=input,
1632                output=output,
1633                metadata=metadata,
1634                version=version,
1635                level=level,
1636                status_message=status_message,
1637            )

Update the current active span with new information.

This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.

Arguments:
  • name: The span name
  • input: Updated input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the span
Example:
with langfuse.start_as_current_span(name="process-data") as span:
    # Initial processing
    result = process_first_part()

    # Update with intermediate results
    langfuse.update_current_span(metadata={"intermediate_result": result})

    # Continue processing
    final_result = process_second_part(result)

    # Final update
    langfuse.update_current_span(output=final_result)
def update_current_trace( self, *, name: Optional[str] = None, user_id: Optional[str] = None, session_id: Optional[str] = None, version: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, tags: Optional[List[str]] = None, public: Optional[bool] = None) -> None:
1639    def update_current_trace(
1640        self,
1641        *,
1642        name: Optional[str] = None,
1643        user_id: Optional[str] = None,
1644        session_id: Optional[str] = None,
1645        version: Optional[str] = None,
1646        input: Optional[Any] = None,
1647        output: Optional[Any] = None,
1648        metadata: Optional[Any] = None,
1649        tags: Optional[List[str]] = None,
1650        public: Optional[bool] = None,
1651    ) -> None:
1652        """Update the current trace with additional information.
1653
1654        Args:
1655            name: Updated name for the Langfuse trace
1656            user_id: ID of the user who initiated the Langfuse trace
1657            session_id: Session identifier for grouping related Langfuse traces
1658            version: Version identifier for the application or service
1659            input: Input data for the overall Langfuse trace
1660            output: Output data from the overall Langfuse trace
1661            metadata: Additional metadata to associate with the Langfuse trace
1662            tags: List of tags to categorize the Langfuse trace
1663            public: Whether the Langfuse trace should be publicly accessible
1664
1665        See Also:
1666            :func:`langfuse.propagate_attributes`: Recommended replacement
1667        """
1668        if not self._tracing_enabled:
1669            langfuse_logger.debug(
1670                "Operation skipped: update_current_trace - Tracing is disabled or client is in no-op mode."
1671            )
1672            return
1673
1674        current_otel_span = self._get_current_otel_span()
1675
1676        if current_otel_span is not None and current_otel_span.is_recording():
1677            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1678                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1679            )
1680            # We need to preserve the class to keep the correct observation type
1681            span_class = self._get_span_class(existing_observation_type)
1682            span = span_class(
1683                otel_span=current_otel_span,
1684                langfuse_client=self,
1685                environment=self._environment,
1686            )
1687
1688            span.update_trace(
1689                name=name,
1690                user_id=user_id,
1691                session_id=session_id,
1692                version=version,
1693                input=input,
1694                output=output,
1695                metadata=metadata,
1696                tags=tags,
1697                public=public,
1698            )

Update the current trace with additional information.

Arguments:
  • name: Updated name for the Langfuse trace
  • user_id: ID of the user who initiated the Langfuse trace
  • session_id: Session identifier for grouping related Langfuse traces
  • version: Version identifier for the application or service
  • input: Input data for the overall Langfuse trace
  • output: Output data from the overall Langfuse trace
  • metadata: Additional metadata to associate with the Langfuse trace
  • tags: List of tags to categorize the Langfuse trace
  • public: Whether the Langfuse trace should be publicly accessible
See Also:
  • langfuse.propagate_attributes(): Recommended replacement

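Example:
A minimal sketch of attaching trace-level attributes from within an active span; the IDs, tags, and metadata values below are illustrative placeholders.

with langfuse.start_as_current_span(name="handle-request") as span:
    # Attach attributes to the enclosing trace as they become known
    langfuse.update_current_trace(
        user_id="user-123",
        session_id="session-abc",
        tags=["production"],
        metadata={"request_source": "mobile-app"},
    )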
def create_event( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseEvent:
1700    def create_event(
1701        self,
1702        *,
1703        trace_context: Optional[TraceContext] = None,
1704        name: str,
1705        input: Optional[Any] = None,
1706        output: Optional[Any] = None,
1707        metadata: Optional[Any] = None,
1708        version: Optional[str] = None,
1709        level: Optional[SpanLevel] = None,
1710        status_message: Optional[str] = None,
1711    ) -> LangfuseEvent:
1712        """Create a new Langfuse observation of type 'EVENT'.
1713
1714        The created Langfuse Event observation will be the child of the current span in the context.
1715
1716        Args:
1717            trace_context: Optional context for connecting to an existing trace
1718            name: Name of the span (e.g., function or operation name)
1719            input: Input data for the operation (can be any JSON-serializable object)
1720            output: Output data from the operation (can be any JSON-serializable object)
1721            metadata: Additional metadata to associate with the span
1722            version: Version identifier for the code or component
1723            level: Importance level of the span (info, warning, error)
1724            status_message: Optional status message for the span
1725
1726        Returns:
1727            The Langfuse Event object
1728
1729        Example:
1730            ```python
1731            event = langfuse.create_event(name="process-event")
1732            ```
1733        """
1734        timestamp = time_ns()
1735
1736        if trace_context:
1737            trace_id = trace_context.get("trace_id", None)
1738            parent_span_id = trace_context.get("parent_span_id", None)
1739
1740            if trace_id:
1741                remote_parent_span = self._create_remote_parent_span(
1742                    trace_id=trace_id, parent_span_id=parent_span_id
1743                )
1744
1745                with otel_trace_api.use_span(
1746                    cast(otel_trace_api.Span, remote_parent_span)
1747                ):
1748                    otel_span = self._otel_tracer.start_span(
1749                        name=name, start_time=timestamp
1750                    )
1751                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1752
1753                    return cast(
1754                        LangfuseEvent,
1755                        LangfuseEvent(
1756                            otel_span=otel_span,
1757                            langfuse_client=self,
1758                            environment=self._environment,
1759                            input=input,
1760                            output=output,
1761                            metadata=metadata,
1762                            version=version,
1763                            level=level,
1764                            status_message=status_message,
1765                        ).end(end_time=timestamp),
1766                    )
1767
1768        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1769
1770        return cast(
1771            LangfuseEvent,
1772            LangfuseEvent(
1773                otel_span=otel_span,
1774                langfuse_client=self,
1775                environment=self._environment,
1776                input=input,
1777                output=output,
1778                metadata=metadata,
1779                version=version,
1780                level=level,
1781                status_message=status_message,
1782            ).end(end_time=timestamp),
1783        )

Create a new Langfuse observation of type 'EVENT'.

The created Langfuse Event observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the event (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the event
  • version: Version identifier for the code or component
  • level: Importance level of the event (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the event
Returns:

The Langfuse Event object

Example:
event = langfuse.create_event(name="process-event")
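
A fuller sketch, with illustrative names and payloads, recording an event as a child of the current span:

with langfuse.start_as_current_span(name="process-request"):
    # Record a point-in-time event under the active span
    langfuse.create_event(
        name="cache-miss",
        input={"key": "user:42"},
        output={"fallback": "database"},
        level="WARNING",
        status_message="Cache miss, falling back to database",
    )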
@staticmethod
def create_trace_id(*, seed: Optional[str] = None) -> str:
1872    @staticmethod
1873    def create_trace_id(*, seed: Optional[str] = None) -> str:
1874        """Create a unique trace ID for use with Langfuse.
1875
1876        This method generates a unique trace ID for use with various Langfuse APIs.
1877        It can either generate a random ID or create a deterministic ID based on
1878        a seed string.
1879
1880        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1881        This method ensures the generated ID meets this requirement. If you need to
1882        correlate an external ID with a Langfuse trace ID, use the external ID as the
1883        seed to get a valid, deterministic Langfuse trace ID.
1884
1885        Args:
1886            seed: Optional string to use as a seed for deterministic ID generation.
1887                 If provided, the same seed will always produce the same ID.
1888                 If not provided, a random ID will be generated.
1889
1890        Returns:
1891            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1892
1893        Example:
1894            ```python
1895            # Generate a random trace ID
1896            trace_id = langfuse.create_trace_id()
1897
1898            # Generate a deterministic ID based on a seed
1899            session_trace_id = langfuse.create_trace_id(seed="session-456")
1900
1901            # Correlate an external ID with a Langfuse trace ID
1902            external_id = "external-system-123456"
1903            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1904
1905            # Use the ID with trace context
1906            with langfuse.start_as_current_span(
1907                name="process-request",
1908                trace_context={"trace_id": trace_id}
1909            ) as span:
1910                # Operation will be part of the specific trace
1911                pass
1912            ```
1913        """
1914        if not seed:
1915            trace_id_int = RandomIdGenerator().generate_trace_id()
1916
1917            return Langfuse._format_otel_trace_id(trace_id_int)
1918
1919        return sha256(seed.encode("utf-8")).digest()[:16].hex()

Create a unique trace ID for use with Langfuse.

This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.

Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.

Arguments:
  • seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:

A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

Example:
# Generate a random trace ID
trace_id = langfuse.create_trace_id()

# Generate a deterministic ID based on a seed
session_trace_id = langfuse.create_trace_id(seed="session-456")

# Correlate an external ID with a Langfuse trace ID
external_id = "external-system-123456"
correlated_trace_id = langfuse.create_trace_id(seed=external_id)

# Use the ID with trace context
with langfuse.start_as_current_span(
    name="process-request",
    trace_context={"trace_id": trace_id}
) as span:
    # Operation will be part of the specific trace
    pass
def create_score( self, *, name: str, value: Union[float, str], session_id: Optional[str] = None, dataset_run_id: Optional[str] = None, trace_id: Optional[str] = None, observation_id: Optional[str] = None, score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None, timestamp: Optional[datetime.datetime] = None) -> None:
1997    def create_score(
1998        self,
1999        *,
2000        name: str,
2001        value: Union[float, str],
2002        session_id: Optional[str] = None,
2003        dataset_run_id: Optional[str] = None,
2004        trace_id: Optional[str] = None,
2005        observation_id: Optional[str] = None,
2006        score_id: Optional[str] = None,
2007        data_type: Optional[ScoreDataType] = None,
2008        comment: Optional[str] = None,
2009        config_id: Optional[str] = None,
2010        metadata: Optional[Any] = None,
2011        timestamp: Optional[datetime] = None,
2012    ) -> None:
2013        """Create a score for a specific trace or observation.
2014
2015        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
2016        used to track quality metrics, user feedback, or automated evaluations.
2017
2018        Args:
2019            name: Name of the score (e.g., "relevance", "accuracy")
2020            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2021            session_id: ID of the Langfuse session to associate the score with
2022            dataset_run_id: ID of the Langfuse dataset run to associate the score with
2023            trace_id: ID of the Langfuse trace to associate the score with
2024            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
2025            score_id: Optional custom ID for the score (auto-generated if not provided)
2026            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2027            comment: Optional comment or explanation for the score
2028            config_id: Optional ID of a score config defined in Langfuse
2029            metadata: Optional metadata to be attached to the score
2030            timestamp: Optional timestamp for the score (defaults to current UTC time)
2031
2032        Example:
2033            ```python
2034            # Create a numeric score for accuracy
2035            langfuse.create_score(
2036                name="accuracy",
2037                value=0.92,
2038                trace_id="abcdef1234567890abcdef1234567890",
2039                data_type="NUMERIC",
2040                comment="High accuracy with minor irrelevant details"
2041            )
2042
2043            # Create a categorical score for sentiment
2044            langfuse.create_score(
2045                name="sentiment",
2046                value="positive",
2047                trace_id="abcdef1234567890abcdef1234567890",
2048                observation_id="abcdef1234567890",
2049                data_type="CATEGORICAL"
2050            )
2051            ```
2052        """
2053        if not self._tracing_enabled:
2054            return
2055
2056        score_id = score_id or self._create_observation_id()
2057
2058        try:
2059            new_body = ScoreBody(
2060                id=score_id,
2061                sessionId=session_id,
2062                datasetRunId=dataset_run_id,
2063                traceId=trace_id,
2064                observationId=observation_id,
2065                name=name,
2066                value=value,
2067                dataType=data_type,  # type: ignore
2068                comment=comment,
2069                configId=config_id,
2070                environment=self._environment,
2071                metadata=metadata,
2072            )
2073
2074            event = {
2075                "id": self.create_trace_id(),
2076                "type": "score-create",
2077                "timestamp": timestamp or _get_timestamp(),
2078                "body": new_body,
2079            }
2080
2081            if self._resources is not None:
2082                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
2083                force_sample = (
2084                    not self._is_valid_trace_id(trace_id) if trace_id else True
2085                )
2086
2087                self._resources.add_score_task(
2088                    event,
2089                    force_sample=force_sample,
2090                )
2091
2092        except Exception as e:
2093            langfuse_logger.exception(
2094                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
2095            )

Create a score for a specific trace or observation.

This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • session_id: ID of the Langfuse session to associate the score with
  • dataset_run_id: ID of the Langfuse dataset run to associate the score with
  • trace_id: ID of the Langfuse trace to associate the score with
  • observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
  • timestamp: Optional timestamp for the score (defaults to current UTC time)
Example:
# Create a numeric score for accuracy
langfuse.create_score(
    name="accuracy",
    value=0.92,
    trace_id="abcdef1234567890abcdef1234567890",
    data_type="NUMERIC",
    comment="High accuracy with minor irrelevant details"
)

# Create a categorical score for sentiment
langfuse.create_score(
    name="sentiment",
    value="positive",
    trace_id="abcdef1234567890abcdef1234567890",
    observation_id="abcdef1234567890",
    data_type="CATEGORICAL"
)
def score_current_span( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None) -> None:
2121    def score_current_span(
2122        self,
2123        *,
2124        name: str,
2125        value: Union[float, str],
2126        score_id: Optional[str] = None,
2127        data_type: Optional[ScoreDataType] = None,
2128        comment: Optional[str] = None,
2129        config_id: Optional[str] = None,
2130    ) -> None:
2131        """Create a score for the current active span.
2132
2133        This method scores the currently active span in the context. It's a convenient
2134        way to score the current operation without needing to know its trace and span IDs.
2135
2136        Args:
2137            name: Name of the score (e.g., "relevance", "accuracy")
2138            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2139            score_id: Optional custom ID for the score (auto-generated if not provided)
2140            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2141            comment: Optional comment or explanation for the score
2142            config_id: Optional ID of a score config defined in Langfuse
2143
2144        Example:
2145            ```python
2146            with langfuse.start_as_current_generation(name="answer-query") as generation:
2147                # Generate answer
2148                response = generate_answer(...)
2149                generation.update(output=response)
2150
2151                # Score the generation
2152                langfuse.score_current_span(
2153                    name="relevance",
2154                    value=0.85,
2155                    data_type="NUMERIC",
2156                    comment="Mostly relevant but contains some tangential information"
2157                )
2158            ```
2159        """
2160        current_span = self._get_current_otel_span()
2161
2162        if current_span is not None:
2163            trace_id = self._get_otel_trace_id(current_span)
2164            observation_id = self._get_otel_span_id(current_span)
2165
2166            langfuse_logger.info(
2167                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
2168            )
2169
2170            self.create_score(
2171                trace_id=trace_id,
2172                observation_id=observation_id,
2173                name=name,
2174                value=cast(str, value),
2175                score_id=score_id,
2176                data_type=cast(Literal["CATEGORICAL"], data_type),
2177                comment=comment,
2178                config_id=config_id,
2179            )

Create a score for the current active span.

This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Generate answer
    response = generate_answer(...)
    generation.update(output=response)

    # Score the generation
    langfuse.score_current_span(
        name="relevance",
        value=0.85,
        data_type="NUMERIC",
        comment="Mostly relevant but contains some tangential information"
    )
def score_current_trace( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None) -> None:
2205    def score_current_trace(
2206        self,
2207        *,
2208        name: str,
2209        value: Union[float, str],
2210        score_id: Optional[str] = None,
2211        data_type: Optional[ScoreDataType] = None,
2212        comment: Optional[str] = None,
2213        config_id: Optional[str] = None,
2214    ) -> None:
2215        """Create a score for the current trace.
2216
2217        This method scores the trace of the currently active span. Unlike score_current_span,
2218        this method associates the score with the entire trace rather than a specific span.
2219        It's useful for scoring overall performance or quality of the entire operation.
2220
2221        Args:
2222            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2223            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2224            score_id: Optional custom ID for the score (auto-generated if not provided)
2225            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2226            comment: Optional comment or explanation for the score
2227            config_id: Optional ID of a score config defined in Langfuse
2228
2229        Example:
2230            ```python
2231            with langfuse.start_as_current_span(name="process-user-request") as span:
2232                # Process request
2233                result = process_complete_request()
2234                span.update(output=result)
2235
2236                # Score the overall trace
2237                langfuse.score_current_trace(
2238                    name="overall_quality",
2239                    value=0.95,
2240                    data_type="NUMERIC",
2241                    comment="High quality end-to-end response"
2242                )
2243            ```
2244        """
2245        current_span = self._get_current_otel_span()
2246
2247        if current_span is not None:
2248            trace_id = self._get_otel_trace_id(current_span)
2249
2250            langfuse_logger.info(
2251                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2252            )
2253
2254            self.create_score(
2255                trace_id=trace_id,
2256                name=name,
2257                value=cast(str, value),
2258                score_id=score_id,
2259                data_type=cast(Literal["CATEGORICAL"], data_type),
2260                comment=comment,
2261                config_id=config_id,
2262            )

Create a score for the current trace.

This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation.

Arguments:
  • name: Name of the score (e.g., "user_satisfaction", "overall_quality")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
Example:
with langfuse.start_as_current_span(name="process-user-request") as span:
    # Process request
    result = process_complete_request()
    span.update(output=result)

    # Score the overall trace
    langfuse.score_current_trace(
        name="overall_quality",
        value=0.95,
        data_type="NUMERIC",
        comment="High quality end-to-end response"
    )
def flush(self) -> None:
2264    def flush(self) -> None:
2265        """Force flush all pending spans and events to the Langfuse API.
2266
2267        This method manually flushes any pending spans, scores, and other events to the
2268        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2269        before proceeding, without waiting for the automatic flush interval.
2270
2271        Example:
2272            ```python
2273            # Record some spans and scores
2274            with langfuse.start_as_current_span(name="operation") as span:
2275                # Do work...
2276                pass
2277
2278            # Ensure all data is sent to Langfuse before proceeding
2279            langfuse.flush()
2280
2281            # Continue with other work
2282            ```
2283        """
2284        if self._resources is not None:
2285            self._resources.flush()

Force flush all pending spans and events to the Langfuse API.

This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.

Example:
# Record some spans and scores
with langfuse.start_as_current_span(name="operation") as span:
    # Do work...
    pass

# Ensure all data is sent to Langfuse before proceeding
langfuse.flush()

# Continue with other work
def shutdown(self) -> None:
2287    def shutdown(self) -> None:
2288        """Shut down the Langfuse client and flush all pending data.
2289
2290        This method cleanly shuts down the Langfuse client, ensuring all pending data
2291        is flushed to the API and all background threads are properly terminated.
2292
2293        It's important to call this method when your application is shutting down to
2294        prevent data loss and resource leaks. For most applications, using the client
2295        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2296
2297        Example:
2298            ```python
2299            # Initialize Langfuse
2300            langfuse = Langfuse(public_key="...", secret_key="...")
2301
2302            # Use Langfuse throughout your application
2303            # ...
2304
2305            # When application is shutting down
2306            langfuse.shutdown()
2307            ```
2308        """
2309        if self._resources is not None:
2310            self._resources.shutdown()

Shut down the Langfuse client and flush all pending data.

This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.

It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.

Example:
# Initialize Langfuse
langfuse = Langfuse(public_key="...", secret_key="...")

# Use Langfuse throughout your application
# ...

# When application is shutting down
langfuse.shutdown()
def get_current_trace_id(self) -> Optional[str]:
2312    def get_current_trace_id(self) -> Optional[str]:
2313        """Get the trace ID of the current active span.
2314
2315        This method retrieves the trace ID from the currently active span in the context.
2316        It can be used to get the trace ID for referencing in logs, external systems,
2317        or for creating related operations.
2318
2319        Returns:
2320            The current trace ID as a 32-character lowercase hexadecimal string,
2321            or None if there is no active span.
2322
2323        Example:
2324            ```python
2325            with langfuse.start_as_current_span(name="process-request") as span:
2326                # Get the current trace ID for reference
2327                trace_id = langfuse.get_current_trace_id()
2328
2329                # Use it for external correlation
2330                log.info(f"Processing request with trace_id: {trace_id}")
2331
2332                # Or pass to another system
2333                external_system.process(data, trace_id=trace_id)
2334            ```
2335        """
2336        if not self._tracing_enabled:
2337            langfuse_logger.debug(
2338                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2339            )
2340            return None
2341
2342        current_otel_span = self._get_current_otel_span()
2343
2344        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None

Get the trace ID of the current active span.

This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.

Returns:

The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_span(name="process-request") as span:
    # Get the current trace ID for reference
    trace_id = langfuse.get_current_trace_id()

    # Use it for external correlation
    log.info(f"Processing request with trace_id: {trace_id}")

    # Or pass to another system
    external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
2346    def get_current_observation_id(self) -> Optional[str]:
2347        """Get the observation ID (span ID) of the current active span.
2348
2349        This method retrieves the observation ID from the currently active span in the context.
2350        It can be used to get the observation ID for referencing in logs, external systems,
2351        or for creating scores or other related operations.
2352
2353        Returns:
2354            The current observation ID as a 16-character lowercase hexadecimal string,
2355            or None if there is no active span.
2356
2357        Example:
2358            ```python
2359            with langfuse.start_as_current_span(name="process-user-query") as span:
2360                # Get the current observation ID
2361                observation_id = langfuse.get_current_observation_id()
2362
2363                # Store it for later reference
2364                cache.set(f"query_{query_id}_observation", observation_id)
2365
2366                # Process the query...
2367            ```
2368        """
2369        if not self._tracing_enabled:
2370            langfuse_logger.debug(
2371                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2372            )
2373            return None
2374
2375        current_otel_span = self._get_current_otel_span()
2376
2377        return self._get_otel_span_id(current_otel_span) if current_otel_span else None

Get the observation ID (span ID) of the current active span.

This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.

Returns:

The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_span(name="process-user-query") as span:
    # Get the current observation ID
    observation_id = langfuse.get_current_observation_id()

    # Store it for later reference
    cache.set(f"query_{query_id}_observation", observation_id)

    # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2390    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2391        """Get the URL to view a trace in the Langfuse UI.
2392
2393        This method generates a URL that links directly to a trace in the Langfuse UI.
2394        It's useful for providing links in logs, notifications, or debugging tools.
2395
2396        Args:
2397            trace_id: Optional trace ID to generate a URL for. If not provided,
2398                     the trace ID of the current active span will be used.
2399
2400        Returns:
2401            A URL string pointing to the trace in the Langfuse UI,
2402            or None if the project ID couldn't be retrieved or no trace ID is available.
2403
2404        Example:
2405            ```python
2406            # Get URL for the current trace
2407            with langfuse.start_as_current_span(name="process-request") as span:
2408                trace_url = langfuse.get_trace_url()
2409                log.info(f"Processing trace: {trace_url}")
2410
2411            # Get URL for a specific trace
2412            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2413            send_notification(f"Review needed for trace: {specific_trace_url}")
2414            ```
2415        """
2416        project_id = self._get_project_id()
2417        final_trace_id = trace_id or self.get_current_trace_id()
2418
2419        return (
2420            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2421            if project_id and final_trace_id
2422            else None
2423        )

Get the URL to view a trace in the Langfuse UI.

This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.

Arguments:
  • trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:

A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.

Example:
# Get URL for the current trace
with langfuse.start_as_current_span(name="process-request") as span:
    trace_url = langfuse.get_trace_url()
    log.info(f"Processing trace: {trace_url}")

# Get URL for a specific trace
specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
send_notification(f"Review needed for trace: {specific_trace_url}")
def get_dataset( self, name: str, *, fetch_items_page_size: Optional[int] = 50) -> langfuse._client.datasets.DatasetClient:
2425    def get_dataset(
2426        self, name: str, *, fetch_items_page_size: Optional[int] = 50
2427    ) -> "DatasetClient":
2428        """Fetch a dataset by its name.
2429
2430        Args:
2431            name (str): The name of the dataset to fetch.
2432            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2433
2434        Returns:
2435            DatasetClient: The dataset with the given name.
2436        """
2437        try:
2438            langfuse_logger.debug(f"Getting datasets {name}")
2439            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2440
2441            dataset_items = []
2442            page = 1
2443
2444            while True:
2445                new_items = self.api.dataset_items.list(
2446                    dataset_name=self._url_encode(name, is_url_param=True),
2447                    page=page,
2448                    limit=fetch_items_page_size,
2449                )
2450                dataset_items.extend(new_items.data)
2451
2452                if new_items.meta.total_pages <= page:
2453                    break
2454
2455                page += 1
2456
2457            items = [DatasetItemClient(i, langfuse=self) for i in dataset_items]
2458
2459            return DatasetClient(dataset, items=items)
2460
2461        except Error as e:
2462            handle_fern_exception(e)
2463            raise e

Fetch a dataset by its name.

Arguments:
  • name (str): The name of the dataset to fetch.
  • fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
Returns:

DatasetClient: The dataset with the given name.

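Example:
A minimal sketch of fetching a dataset and iterating its items; the dataset name is illustrative, and the item attributes shown follow the Langfuse dataset item model.

dataset = langfuse.get_dataset("my-eval-dataset", fetch_items_page_size=100)

for item in dataset.items:
    # Each dataset item carries the stored input and expected output
    print(item.input, item.expected_output)
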
def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse._client.datasets.DatasetItemClient]], task: langfuse.experiment.TaskFunction, evaluators: List[langfuse.experiment.EvaluatorFunction] = [], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, run_evaluators: List[langfuse.experiment.RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None) -> langfuse.experiment.ExperimentResult:
2465    def run_experiment(
2466        self,
2467        *,
2468        name: str,
2469        run_name: Optional[str] = None,
2470        description: Optional[str] = None,
2471        data: ExperimentData,
2472        task: TaskFunction,
2473        evaluators: List[EvaluatorFunction] = [],
2474        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2475        run_evaluators: List[RunEvaluatorFunction] = [],
2476        max_concurrency: int = 50,
2477        metadata: Optional[Dict[str, str]] = None,
2478    ) -> ExperimentResult:
2479        """Run an experiment on a dataset with automatic tracing and evaluation.
2480
2481        This method executes a task function on each item in the provided dataset,
2482        automatically traces all executions with Langfuse for observability, runs
2483        item-level and run-level evaluators on the outputs, and returns comprehensive
2484        results with evaluation metrics.
2485
2486        The experiment system provides:
2487        - Automatic tracing of all task executions
2488        - Concurrent processing with configurable limits
2489        - Comprehensive error handling that isolates failures
2490        - Integration with Langfuse datasets for experiment tracking
2491        - Flexible evaluation framework supporting both sync and async evaluators
2492
2493        Args:
2494            name: Human-readable name for the experiment. Used for identification
2495                in the Langfuse UI.
2496            run_name: Optional exact name for the experiment run. If provided, this will be
2497                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2498                If not provided, this will default to the experiment name appended with an ISO timestamp.
2499            description: Optional description explaining the experiment's purpose,
2500                methodology, or expected outcomes.
2501            data: Array of data items to process. Can be either:
2502                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2503                - List of Langfuse DatasetItem objects from dataset.items
2504            task: Function that processes each data item and returns output.
2505                Must accept 'item' as keyword argument and can return sync or async results.
2506                The task function signature should be: task(*, item, **kwargs) -> Any
2507            evaluators: List of functions to evaluate each item's output individually.
2508                Each evaluator receives input, output, expected_output, and metadata.
2509                Can return single Evaluation dict or list of Evaluation dicts.
2510            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2511                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2512                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2513                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2514            run_evaluators: List of functions to evaluate the entire experiment run.
2515                Each run evaluator receives all item_results and can compute aggregate metrics.
2516                Useful for calculating averages, distributions, or cross-item comparisons.
2517            max_concurrency: Maximum number of concurrent task executions (default: 50).
2518                Controls the number of items processed simultaneously. Adjust based on
2519                API rate limits and system resources.
2520            metadata: Optional metadata dictionary to attach to all experiment traces.
2521                This metadata will be included in every trace created during the experiment.
2522                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2523
2524        Returns:
2525            ExperimentResult containing:
2526            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2527            - item_results: List of results for each processed item with outputs and evaluations
2528            - run_evaluations: List of aggregate evaluation results for the entire run
2529            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2530            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2531
2532        Raises:
2533            ValueError: If required parameters are missing or invalid
2534            Exception: If experiment setup fails (individual item failures are handled gracefully)
2535
2536        Examples:
2537            Basic experiment with local data:
2538            ```python
2539            def summarize_text(*, item, **kwargs):
2540                return f"Summary: {item['input'][:50]}..."
2541
2542            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2543                return {
2544                    "name": "output_length",
2545                    "value": len(output),
2546                    "comment": f"Output contains {len(output)} characters"
2547                }
2548
2549            result = langfuse.run_experiment(
2550                name="Text Summarization Test",
2551                description="Evaluate summarization quality and length",
2552                data=[
2553                    {"input": "Long article text...", "expected_output": "Expected summary"},
2554                    {"input": "Another article...", "expected_output": "Another summary"}
2555                ],
2556                task=summarize_text,
2557                evaluators=[length_evaluator]
2558            )
2559
2560            print(f"Processed {len(result.item_results)} items")
2561            for item_result in result.item_results:
2562                print(f"Input: {item_result.item['input']}")
2563                print(f"Output: {item_result.output}")
2564                print(f"Evaluations: {item_result.evaluations}")
2565            ```
2566
2567            Advanced experiment with async task and multiple evaluators:
2568            ```python
2569            async def llm_task(*, item, **kwargs):
2570                # Simulate async LLM call
2571                response = await openai_client.chat.completions.create(
2572                    model="gpt-4",
2573                    messages=[{"role": "user", "content": item["input"]}]
2574                )
2575                return response.choices[0].message.content
2576
2577            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2578                if expected_output and expected_output.lower() in output.lower():
2579                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2580                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2581
2582            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2583                # Simulate toxicity check
2584                toxicity_score = check_toxicity(output)  # Your toxicity checker
2585                return {
2586                    "name": "toxicity",
2587                    "value": toxicity_score,
2588                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2589                }
2590
2591            def average_accuracy(*, item_results, **kwargs):
2592                accuracies = [
2593                    eval.value for result in item_results
2594                    for eval in result.evaluations
2595                    if eval.name == "accuracy"
2596                ]
2597                return {
2598                    "name": "average_accuracy",
2599                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2600                    "comment": f"Average accuracy across {len(accuracies)} items"
2601                }
2602
2603            result = langfuse.run_experiment(
2604                name="LLM Safety and Accuracy Test",
2605                description="Evaluate model accuracy and safety across diverse prompts",
2606                data=test_dataset,  # Your dataset items
2607                task=llm_task,
2608                evaluators=[accuracy_evaluator, toxicity_evaluator],
2609                run_evaluators=[average_accuracy],
2610                max_concurrency=5,  # Limit concurrent API calls
2611                metadata={"model": "gpt-4", "temperature": 0.7}
2612            )
2613            ```
2614
2615            Using with Langfuse datasets:
2616            ```python
2617            # Get dataset from Langfuse
2618            dataset = langfuse.get_dataset("my-eval-dataset")
2619
2620            result = dataset.run_experiment(
2621                name="Production Model Evaluation",
2622                description="Monthly evaluation of production model performance",
2623                task=my_production_task,
2624                evaluators=[accuracy_evaluator, latency_evaluator]
2625            )
2626
2627            # Results automatically linked to dataset in Langfuse UI
2628            print(f"View results: {result['dataset_run_url']}")
2629            ```
2630
2631        Note:
2632            - Task and evaluator functions can be either synchronous or asynchronous
2633            - Individual item failures are logged but don't stop the experiment
2634            - All executions are automatically traced and visible in Langfuse UI
2635            - When using Langfuse datasets, results are automatically linked for easy comparison
2636            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2637            - Async execution is handled automatically with smart event loop detection
2638        """
2639        return cast(
2640            ExperimentResult,
2641            run_async_safely(
2642                self._run_experiment_async(
2643                    name=name,
2644                    run_name=self._create_experiment_run_name(
2645                        name=name, run_name=run_name
2646                    ),
2647                    description=description,
2648                    data=data,
2649                    task=task,
2650                    evaluators=evaluators or [],
2651                    composite_evaluator=composite_evaluator,
2652                    run_evaluators=run_evaluators or [],
2653                    max_concurrency=max_concurrency,
2654                    metadata=metadata,
2655                ),
2656            ),
2657        )

Run an experiment on a dataset with automatic tracing and evaluation.

This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.

The experiment system provides:

  • Automatic tracing of all task executions
  • Concurrent processing with configurable limits
  • Comprehensive error handling that isolates failures
  • Integration with Langfuse datasets for experiment tracking
  • Flexible evaluation framework supporting both sync and async evaluators
Arguments:
  • name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
  • run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the data contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
  • description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
  • data: Array of data items to process. Can be either:
    • List of dict-like items with 'input', 'expected_output', 'metadata' keys
    • List of Langfuse DatasetItem objects from dataset.items
  • task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: `task(*, item, **kwargs) -> Any`
  • evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
  • composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
  • run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
  • max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
  • metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If the data items are Langfuse dataset items, the metadata will also be attached to the dataset run.
Returns:

ExperimentResult containing:

  • run_name: The experiment run name. This equals the dataset run name if the experiment ran on a Langfuse dataset.
  • item_results: List of results for each processed item with outputs and evaluations
  • run_evaluations: List of aggregate evaluation results for the entire run
  • dataset_run_id: ID of the dataset run (if using Langfuse datasets)
  • dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
  • ValueError: If required parameters are missing or invalid
  • Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:

Basic experiment with local data:

def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")

Advanced experiment with async task and multiple evaluators:

async def llm_task(*, item, **kwargs):
    # Simulate async LLM call
    response = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": item["input"]}]
    )
    return response.choices[0].message.content

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output and expected_output.lower() in output.lower():
        return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
    return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
    # Simulate toxicity check
    toxicity_score = check_toxicity(output)  # Your toxicity checker
    return {
        "name": "toxicity",
        "value": toxicity_score,
        "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
    }

def average_accuracy(*, item_results, **kwargs):
    accuracies = [
        eval.value for result in item_results
        for eval in result.evaluations
        if eval.name == "accuracy"
    ]
    return {
        "name": "average_accuracy",
        "value": sum(accuracies) / len(accuracies) if accuracies else 0,
        "comment": f"Average accuracy across {len(accuracies)} items"
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    description="Evaluate model accuracy and safety across diverse prompts",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    run_evaluators=[average_accuracy],
    max_concurrency=5,  # Limit concurrent API calls
    metadata={"model": "gpt-4", "temperature": 0.7}
)

Using with Langfuse datasets:

# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result['dataset_run_url']}")
Note:
  • Task and evaluator functions can be either synchronous or asynchronous
  • Individual item failures are logged but don't stop the experiment
  • All executions are automatically traced and visible in Langfuse UI
  • When using Langfuse datasets, results are automatically linked for easy comparison
  • This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
  • Async execution is handled automatically with smart event loop detection
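
The composite_evaluator parameter described above is not shown in these examples. A minimal sketch, assuming the accuracy_evaluator, toxicity_evaluator, llm_task, and test_dataset from the advanced example and assuming item-level results are passed in as Evaluation objects (as in the run-level evaluator example); the gate name and thresholds are illustrative only:

```python
def safety_gate(*, input, output, expected_output=None, metadata=None, evaluations, **kwargs):
    # Hypothetical composite evaluator: combine item-level scores into one pass/fail signal.
    # Assumes each element of `evaluations` exposes .name and .value attributes.
    scores = {e.name: e.value for e in evaluations}
    passed = scores.get("accuracy", 0.0) >= 1.0 and scores.get("toxicity", 1.0) <= 0.7
    return {
        "name": "safety_gate",
        "value": 1.0 if passed else 0.0,
        "comment": "pass" if passed else "fail",
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    data=test_dataset,
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    composite_evaluator=safety_gate,
)
```
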
def run_batched_evaluation( self, *, scope: Literal['traces', 'observations'], mapper: MapperFunction, filter: Optional[str] = None, fetch_batch_size: int = 50, max_items: Optional[int] = None, max_retries: int = 3, evaluators: List[langfuse.experiment.EvaluatorFunction], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, max_concurrency: int = 50, metadata: Optional[Dict[str, Any]] = None, resume_from: Optional[BatchEvaluationResumeToken] = None, verbose: bool = False) -> BatchEvaluationResult:
2995    def run_batched_evaluation(
2996        self,
2997        *,
2998        scope: Literal["traces", "observations"],
2999        mapper: MapperFunction,
3000        filter: Optional[str] = None,
3001        fetch_batch_size: int = 50,
3002        max_items: Optional[int] = None,
3003        max_retries: int = 3,
3004        evaluators: List[EvaluatorFunction],
3005        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
3006        max_concurrency: int = 50,
3007        metadata: Optional[Dict[str, Any]] = None,
3008        resume_from: Optional[BatchEvaluationResumeToken] = None,
3009        verbose: bool = False,
3010    ) -> BatchEvaluationResult:
3011        """Fetch traces or observations and run evaluations on each item.
3012
3013        This method provides a powerful way to evaluate existing data in Langfuse at scale.
3014        It fetches items based on filters, transforms them using a mapper function, runs
3015        evaluators on each item, and creates scores that are linked back to the original
3016        entities. This is ideal for:
3017
3018        - Running evaluations on production traces after deployment
3019        - Backtesting new evaluation metrics on historical data
3020        - Batch scoring of observations for quality monitoring
3021        - Periodic evaluation runs on recent data
3022
3023        The method uses a streaming/pipeline approach to process items in batches, making
3024        it memory-efficient for large datasets. It includes comprehensive error handling,
3025        retry logic, and resume capability for long-running evaluations.
3026
3027        Args:
3028            scope: The type of items to evaluate. Must be one of:
3029                - "traces": Evaluate complete traces with all their observations
3030                - "observations": Evaluate individual observations (spans, generations, events)
3031            mapper: Function that transforms API response objects into evaluator inputs.
3032                Receives a trace/observation object and returns an EvaluatorInputs
3033                instance with input, output, expected_output, and metadata fields.
3034                Can be sync or async.
3035            evaluators: List of evaluation functions to run on each item. Each evaluator
3036                receives the mapped inputs and returns Evaluation object(s). Evaluator
3037                failures are logged but don't stop the batch evaluation.
3038            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
3039                - '{"tags": ["production"]}'
3040                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
3041                Default: None (fetches all items).
3042            fetch_batch_size: Number of items to fetch per API call and hold in memory.
3043                Larger values may be faster but use more memory. Default: 50.
3044            max_items: Maximum total number of items to process. If None, processes all
3045                items matching the filter. Useful for testing or limiting evaluation runs.
3046                Default: None (process all).
3047            max_concurrency: Maximum number of items to evaluate concurrently. Controls
3048                parallelism and resource usage. Default: 50.
3049            composite_evaluator: Optional function that creates a composite score from
3050                item-level evaluations. Receives the original item and its evaluations,
3051                returns a single Evaluation. Useful for weighted averages or combined metrics.
3052                Default: None.
3053            metadata: Optional metadata dict to add to all created scores. Useful for
3054                tracking evaluation runs, versions, or other context. Default: None.
3055            max_retries: Maximum number of retry attempts for failed batch fetches.
3056                Uses exponential backoff (1s, 2s, 4s). Default: 3.
3057            verbose: If True, logs progress information to console. Useful for monitoring
3058                long-running evaluations. Default: False.
3059            resume_from: Optional resume token from a previous incomplete run. Allows
3060                continuing evaluation after interruption or failure. Default: None.
3061
3062
3063        Returns:
3064            BatchEvaluationResult containing:
3065                - total_items_fetched: Number of items fetched from API
3066                - total_items_processed: Number of items successfully evaluated
3067                - total_items_failed: Number of items that failed evaluation
3068                - total_scores_created: Scores created by item-level evaluators
3069                - total_composite_scores_created: Scores created by composite evaluator
3070                - total_evaluations_failed: Individual evaluator failures
3071                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3072                - resume_token: Token for resuming if incomplete (None if completed)
3073                - completed: True if all items processed
3074                - duration_seconds: Total execution time
3075                - failed_item_ids: IDs of items that failed
3076                - error_summary: Error types and counts
3077                - has_more_items: True if max_items reached but more exist
3078
3079        Raises:
3080            ValueError: If invalid scope is provided.
3081
3082        Examples:
3083            Basic trace evaluation:
3084            ```python
3085            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3086
3087            client = Langfuse()
3088
3089            # Define mapper to extract fields from traces
3090            def trace_mapper(trace):
3091                return EvaluatorInputs(
3092                    input=trace.input,
3093                    output=trace.output,
3094                    expected_output=None,
3095                    metadata={"trace_id": trace.id}
3096                )
3097
3098            # Define evaluator
3099            def length_evaluator(*, input, output, expected_output, metadata):
3100                return Evaluation(
3101                    name="output_length",
3102                    value=len(output) if output else 0
3103                )
3104
3105            # Run batch evaluation
3106            result = client.run_batched_evaluation(
3107                scope="traces",
3108                mapper=trace_mapper,
3109                evaluators=[length_evaluator],
3110                filter='{"tags": ["production"]}',
3111                max_items=1000,
3112                verbose=True
3113            )
3114
3115            print(f"Processed {result.total_items_processed} traces")
3116            print(f"Created {result.total_scores_created} scores")
3117            ```
3118
3119            Evaluation with composite scorer:
3120            ```python
3121            def accuracy_evaluator(*, input, output, expected_output, metadata):
3122                # ... evaluation logic
3123                return Evaluation(name="accuracy", value=0.85)
3124
3125            def relevance_evaluator(*, input, output, expected_output, metadata):
3126                # ... evaluation logic
3127                return Evaluation(name="relevance", value=0.92)
3128
3129            def composite_evaluator(*, item, evaluations):
3130                # Weighted average of evaluations
3131                weights = {"accuracy": 0.6, "relevance": 0.4}
3132                total = sum(
3133                    e.value * weights.get(e.name, 0)
3134                    for e in evaluations
3135                    if isinstance(e.value, (int, float))
3136                )
3137                return Evaluation(
3138                    name="composite_score",
3139                    value=total,
3140                    comment=f"Weighted average of {len(evaluations)} metrics"
3141                )
3142
3143            result = client.run_batched_evaluation(
3144                scope="traces",
3145                mapper=trace_mapper,
3146                evaluators=[accuracy_evaluator, relevance_evaluator],
3147                composite_evaluator=composite_evaluator,
3148                filter='{"user_id": "important_user"}',
3149                verbose=True
3150            )
3151            ```
3152
3153            Handling incomplete runs with resume:
3154            ```python
3155            # Initial run that may fail or timeout
3156            result = client.run_batched_evaluation(
3157                scope="observations",
3158                mapper=obs_mapper,
3159                evaluators=[my_evaluator],
3160                max_items=10000,
3161                verbose=True
3162            )
3163
3164            # Check if incomplete
3165            if not result.completed and result.resume_token:
3166                print(f"Processed {result.resume_token.items_processed} items before interruption")
3167
3168                # Resume from where it left off
3169                result = client.run_batched_evaluation(
3170                    scope="observations",
3171                    mapper=obs_mapper,
3172                    evaluators=[my_evaluator],
3173                    resume_from=result.resume_token,
3174                    verbose=True
3175                )
3176
3177            print(f"Total items processed: {result.total_items_processed}")
3178            ```
3179
3180            Monitoring evaluator performance:
3181            ```python
3182            result = client.run_batched_evaluation(...)
3183
3184            for stats in result.evaluator_stats:
3185                success_rate = stats.successful_runs / stats.total_runs
3186                print(f"{stats.name}:")
3187                print(f"  Success rate: {success_rate:.1%}")
3188                print(f"  Scores created: {stats.total_scores_created}")
3189
3190                if stats.failed_runs > 0:
3191                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3192            ```
3193
3194        Note:
3195            - Evaluator failures are logged but don't stop the batch evaluation
3196            - Individual item failures are tracked but don't stop processing
3197            - Fetch failures are retried with exponential backoff
3198            - All scores are automatically flushed to Langfuse at the end
3199            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3200        """
3201        runner = BatchEvaluationRunner(self)
3202
3203        return cast(
3204            BatchEvaluationResult,
3205            run_async_safely(
3206                runner.run_async(
3207                    scope=scope,
3208                    mapper=mapper,
3209                    evaluators=evaluators,
3210                    filter=filter,
3211                    fetch_batch_size=fetch_batch_size,
3212                    max_items=max_items,
3213                    max_concurrency=max_concurrency,
3214                    composite_evaluator=composite_evaluator,
3215                    metadata=metadata,
3216                    max_retries=max_retries,
3217                    verbose=verbose,
3218                    resume_from=resume_from,
3219                )
3220            ),
3221        )

Fetch traces or observations and run evaluations on each item.

This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:

  • Running evaluations on production traces after deployment
  • Backtesting new evaluation metrics on historical data
  • Batch scoring of observations for quality monitoring
  • Periodic evaluation runs on recent data

The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.

Arguments:
  • scope: The type of items to evaluate. Must be one of:
    • "traces": Evaluate complete traces with all their observations
    • "observations": Evaluate individual observations (spans, generations, events)
  • mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
  • evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
  • filter: Optional JSON filter string for querying items (same format as Langfuse API). Default: None (fetches all items). Examples:
    • '{"tags": ["production"]}'
    • '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
  • fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
  • max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
  • max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 50.
  • composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
  • metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
  • max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
  • verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
  • resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:

BatchEvaluationResult containing:

  • total_items_fetched: Number of items fetched from API
  • total_items_processed: Number of items successfully evaluated
  • total_items_failed: Number of items that failed evaluation
  • total_scores_created: Scores created by item-level evaluators
  • total_composite_scores_created: Scores created by composite evaluator
  • total_evaluations_failed: Individual evaluator failures
  • evaluator_stats: Per-evaluator statistics (success rate, scores created)
  • resume_token: Token for resuming if incomplete (None if completed)
  • completed: True if all items processed
  • duration_seconds: Total execution time
  • failed_item_ids: IDs of items that failed
  • error_summary: Error types and counts
  • has_more_items: True if max_items reached but more exist

Raises:
  • ValueError: If invalid scope is provided.
Examples:

Basic trace evaluation:

from langfuse import Langfuse, EvaluatorInputs, Evaluation

client = Langfuse()

# Define mapper to extract fields from traces
def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,
        metadata={"trace_id": trace.id}
    )

# Define evaluator
def length_evaluator(*, input, output, expected_output, metadata):
    return Evaluation(
        name="output_length",
        value=len(output) if output else 0
    )

# Run batch evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    filter='{"tags": ["production"]}',
    max_items=1000,
    verbose=True
)

print(f"Processed {result.total_items_processed} traces")
print(f"Created {result.total_scores_created} scores")

Evaluation with composite scorer:

def accuracy_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="accuracy", value=0.85)

def relevance_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="relevance", value=0.92)

def composite_evaluator(*, item, evaluations):
    # Weighted average of evaluations
    weights = {"accuracy": 0.6, "relevance": 0.4}
    total = sum(
        e.value * weights.get(e.name, 0)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(
        name="composite_score",
        value=total,
        comment=f"Weighted average of {len(evaluations)} metrics"
    )

result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[accuracy_evaluator, relevance_evaluator],
    composite_evaluator=composite_evaluator,
    filter='{"user_id": "important_user"}',
    verbose=True
)

Handling incomplete runs with resume:

# Initial run that may fail or timeout
result = client.run_batched_evaluation(
    scope="observations",
    mapper=obs_mapper,
    evaluators=[my_evaluator],
    max_items=10000,
    verbose=True
)

# Check if incomplete
if not result.completed and result.resume_token:
    print(f"Processed {result.resume_token.items_processed} items before interruption")

    # Resume from where it left off
    result = client.run_batched_evaluation(
        scope="observations",
        mapper=obs_mapper,
        evaluators=[my_evaluator],
        resume_from=result.resume_token,
        verbose=True
    )

print(f"Total items processed: {result.total_items_processed}")

Monitoring evaluator performance:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs
    print(f"{stats.name}:")
    print(f"  Success rate: {success_rate:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  âš ī¸  Failed {stats.failed_runs} times")
Note:
  • Evaluator failures are logged but don't stop the batch evaluation
  • Individual item failures are tracked but don't stop processing
  • Fetch failures are retried with exponential backoff
  • All scores are automatically flushed to Langfuse at the end
  • The resume mechanism uses timestamp-based filtering to avoid duplicates
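
The resume example above uses an obs_mapper that is not defined there. A minimal sketch of such a mapper for scope="observations", assuming (by analogy with the trace mapper above) that fetched observation objects expose id, input, and output fields:

```python
from langfuse import EvaluatorInputs

def obs_mapper(observation):
    # Hypothetical mapper: field names assume observations carry input/output
    # like traces do; adjust to the fields your observations actually contain.
    return EvaluatorInputs(
        input=observation.input,
        output=observation.output,
        expected_output=None,
        metadata={"observation_id": observation.id},
    )
```
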
def auth_check(self) -> bool:
3223    def auth_check(self) -> bool:
3224        """Check if the provided credentials (public and secret key) are valid.
3225
3226        Raises:
3227            Exception: If no projects were found for the provided credentials.
3228
3229        Note:
3230            This method is blocking. It is discouraged to use it in production code.
3231        """
3232        try:
3233            projects = self.api.projects.get()
3234            langfuse_logger.debug(
3235                f"Auth check successful, found {len(projects.data)} projects"
3236            )
3237            if len(projects.data) == 0:
3238                raise Exception(
3239                    "Auth check failed, no project found for the keys provided."
3240                )
3241            return True
3242
3243        except AttributeError as e:
3244            langfuse_logger.warning(
3245                f"Auth check failed: Client not properly initialized. Error: {e}"
3246            )
3247            return False
3248
3249        except Error as e:
3250            handle_fern_exception(e)
3251            raise e

Check if the provided credentials (public and secret key) are valid.

Raises:
  • Exception: If no projects were found for the provided credentials.
Note:

This method is blocking; using it in production code is discouraged.
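
A minimal startup check might look as follows (a sketch; auth_check also raises if the keys resolve to no project or if the API call itself fails):

```python
from langfuse import Langfuse

langfuse = Langfuse()

# Verify credentials once at startup rather than on the hot path.
if not langfuse.auth_check():
    raise RuntimeError("Langfuse client is not properly initialized.")
```
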

def create_dataset( self, *, name: str, description: Optional[str] = None, metadata: Optional[Any] = None, input_schema: Optional[Any] = None, expected_output_schema: Optional[Any] = None) -> langfuse.api.Dataset:
3253    def create_dataset(
3254        self,
3255        *,
3256        name: str,
3257        description: Optional[str] = None,
3258        metadata: Optional[Any] = None,
3259        input_schema: Optional[Any] = None,
3260        expected_output_schema: Optional[Any] = None,
3261    ) -> Dataset:
3262        """Create a dataset with the given name on Langfuse.
3263
3264        Args:
3265            name: Name of the dataset to create.
3266            description: Description of the dataset. Defaults to None.
3267            metadata: Additional metadata. Defaults to None.
3268            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3269            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3270
3271        Returns:
3272            Dataset: The created dataset as returned by the Langfuse API.
3273        """
3274        try:
3275            body = CreateDatasetRequest(
3276                name=name,
3277                description=description,
3278                metadata=metadata,
3279                inputSchema=input_schema,
3280                expectedOutputSchema=expected_output_schema,
3281            )
3282            langfuse_logger.debug(f"Creating datasets {body}")
3283
3284            return self.api.datasets.create(request=body)
3285
3286        except Error as e:
3287            handle_fern_exception(e)
3288            raise e

Create a dataset with the given name on Langfuse.

Arguments:
  • name: Name of the dataset to create.
  • description: Description of the dataset. Defaults to None.
  • metadata: Additional metadata. Defaults to None.
  • input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
  • expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:

Dataset: The created dataset as returned by the Langfuse API.
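
A minimal sketch of creating a dataset with an input schema; the dataset name and the JSON Schema shown are illustrative only:

```python
from langfuse import Langfuse

langfuse = Langfuse()

dataset = langfuse.create_dataset(
    name="capital_cities",
    description="Country-to-capital questions",
    metadata={"owner": "eval-team"},
    # Hypothetical JSON Schema: every new item input must contain a "country" string.
    input_schema={
        "type": "object",
        "properties": {"country": {"type": "string"}},
        "required": ["country"],
    },
)
print(dataset.name)
```
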

def create_dataset_item( self, *, dataset_name: str, input: Optional[Any] = None, expected_output: Optional[Any] = None, metadata: Optional[Any] = None, source_trace_id: Optional[str] = None, source_observation_id: Optional[str] = None, status: Optional[langfuse.api.DatasetStatus] = None, id: Optional[str] = None) -> langfuse.api.DatasetItem:
3290    def create_dataset_item(
3291        self,
3292        *,
3293        dataset_name: str,
3294        input: Optional[Any] = None,
3295        expected_output: Optional[Any] = None,
3296        metadata: Optional[Any] = None,
3297        source_trace_id: Optional[str] = None,
3298        source_observation_id: Optional[str] = None,
3299        status: Optional[DatasetStatus] = None,
3300        id: Optional[str] = None,
3301    ) -> DatasetItem:
3302        """Create a dataset item.
3303
3304        Upserts if an item with id already exists.
3305
3306        Args:
3307            dataset_name: Name of the dataset in which the dataset item should be created.
3308            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3309            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3310            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3311            source_trace_id: Id of the source trace. Defaults to None.
3312            source_observation_id: Id of the source observation. Defaults to None.
3313            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3314            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3315
3316        Returns:
3317            DatasetItem: The created dataset item as returned by the Langfuse API.
3318
3319        Example:
3320            ```python
3321            from langfuse import Langfuse
3322
3323            langfuse = Langfuse()
3324
3325            # Uploading items to the Langfuse dataset named "capital_cities"
3326            langfuse.create_dataset_item(
3327                dataset_name="capital_cities",
3328                input={"input": {"country": "Italy"}},
3329                expected_output={"expected_output": "Rome"},
3330                metadata={"foo": "bar"}
3331            )
3332            ```
3333        """
3334        try:
3335            body = CreateDatasetItemRequest(
3336                datasetName=dataset_name,
3337                input=input,
3338                expectedOutput=expected_output,
3339                metadata=metadata,
3340                sourceTraceId=source_trace_id,
3341                sourceObservationId=source_observation_id,
3342                status=status,
3343                id=id,
3344            )
3345            langfuse_logger.debug(f"Creating dataset item {body}")
3346            return self.api.dataset_items.create(request=body)
3347        except Error as e:
3348            handle_fern_exception(e)
3349            raise e

Create a dataset item.

Upserts if an item with id already exists.

Arguments:
  • dataset_name: Name of the dataset in which the dataset item should be created.
  • input: Input data. Defaults to None. Can contain any dict, list or scalar.
  • expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
  • metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
  • source_trace_id: Id of the source trace. Defaults to None.
  • source_observation_id: Id of the source observation. Defaults to None.
  • status: Status of the dataset item. Defaults to ACTIVE for newly created items.
  • id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:

DatasetItem: The created dataset item as returned by the Langfuse API.

Example:
from langfuse import Langfuse

langfuse = Langfuse()

# Uploading items to the Langfuse dataset named "capital_cities"
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"input": {"country": "Italy"}},
    expected_output={"expected_output": "Rome"},
    metadata={"foo": "bar"}
)
def resolve_media_references( self, *, obj: Any, resolve_with: Literal['base64_data_uri'], max_depth: int = 10, content_fetch_timeout_seconds: int = 5) -> Any:
3351    def resolve_media_references(
3352        self,
3353        *,
3354        obj: Any,
3355        resolve_with: Literal["base64_data_uri"],
3356        max_depth: int = 10,
3357        content_fetch_timeout_seconds: int = 5,
3358    ) -> Any:
3359        """Replace media reference strings in an object with base64 data URIs.
3360
3361        This method recursively traverses an object (up to max_depth) looking for media reference strings
3362        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3363        the provided Langfuse client and replaces the reference string with a base64 data URI.
3364
3365        If fetching media content fails for a reference string, a warning is logged and the reference
3366        string is left unchanged.
3367
3368        Args:
3369            obj: The object to process. Can be a primitive value, array, or nested object.
3370                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3371            resolve_with: The representation of the media content to replace the media reference string with.
3372                Currently only "base64_data_uri" is supported.
3373            max_depth: int: The maximum depth to traverse the object. Default is 10.
3374            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3375
3376        Returns:
3377            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3378            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3379
3380        Example:
3381            obj = {
3382                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3383                "nested": {
3384                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3385                }
3386            }
3387
3388            result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
3389
3390            # Result:
3391            # {
3392            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3393            #     "nested": {
3394            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3395            #     }
3396            # }
3397        """
3398        return LangfuseMedia.resolve_media_references(
3399            langfuse_client=self,
3400            obj=obj,
3401            resolve_with=resolve_with,
3402            max_depth=max_depth,
3403            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3404        )

Replace media reference strings in an object with base64 data URIs.

This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.

If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.

Arguments:
  • obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
  • resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
  • max_depth: int: The maximum depth to traverse the object. Default is 10.
  • content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:

A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

Example:

obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }

result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)

Result:

{

"image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",

"nested": {

"pdf": "data:application/pdf;base64,JVBERi0xLjcK..."

}

}

def get_prompt( self, name: str, *, version: Optional[int] = None, label: Optional[str] = None, type: Literal['chat', 'text'] = 'text', cache_ttl_seconds: Optional[int] = None, fallback: Union[List[langfuse.model.ChatMessageDict], NoneType, str] = None, max_retries: Optional[int] = None, fetch_timeout_seconds: Optional[int] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3434    def get_prompt(
3435        self,
3436        name: str,
3437        *,
3438        version: Optional[int] = None,
3439        label: Optional[str] = None,
3440        type: Literal["chat", "text"] = "text",
3441        cache_ttl_seconds: Optional[int] = None,
3442        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3443        max_retries: Optional[int] = None,
3444        fetch_timeout_seconds: Optional[int] = None,
3445    ) -> PromptClient:
3446        """Get a prompt.
3447
3448        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3449        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3450        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3451        return the expired prompt as a fallback.
3452
3453        Args:
3454            name (str): The name of the prompt to retrieve.
3455
3456        Keyword Args:
3457            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3458            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3459            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3460            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3461            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3462            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3463            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3464            fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds by default.
3465
3466        Returns:
3467            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
3468            - TextPromptClient, if type argument is 'text'.
3469            - ChatPromptClient, if type argument is 'chat'.
3470
3471        Raises:
3472            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3473            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3474        """
3475        if self._resources is None:
3476            raise Error(
3477                "SDK is not correctly initialized. Check the init logs for more details."
3478            )
3479        if version is not None and label is not None:
3480            raise ValueError("Cannot specify both version and label at the same time.")
3481
3482        if not name:
3483            raise ValueError("Prompt name cannot be empty.")
3484
3485        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3486        bounded_max_retries = self._get_bounded_max_retries(
3487            max_retries, default_max_retries=2, max_retries_upper_bound=4
3488        )
3489
3490        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3491        cached_prompt = self._resources.prompt_cache.get(cache_key)
3492
3493        if cached_prompt is None or cache_ttl_seconds == 0:
3494            langfuse_logger.debug(
3495                f"Prompt '{cache_key}' not found in cache or caching disabled."
3496            )
3497            try:
3498                return self._fetch_prompt_and_update_cache(
3499                    name,
3500                    version=version,
3501                    label=label,
3502                    ttl_seconds=cache_ttl_seconds,
3503                    max_retries=bounded_max_retries,
3504                    fetch_timeout_seconds=fetch_timeout_seconds,
3505                )
3506            except Exception as e:
3507                if fallback:
3508                    langfuse_logger.warning(
3509                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3510                    )
3511
3512                    fallback_client_args: Dict[str, Any] = {
3513                        "name": name,
3514                        "prompt": fallback,
3515                        "type": type,
3516                        "version": version or 0,
3517                        "config": {},
3518                        "labels": [label] if label else [],
3519                        "tags": [],
3520                    }
3521
3522                    if type == "text":
3523                        return TextPromptClient(
3524                            prompt=Prompt_Text(**fallback_client_args),
3525                            is_fallback=True,
3526                        )
3527
3528                    if type == "chat":
3529                        return ChatPromptClient(
3530                            prompt=Prompt_Chat(**fallback_client_args),
3531                            is_fallback=True,
3532                        )
3533
3534                raise e
3535
3536        if cached_prompt.is_expired():
3537            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3538            try:
3539                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3540                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3541
3542                def refresh_task() -> None:
3543                    self._fetch_prompt_and_update_cache(
3544                        name,
3545                        version=version,
3546                        label=label,
3547                        ttl_seconds=cache_ttl_seconds,
3548                        max_retries=bounded_max_retries,
3549                        fetch_timeout_seconds=fetch_timeout_seconds,
3550                    )
3551
3552                self._resources.prompt_cache.add_refresh_prompt_task(
3553                    cache_key,
3554                    refresh_task,
3555                )
3556                langfuse_logger.debug(
3557                    f"Returning stale prompt '{cache_key}' from cache."
3558                )
3559                # return stale prompt
3560                return cached_prompt.value
3561
3562            except Exception as e:
3563                langfuse_logger.warning(
3564                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3565                )
3566                # creation of refresh prompt task failed, return stale prompt
3567                return cached_prompt.value
3568
3569        return cached_prompt.value

Get a prompt.

This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.

Arguments:
  • name (str): The name of the prompt to retrieve.
Keyword Args:

  • version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • label (Optional[str]): The label of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • cache_ttl_seconds (Optional[int]): Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
  • type (Literal["chat", "text"]): The type of the prompt to retrieve. Defaults to "text".
  • fallback (Union[Optional[List[ChatMessageDict]], Optional[str]]): The prompt to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
  • max_retries (Optional[int]): The maximum number of retries in case of API/network errors. Defaults to 2; the maximum value is 4. Retries use exponential backoff with a maximum delay of 10 seconds.
  • fetch_timeout_seconds (Optional[int]): The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds by default.

Returns:

The prompt object retrieved from the cache or directly fetched if not cached or expired of type

  • TextPromptClient, if type argument is 'text'.
  • ChatPromptClient, if type argument is 'chat'.
Raises:
  • Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
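
A minimal usage sketch; the prompt name, fallback template, and variable are illustrative, and compile() is assumed here to perform the double-curly-brace substitution on the returned prompt client:

```python
from langfuse import Langfuse

langfuse = Langfuse()

# Fetch the production-labeled text prompt; fall back to a hardcoded template
# if the very first fetch fails and nothing is cached yet.
prompt = langfuse.get_prompt(
    "movie-critic",  # hypothetical prompt name
    cache_ttl_seconds=300,
    fallback="Summarize {{topic}} in one sentence.",
)

text = prompt.compile(topic="the movie Dune")
```
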
def create_prompt( self, *, name: str, prompt: Union[str, List[Union[langfuse.model.ChatMessageDict, langfuse.model.ChatMessageWithPlaceholdersDict_Message, langfuse.model.ChatMessageWithPlaceholdersDict_Placeholder]]], labels: List[str] = [], tags: Optional[List[str]] = None, type: Optional[Literal['chat', 'text']] = 'text', config: Optional[Any] = None, commit_message: Optional[str] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3671    def create_prompt(
3672        self,
3673        *,
3674        name: str,
3675        prompt: Union[
3676            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3677        ],
3678        labels: List[str] = [],
3679        tags: Optional[List[str]] = None,
3680        type: Optional[Literal["chat", "text"]] = "text",
3681        config: Optional[Any] = None,
3682        commit_message: Optional[str] = None,
3683    ) -> PromptClient:
3684        """Create a new prompt in Langfuse.
3685
3686        Keyword Args:
3687            name : The name of the prompt to be created.
3688            prompt : The content of the prompt to be created.
3689            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3690            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3691            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3692            config: Additional structured data to be saved with the prompt. Defaults to None.
3693            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3694            commit_message: Optional string describing the change.
3695
3696        Returns:
3697            TextPromptClient: The prompt if type argument is 'text'.
3698            ChatPromptClient: The prompt if type argument is 'chat'.
3699        """
3700        try:
3701            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3702
3703            if type == "chat":
3704                if not isinstance(prompt, list):
3705                    raise ValueError(
3706                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3707                    )
3708                request: Union[CreatePromptRequest_Chat, CreatePromptRequest_Text] = (
3709                    CreatePromptRequest_Chat(
3710                        name=name,
3711                        prompt=cast(Any, prompt),
3712                        labels=labels,
3713                        tags=tags,
3714                        config=config or {},
3715                        commitMessage=commit_message,
3716                        type="chat",
3717                    )
3718                )
3719                server_prompt = self.api.prompts.create(request=request)
3720
3721                if self._resources is not None:
3722                    self._resources.prompt_cache.invalidate(name)
3723
3724                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3725
3726            if not isinstance(prompt, str):
3727                raise ValueError("For 'text' type, 'prompt' must be a string.")
3728
3729            request = CreatePromptRequest_Text(
3730                name=name,
3731                prompt=prompt,
3732                labels=labels,
3733                tags=tags,
3734                config=config or {},
3735                commitMessage=commit_message,
3736                type="text",
3737            )
3738
3739            server_prompt = self.api.prompts.create(request=request)
3740
3741            if self._resources is not None:
3742                self._resources.prompt_cache.invalidate(name)
3743
3744            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3745
3746        except Error as e:
3747            handle_fern_exception(e)
3748            raise e

Create a new prompt in Langfuse.

Keyword Args:

  • name: The name of the prompt to be created.
  • prompt: The content of the prompt to be created.
  • is_active [DEPRECATED]: A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
  • labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
  • tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
  • config: Additional structured data to be saved with the prompt. Defaults to None.
  • type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
  • commit_message: Optional string describing the change.

Returns:

  • TextPromptClient: The prompt if type argument is 'text'.
  • ChatPromptClient: The prompt if type argument is 'chat'.
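
A minimal sketch of creating a chat prompt that is served by default via the 'production' label; the prompt name, messages, and config are illustrative only:

```python
from langfuse import Langfuse

langfuse = Langfuse()

prompt = langfuse.create_prompt(
    name="movie-critic-chat",  # hypothetical prompt name
    type="chat",
    prompt=[
        {"role": "system", "content": "You are a film critic."},
        {"role": "user", "content": "Review {{movie}} in two sentences."},
    ],
    labels=["production"],  # serve this version by default
    config={"model": "gpt-4", "temperature": 0.7},
    commit_message="Initial version",
)
```
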

def update_prompt(self, *, name: str, version: int, new_labels: List[str] = []) -> Any:
3750    def update_prompt(
3751        self,
3752        *,
3753        name: str,
3754        version: int,
3755        new_labels: List[str] = [],
3756    ) -> Any:
3757        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
3758
3759        Args:
3760            name (str): The name of the prompt to update.
3761            version (int): The version number of the prompt to update.
3762            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3763
3764        Returns:
3765            Prompt: The updated prompt from the Langfuse API.
3766
3767        """
3768        updated_prompt = self.api.prompt_version.update(
3769            name=self._url_encode(name),
3770            version=version,
3771            new_labels=new_labels,
3772        )
3773
3774        if self._resources is not None:
3775            self._resources.prompt_cache.invalidate(name)
3776
3777        return updated_prompt

Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.

Arguments:
  • name (str): The name of the prompt to update.
  • version (int): The version number of the prompt to update.
  • new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:

Prompt: The updated prompt from the Langfuse API.
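
A minimal sketch of promoting an existing version to the 'production' label; the prompt name and version number are illustrative only:

```python
from langfuse import Langfuse

langfuse = Langfuse()

updated = langfuse.update_prompt(
    name="movie-critic",  # hypothetical prompt name
    version=3,
    new_labels=["production"],  # labels are unique across versions
)
```
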

def clear_prompt_cache(self) -> None:
3792    def clear_prompt_cache(self) -> None:
3793        """Clear the entire prompt cache, removing all cached prompts.
3794
3795        This method is useful when you want to force a complete refresh of all
3796        cached prompts, for example after major updates or when you need to
3797        ensure the latest versions are fetched from the server.
3798        """
3799        if self._resources is not None:
3800            self._resources.prompt_cache.clear()

Clear the entire prompt cache, removing all cached prompts.

This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.
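
For example, after bulk-editing prompts one might clear the cache so that subsequent fetches hit the server again (a sketch; the prompt name is illustrative):

```python
langfuse.clear_prompt_cache()

# The next call fetches a fresh copy from the server and repopulates the cache.
prompt = langfuse.get_prompt("movie-critic")
```
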

def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 60def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 61    """Get or create a Langfuse client instance.
 62
 63    Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups,
 64    providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
 65
 66    Behavior:
 67    - Single project: Returns existing client or creates new one
 68    - Multi-project: Requires public_key to return specific client
 69    - No public_key in multi-project: Returns disabled client to prevent data leakage
 70
 71    The function uses a singleton pattern per public_key to conserve resources and maintain state.
 72
 73    Args:
 74        public_key (Optional[str]): Project identifier
 75            - With key: Returns client for that project
 76            - Without key: Returns single client or disabled client if multiple exist
 77
 78    Returns:
 79        Langfuse: Client instance in one of three states:
 80            1. Client for specified public_key
 81            2. Default client for single-project setup
 82            3. Disabled client when multiple projects exist without key
 83
 84    Security:
 85        Disables tracing when multiple projects exist without explicit key to prevent
 86        cross-project data leakage. Multi-project setups are experimental.
 87
 88    Example:
 89        ```python
 90        # Single project
 91        client = get_client()  # Default client
 92
 93        # In multi-project usage:
 94        client_a = get_client(public_key="project_a_key")  # Returns project A's client
 95        client_b = get_client(public_key="project_b_key")  # Returns project B's client
 96
 97        # Without specific key in multi-project setup:
 98        client = get_client()  # Returns disabled client for safety
 99        ```
100    """
101    with LangfuseResourceManager._lock:
102        active_instances = LangfuseResourceManager._instances
103
104        # If no explicit public_key provided, check execution context
105        if not public_key:
106            public_key = _current_public_key.get(None)
107
108        if not public_key:
109            if len(active_instances) == 0:
110                # No clients initialized yet, create default instance
111                return Langfuse()
112
113            if len(active_instances) == 1:
114                # Only one client exists, safe to use without specifying key
115                instance = list(active_instances.values())[0]
116
117                # Initialize with the credentials bound to the instance
118                # This is important if the original instance was instantiated
119                # via constructor arguments
120                return _create_client_from_instance(instance)
121
122            else:
123                # Multiple clients exist but no key specified - disable tracing
124                # to prevent cross-project data leakage
125                langfuse_logger.warning(
126                    "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
127                )
128                return Langfuse(
129                    tracing_enabled=False, public_key="fake", secret_key="fake"
130                )
131
132        else:
133            # Specific key provided, look up existing instance
134            target_instance: Optional[LangfuseResourceManager] = active_instances.get(
135                public_key, None
136            )
137
138            if target_instance is None:
139                # No instance found with this key - client not initialized properly
140                langfuse_logger.warning(
141                    f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function."
142                )
143                return Langfuse(
144                    tracing_enabled=False, public_key="fake", secret_key="fake"
145                )
146
147            # target_instance is guaranteed to be not None at this point
148            return _create_client_from_instance(target_instance, public_key)

Get or create a Langfuse client instance.

Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.

Behavior:

  • Single project: Returns existing client or creates new one
  • Multi-project: Requires public_key to return specific client
  • No public_key in multi-project: Returns disabled client to prevent data leakage

The function uses a singleton pattern per public_key to conserve resources and maintain state.

Arguments:
  • public_key (Optional[str]): Project identifier
    • With key: Returns client for that project
    • Without key: Returns single client or disabled client if multiple exist
Returns:

Langfuse: Client instance in one of three states:
  1. Client for the specified public_key
  2. Default client for a single-project setup
  3. Disabled client when multiple projects exist without a key

Security:

Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.

Example:
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
def observe( func: Optional[~F] = None, *, name: Optional[str] = None, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], NoneType] = None, capture_input: Optional[bool] = None, capture_output: Optional[bool] = None, transform_to_string: Optional[Callable[[Iterable], str]] = None) -> Union[~F, Callable[[~F], ~F]]:
 90    def observe(
 91        self,
 92        func: Optional[F] = None,
 93        *,
 94        name: Optional[str] = None,
 95        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
 96        capture_input: Optional[bool] = None,
 97        capture_output: Optional[bool] = None,
 98        transform_to_string: Optional[Callable[[Iterable], str]] = None,
 99    ) -> Union[F, Callable[[F], F]]:
100        """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
101
102        This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates
103        spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator
104        intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
105
106        Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application,
107        enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
108
109        Args:
110            func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
111            name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
112            as_type (Optional[Literal]): Set the observation type. Supported values:
113                    "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail".
114                    Observation types are highlighted in the Langfuse UI for filtering and visualization.
115                    The types "generation" and "embedding" create a span on which additional attributes such as model metrics
116                    can be set.
117
118        Returns:
119            Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
120
121        Example:
122            For general function tracing with automatic naming:
123            ```python
124            @observe()
125            def process_user_request(user_id, query):
126                # Function is automatically traced with name "process_user_request"
127                return get_response(query)
128            ```
129
130            For language model generation tracking:
131            ```python
132            @observe(name="answer-generation", as_type="generation")
133            async def generate_answer(query):
134                # Creates a generation-type span with extended LLM metrics
135                response = await openai.chat.completions.create(
136                    model="gpt-4",
137                    messages=[{"role": "user", "content": query}]
138                )
139                return response.choices[0].message.content
140            ```
141
142            For trace context propagation between functions:
143            ```python
144            @observe()
145            def main_process():
146                # Parent span is created
147                return sub_process()  # Child span automatically connected to parent
148
149            @observe()
150            def sub_process():
151                # Automatically becomes a child span of main_process
152                return "result"
153            ```
154
155        Raises:
156            Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
157
158        Notes:
159            - The decorator preserves the original function's signature, docstring, and return type.
160            - Proper parent-child relationships between spans are automatically maintained.
161            - Special keyword arguments can be passed to control tracing:
162              - langfuse_trace_id: Explicitly set the trace ID for this function call
163              - langfuse_parent_observation_id: Explicitly set the parent span ID
164              - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
165            - For async functions, the decorator returns an async function wrapper.
166            - For sync functions, the decorator returns a synchronous wrapper.
167        """
168        valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
169        if as_type is not None and as_type not in valid_types:
170            self._log.warning(
171                f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'."
172            )
173            as_type = "span"
174
175        function_io_capture_enabled = os.environ.get(
176            LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True"
177        ).lower() not in ("false", "0")
178
179        should_capture_input = (
180            capture_input if capture_input is not None else function_io_capture_enabled
181        )
182
183        should_capture_output = (
184            capture_output
185            if capture_output is not None
186            else function_io_capture_enabled
187        )
188
189        def decorator(func: F) -> F:
190            return (
191                self._async_observe(
192                    func,
193                    name=name,
194                    as_type=as_type,
195                    capture_input=should_capture_input,
196                    capture_output=should_capture_output,
197                    transform_to_string=transform_to_string,
198                )
199                if asyncio.iscoroutinefunction(func)
200                else self._sync_observe(
201                    func,
202                    name=name,
203                    as_type=as_type,
204                    capture_input=should_capture_input,
205                    capture_output=should_capture_output,
206                    transform_to_string=transform_to_string,
207                )
208            )
209
210        """Handle decorator with or without parentheses.
211
212        This logic enables the decorator to work both with and without parentheses:
213        - @observe - Python passes the function directly to the decorator
214        - @observe() - Python calls the decorator first, which must return a function decorator
215
216        When called without arguments (@observe), the func parameter contains the function to decorate,
217        so we directly apply the decorator to it. When called with parentheses (@observe()),
218        func is None, so we return the decorator function itself for Python to apply in the next step.
219        """
220        if func is None:
221            return decorator
222        else:
223            return decorator(func)

Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.

This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.

Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.

Arguments:
  • func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
  • name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
  • as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:

Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.

Example:

For general function tracing with automatic naming:

@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)

For language model generation tracking:

@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content

For trace context propagation between functions:

@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
Raises:
  • Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
  • The decorator preserves the original function's signature, docstring, and return type.
  • Proper parent-child relationships between spans are automatically maintained.
  • Special keyword arguments can be passed to control tracing (see the example after these notes):
    • langfuse_trace_id: Explicitly set the trace ID for this function call
    • langfuse_parent_observation_id: Explicitly set the parent span ID
    • langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
  • For async functions, the decorator returns an async function wrapper.
  • For sync functions, the decorator returns a synchronous wrapper.
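
A brief sketch of the control keyword arguments. The trace ID value is illustrative; depending on the SDK version, the langfuse_* kwargs are consumed by the decorator rather than forwarded to the wrapped function:

@observe()
def handle_request(query, **kwargs):
    return get_response(query)

# Attach this call to an existing trace (32-character lowercase hex, W3C trace ID format).
handle_request(
    "What is Langfuse?",
    langfuse_trace_id="0123456789abcdef0123456789abcdef",
)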
def propagate_attributes( *, user_id: Optional[str] = None, session_id: Optional[str] = None, metadata: Optional[Dict[str, str]] = None, version: Optional[str] = None, tags: Optional[List[str]] = None, as_baggage: bool = False) -> opentelemetry.util._decorator._AgnosticContextManager[typing.Any]:
 74def propagate_attributes(
 75    *,
 76    user_id: Optional[str] = None,
 77    session_id: Optional[str] = None,
 78    metadata: Optional[Dict[str, str]] = None,
 79    version: Optional[str] = None,
 80    tags: Optional[List[str]] = None,
 81    as_baggage: bool = False,
 82) -> _AgnosticContextManager[Any]:
 83    """Propagate trace-level attributes to all spans created within this context.
 84
 85    This context manager sets attributes on the currently active span AND automatically
 86    propagates them to all new child spans created within the context. This is the
 87    recommended way to set trace-level attributes like user_id, session_id, and metadata
 88    dimensions that should be consistently applied across all observations in a trace.
 89
 90    **IMPORTANT**: Call this as early as possible within your trace/workflow. Only the
 91    currently active span and spans created after entering this context will have these
 92    attributes. Pre-existing spans will NOT be retroactively updated.
 93
 94    **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id,
 95    filtering by session_id) only include observations that have the attribute set.
 96    If you call `propagate_attributes` late in your workflow, earlier spans won't be
 97    included in aggregations for that attribute.
 98
 99    Args:
100        user_id: User identifier to associate with all spans in this context.
101            Must be US-ASCII string, ≤200 characters. Use this to track which user
102            generated each trace and enable e.g. per-user cost/performance analysis.
103        session_id: Session identifier to associate with all spans in this context.
104            Must be US-ASCII string, ≤200 characters. Use this to group related traces
105            within a user session (e.g., a conversation thread, multi-turn interaction).
106        metadata: Additional key-value metadata to propagate to all spans.
107            - Keys and values must be US-ASCII strings
108            - All values must be ≤200 characters
109            - Use for dimensions like internal correlating identifiers
110            - AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
 111        version: Version identifier for parts of your application that are independently versioned, e.g. agents
112        tags: List of tags to categorize the group of observations
113        as_baggage: If True, propagates attributes using OpenTelemetry baggage for
114            cross-process/service propagation. **Security warning**: When enabled,
115            attribute values are added to HTTP headers on ALL outbound requests.
116            Only enable if values are safe to transmit via HTTP headers and you need
117            cross-service tracing. Default: False.
118
119    Returns:
120        Context manager that propagates attributes to all child spans.
121
122    Example:
123        Basic usage with user and session tracking:
124
125        ```python
126        from langfuse import Langfuse
127
128        langfuse = Langfuse()
129
130        # Set attributes early in the trace
131        with langfuse.start_as_current_span(name="user_workflow") as span:
132            with langfuse.propagate_attributes(
133                user_id="user_123",
134                session_id="session_abc",
135                metadata={"experiment": "variant_a", "environment": "production"}
136            ):
137                # All spans created here will have user_id, session_id, and metadata
138                with langfuse.start_span(name="llm_call") as llm_span:
139                    # This span inherits: user_id, session_id, experiment, environment
140                    ...
141
142                with langfuse.start_generation(name="completion") as gen:
143                    # This span also inherits all attributes
144                    ...
145        ```
146
147        Late propagation (anti-pattern):
148
149        ```python
150        with langfuse.start_as_current_span(name="workflow") as span:
151            # These spans WON'T have user_id
152            early_span = langfuse.start_span(name="early_work")
153            early_span.end()
154
155            # Set attributes in the middle
156            with langfuse.propagate_attributes(user_id="user_123"):
157                # Only spans created AFTER this point will have user_id
158                late_span = langfuse.start_span(name="late_work")
159                late_span.end()
160
161            # Result: Aggregations by user_id will miss "early_work" span
162        ```
163
164        Cross-service propagation with baggage (advanced):
165
166        ```python
167        # Service A - originating service
168        with langfuse.start_as_current_span(name="api_request"):
169            with langfuse.propagate_attributes(
170                user_id="user_123",
171                session_id="session_abc",
172                as_baggage=True  # Propagate via HTTP headers
173            ):
174                # Make HTTP request to Service B
175                response = requests.get("https://service-b.example.com/api")
176                # user_id and session_id are now in HTTP headers
177
178        # Service B - downstream service
179        # OpenTelemetry will automatically extract baggage from HTTP headers
180        # and propagate to spans in Service B
181        ```
182
183    Note:
184        - **Validation**: All attribute values (user_id, session_id, metadata values)
185          must be strings ≤200 characters. Invalid values will be dropped with a
186          warning logged. Ensure values meet constraints before calling.
187        - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood,
188          making it compatible with other OTel-instrumented libraries.
189
190    Raises:
191        No exceptions are raised. Invalid values are logged as warnings and dropped.
192    """
193    return _propagate_attributes(
194        user_id=user_id,
195        session_id=session_id,
196        metadata=metadata,
197        version=version,
198        tags=tags,
199        as_baggage=as_baggage,
200    )

Propagate trace-level attributes to all spans created within this context.

This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, and metadata dimensions that should be consistently applied across all observations in a trace.

IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.

Why this matters: Langfuse aggregation queries (e.g., total cost by user_id, filtering by session_id) only include observations that have the attribute set. If you call propagate_attributes late in your workflow, earlier spans won't be included in aggregations for that attribute.

Arguments:
  • user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
  • session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
  • metadata: Additional key-value metadata to propagate to all spans.
    • Keys and values must be US-ASCII strings
    • All values must be ≤200 characters
    • Use for dimensions like internal correlating identifiers
    • AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
  • version: Version identifier for parts of your application that are independently versioned, e.g. agents
  • tags: List of tags to categorize the group of observations
  • as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:

Context manager that propagates attributes to all child spans.

Example:

Basic usage with user and session tracking:

from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_span(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        metadata={"experiment": "variant_a", "environment": "production"}
    ):
        # All spans created here will have user_id, session_id, and metadata
        with langfuse.start_span(name="llm_call") as llm_span:
            # This span inherits: user_id, session_id, experiment, environment
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...

Late propagation (anti-pattern):

with langfuse.start_as_current_span(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_span(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_span(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span

Cross-service propagation with baggage (advanced):

# Service A - originating service
with langfuse.start_as_current_span(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id and session_id are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate to spans in Service B
Note:
  • Validation: All attribute values (user_id, session_id, metadata values) must be strings ≤200 characters. Invalid values will be dropped with a warning logged. Ensure values meet constraints before calling (see the sanitization sketch below).
  • OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
  • No exceptions are raised. Invalid values are logged as warnings and dropped.
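
A small sanitization sketch matching the constraints above; the helper name and the raw_user_id variable are illustrative:

def to_propagatable(value: str) -> str:
    # Strip non-US-ASCII characters and cap at 200 characters so the value is not dropped.
    return value.encode("ascii", errors="ignore").decode("ascii")[:200]

with langfuse.propagate_attributes(user_id=to_propagatable(raw_user_id)):
    ...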
ObservationTypeLiteral = typing.Union[typing.Literal['generation', 'embedding'], typing.Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], typing.Literal['event']]
class LangfuseSpan(langfuse._client.span.LangfuseObservationWrapper):
1156class LangfuseSpan(LangfuseObservationWrapper):
1157    """Standard span implementation for general operations in Langfuse.
1158
1159    This class represents a general-purpose span that can be used to trace
1160    any operation in your application. It extends the base LangfuseObservationWrapper
1161    with specific methods for creating child spans, generations, and updating
1162    span-specific attributes. If possible, use a more specific type for
1163    better observability and insights.
1164    """
1165
1166    def __init__(
1167        self,
1168        *,
1169        otel_span: otel_trace_api.Span,
1170        langfuse_client: "Langfuse",
1171        input: Optional[Any] = None,
1172        output: Optional[Any] = None,
1173        metadata: Optional[Any] = None,
1174        environment: Optional[str] = None,
1175        version: Optional[str] = None,
1176        level: Optional[SpanLevel] = None,
1177        status_message: Optional[str] = None,
1178    ):
1179        """Initialize a new LangfuseSpan.
1180
1181        Args:
1182            otel_span: The OpenTelemetry span to wrap
1183            langfuse_client: Reference to the parent Langfuse client
1184            input: Input data for the span (any JSON-serializable object)
1185            output: Output data from the span (any JSON-serializable object)
1186            metadata: Additional metadata to associate with the span
1187            environment: The tracing environment
1188            version: Version identifier for the code or component
1189            level: Importance level of the span (info, warning, error)
1190            status_message: Optional status message for the span
1191        """
1192        super().__init__(
1193            otel_span=otel_span,
1194            as_type="span",
1195            langfuse_client=langfuse_client,
1196            input=input,
1197            output=output,
1198            metadata=metadata,
1199            environment=environment,
1200            version=version,
1201            level=level,
1202            status_message=status_message,
1203        )
1204
1205    def start_span(
1206        self,
1207        name: str,
1208        input: Optional[Any] = None,
1209        output: Optional[Any] = None,
1210        metadata: Optional[Any] = None,
1211        version: Optional[str] = None,
1212        level: Optional[SpanLevel] = None,
1213        status_message: Optional[str] = None,
1214    ) -> "LangfuseSpan":
1215        """Create a new child span.
1216
1217        This method creates a new child span with this span as the parent.
1218        Unlike start_as_current_span(), this method does not set the new span
1219        as the current span in the context.
1220
1221        Args:
1222            name: Name of the span (e.g., function or operation name)
1223            input: Input data for the operation
1224            output: Output data from the operation
1225            metadata: Additional metadata to associate with the span
1226            version: Version identifier for the code or component
1227            level: Importance level of the span (info, warning, error)
1228            status_message: Optional status message for the span
1229
1230        Returns:
1231            A new LangfuseSpan that must be ended with .end() when complete
1232
1233        Example:
1234            ```python
1235            parent_span = langfuse.start_span(name="process-request")
1236            try:
1237                # Create a child span
1238                child_span = parent_span.start_span(name="validate-input")
1239                try:
1240                    # Do validation work
1241                    validation_result = validate(request_data)
1242                    child_span.update(output=validation_result)
1243                finally:
1244                    child_span.end()
1245
1246                # Continue with parent span
1247                result = process_validated_data(validation_result)
1248                parent_span.update(output=result)
1249            finally:
1250                parent_span.end()
1251            ```
1252        """
1253        return self.start_observation(
1254            name=name,
1255            as_type="span",
1256            input=input,
1257            output=output,
1258            metadata=metadata,
1259            version=version,
1260            level=level,
1261            status_message=status_message,
1262        )
1263
1264    def start_as_current_span(
1265        self,
1266        *,
1267        name: str,
1268        input: Optional[Any] = None,
1269        output: Optional[Any] = None,
1270        metadata: Optional[Any] = None,
1271        version: Optional[str] = None,
1272        level: Optional[SpanLevel] = None,
1273        status_message: Optional[str] = None,
1274    ) -> _AgnosticContextManager["LangfuseSpan"]:
1275        """[DEPRECATED] Create a new child span and set it as the current span in a context manager.
1276
1277        DEPRECATED: This method is deprecated and will be removed in a future version.
1278        Use start_as_current_observation(as_type='span') instead.
1279
1280        This method creates a new child span and sets it as the current span within
1281        a context manager. It should be used with a 'with' statement to automatically
1282        manage the span's lifecycle.
1283
1284        Args:
1285            name: Name of the span (e.g., function or operation name)
1286            input: Input data for the operation
1287            output: Output data from the operation
1288            metadata: Additional metadata to associate with the span
1289            version: Version identifier for the code or component
1290            level: Importance level of the span (info, warning, error)
1291            status_message: Optional status message for the span
1292
1293        Returns:
1294            A context manager that yields a new LangfuseSpan
1295
1296        Example:
1297            ```python
1298            with langfuse.start_as_current_span(name="process-request") as parent_span:
1299                # Parent span is active here
1300
1301                # Create a child span with context management
1302                with parent_span.start_as_current_span(name="validate-input") as child_span:
1303                    # Child span is active here
1304                    validation_result = validate(request_data)
1305                    child_span.update(output=validation_result)
1306
1307                # Back to parent span context
1308                result = process_validated_data(validation_result)
1309                parent_span.update(output=result)
1310            ```
1311        """
1312        warnings.warn(
1313            "start_as_current_span is deprecated and will be removed in a future version. "
1314            "Use start_as_current_observation(as_type='span') instead.",
1315            DeprecationWarning,
1316            stacklevel=2,
1317        )
1318        return self.start_as_current_observation(
1319            name=name,
1320            as_type="span",
1321            input=input,
1322            output=output,
1323            metadata=metadata,
1324            version=version,
1325            level=level,
1326            status_message=status_message,
1327        )
1328
1329    def start_generation(
1330        self,
1331        *,
1332        name: str,
1333        input: Optional[Any] = None,
1334        output: Optional[Any] = None,
1335        metadata: Optional[Any] = None,
1336        version: Optional[str] = None,
1337        level: Optional[SpanLevel] = None,
1338        status_message: Optional[str] = None,
1339        completion_start_time: Optional[datetime] = None,
1340        model: Optional[str] = None,
1341        model_parameters: Optional[Dict[str, MapValue]] = None,
1342        usage_details: Optional[Dict[str, int]] = None,
1343        cost_details: Optional[Dict[str, float]] = None,
1344        prompt: Optional[PromptClient] = None,
1345    ) -> "LangfuseGeneration":
1346        """[DEPRECATED] Create a new child generation span.
1347
1348        DEPRECATED: This method is deprecated and will be removed in a future version.
1349        Use start_observation(as_type='generation') instead.
1350
1351        This method creates a new child generation span with this span as the parent.
1352        Generation spans are specialized for AI/LLM operations and include additional
1353        fields for model information, usage stats, and costs.
1354
1355        Unlike start_as_current_generation(), this method does not set the new span
1356        as the current span in the context.
1357
1358        Args:
1359            name: Name of the generation operation
1360            input: Input data for the model (e.g., prompts)
1361            output: Output from the model (e.g., completions)
1362            metadata: Additional metadata to associate with the generation
1363            version: Version identifier for the model or component
1364            level: Importance level of the generation (info, warning, error)
1365            status_message: Optional status message for the generation
1366            completion_start_time: When the model started generating the response
1367            model: Name/identifier of the AI model used (e.g., "gpt-4")
1368            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1369            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1370            cost_details: Cost information for the model call
1371            prompt: Associated prompt template from Langfuse prompt management
1372
1373        Returns:
1374            A new LangfuseGeneration that must be ended with .end() when complete
1375
1376        Example:
1377            ```python
1378            span = langfuse.start_span(name="process-query")
1379            try:
1380                # Create a generation child span
1381                generation = span.start_generation(
1382                    name="generate-answer",
1383                    model="gpt-4",
1384                    input={"prompt": "Explain quantum computing"}
1385                )
1386                try:
1387                    # Call model API
1388                    response = llm.generate(...)
1389
1390                    generation.update(
1391                        output=response.text,
1392                        usage_details={
1393                            "prompt_tokens": response.usage.prompt_tokens,
1394                            "completion_tokens": response.usage.completion_tokens
1395                        }
1396                    )
1397                finally:
1398                    generation.end()
1399
1400                # Continue with parent span
1401                span.update(output={"answer": response.text, "source": "gpt-4"})
1402            finally:
1403                span.end()
1404            ```
1405        """
1406        warnings.warn(
1407            "start_generation is deprecated and will be removed in a future version. "
1408            "Use start_observation(as_type='generation') instead.",
1409            DeprecationWarning,
1410            stacklevel=2,
1411        )
1412        return self.start_observation(
1413            name=name,
1414            as_type="generation",
1415            input=input,
1416            output=output,
1417            metadata=metadata,
1418            version=version,
1419            level=level,
1420            status_message=status_message,
1421            completion_start_time=completion_start_time,
1422            model=model,
1423            model_parameters=model_parameters,
1424            usage_details=usage_details,
1425            cost_details=cost_details,
1426            prompt=prompt,
1427        )
1428
1429    def start_as_current_generation(
1430        self,
1431        *,
1432        name: str,
1433        input: Optional[Any] = None,
1434        output: Optional[Any] = None,
1435        metadata: Optional[Any] = None,
1436        version: Optional[str] = None,
1437        level: Optional[SpanLevel] = None,
1438        status_message: Optional[str] = None,
1439        completion_start_time: Optional[datetime] = None,
1440        model: Optional[str] = None,
1441        model_parameters: Optional[Dict[str, MapValue]] = None,
1442        usage_details: Optional[Dict[str, int]] = None,
1443        cost_details: Optional[Dict[str, float]] = None,
1444        prompt: Optional[PromptClient] = None,
1445    ) -> _AgnosticContextManager["LangfuseGeneration"]:
1446        """[DEPRECATED] Create a new child generation span and set it as the current span in a context manager.
1447
1448        DEPRECATED: This method is deprecated and will be removed in a future version.
1449        Use start_as_current_observation(as_type='generation') instead.
1450
1451        This method creates a new child generation span and sets it as the current span
1452        within a context manager. Generation spans are specialized for AI/LLM operations
1453        and include additional fields for model information, usage stats, and costs.
1454
1455        Args:
1456            name: Name of the generation operation
1457            input: Input data for the model (e.g., prompts)
1458            output: Output from the model (e.g., completions)
1459            metadata: Additional metadata to associate with the generation
1460            version: Version identifier for the model or component
1461            level: Importance level of the generation (info, warning, error)
1462            status_message: Optional status message for the generation
1463            completion_start_time: When the model started generating the response
1464            model: Name/identifier of the AI model used (e.g., "gpt-4")
1465            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1466            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1467            cost_details: Cost information for the model call
1468            prompt: Associated prompt template from Langfuse prompt management
1469
1470        Returns:
1471            A context manager that yields a new LangfuseGeneration
1472
1473        Example:
1474            ```python
1475            with langfuse.start_as_current_span(name="process-request") as span:
1476                # Prepare data
1477                query = preprocess_user_query(user_input)
1478
1479                # Create a generation span with context management
1480                with span.start_as_current_generation(
1481                    name="generate-answer",
1482                    model="gpt-4",
1483                    input={"query": query}
1484                ) as generation:
1485                    # Generation span is active here
1486                    response = llm.generate(query)
1487
1488                    # Update with results
1489                    generation.update(
1490                        output=response.text,
1491                        usage_details={
1492                            "prompt_tokens": response.usage.prompt_tokens,
1493                            "completion_tokens": response.usage.completion_tokens
1494                        }
1495                    )
1496
1497                # Back to parent span context
1498                span.update(output={"answer": response.text, "source": "gpt-4"})
1499            ```
1500        """
1501        warnings.warn(
1502            "start_as_current_generation is deprecated and will be removed in a future version. "
1503            "Use start_as_current_observation(as_type='generation') instead.",
1504            DeprecationWarning,
1505            stacklevel=2,
1506        )
1507        return self.start_as_current_observation(
1508            name=name,
1509            as_type="generation",
1510            input=input,
1511            output=output,
1512            metadata=metadata,
1513            version=version,
1514            level=level,
1515            status_message=status_message,
1516            completion_start_time=completion_start_time,
1517            model=model,
1518            model_parameters=model_parameters,
1519            usage_details=usage_details,
1520            cost_details=cost_details,
1521            prompt=prompt,
1522        )
1523
1524    def create_event(
1525        self,
1526        *,
1527        name: str,
1528        input: Optional[Any] = None,
1529        output: Optional[Any] = None,
1530        metadata: Optional[Any] = None,
1531        version: Optional[str] = None,
1532        level: Optional[SpanLevel] = None,
1533        status_message: Optional[str] = None,
1534    ) -> "LangfuseEvent":
1535        """Create a new Langfuse observation of type 'EVENT'.
1536
1537        Args:
1538            name: Name of the span (e.g., function or operation name)
1539            input: Input data for the operation (can be any JSON-serializable object)
1540            output: Output data from the operation (can be any JSON-serializable object)
1541            metadata: Additional metadata to associate with the span
1542            version: Version identifier for the code or component
1543            level: Importance level of the span (info, warning, error)
1544            status_message: Optional status message for the span
1545
1546        Returns:
1547            The LangfuseEvent object
1548
1549        Example:
1550            ```python
1551            event = langfuse.create_event(name="process-event")
1552            ```
1553        """
1554        timestamp = time_ns()
1555
1556        with otel_trace_api.use_span(self._otel_span):
1557            new_otel_span = self._langfuse_client._otel_tracer.start_span(
1558                name=name, start_time=timestamp
1559            )
1560
1561        return cast(
1562            "LangfuseEvent",
1563            LangfuseEvent(
1564                otel_span=new_otel_span,
1565                langfuse_client=self._langfuse_client,
1566                input=input,
1567                output=output,
1568                metadata=metadata,
1569                environment=self._environment,
1570                version=version,
1571                level=level,
1572                status_message=status_message,
1573            ).end(end_time=timestamp),
1574        )

Standard span implementation for general operations in Langfuse.

This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.
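
Instances are typically obtained from the Langfuse client rather than constructed directly; a minimal sketch, where load_profile is an illustrative helper:

from langfuse import get_client

langfuse = get_client()

span = langfuse.start_span(name="fetch-user-profile", input={"user_id": "user_123"})
try:
    profile = load_profile("user_123")  # illustrative application code
    span.update(output=profile)
finally:
    span.end()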

LangfuseSpan( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1166    def __init__(
1167        self,
1168        *,
1169        otel_span: otel_trace_api.Span,
1170        langfuse_client: "Langfuse",
1171        input: Optional[Any] = None,
1172        output: Optional[Any] = None,
1173        metadata: Optional[Any] = None,
1174        environment: Optional[str] = None,
1175        version: Optional[str] = None,
1176        level: Optional[SpanLevel] = None,
1177        status_message: Optional[str] = None,
1178    ):
1179        """Initialize a new LangfuseSpan.
1180
1181        Args:
1182            otel_span: The OpenTelemetry span to wrap
1183            langfuse_client: Reference to the parent Langfuse client
1184            input: Input data for the span (any JSON-serializable object)
1185            output: Output data from the span (any JSON-serializable object)
1186            metadata: Additional metadata to associate with the span
1187            environment: The tracing environment
1188            version: Version identifier for the code or component
1189            level: Importance level of the span (info, warning, error)
1190            status_message: Optional status message for the span
1191        """
1192        super().__init__(
1193            otel_span=otel_span,
1194            as_type="span",
1195            langfuse_client=langfuse_client,
1196            input=input,
1197            output=output,
1198            metadata=metadata,
1199            environment=environment,
1200            version=version,
1201            level=level,
1202            status_message=status_message,
1203        )

Initialize a new LangfuseSpan.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the span (any JSON-serializable object)
  • output: Output data from the span (any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • environment: The tracing environment
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
def start_span( self, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseSpan:
1205    def start_span(
1206        self,
1207        name: str,
1208        input: Optional[Any] = None,
1209        output: Optional[Any] = None,
1210        metadata: Optional[Any] = None,
1211        version: Optional[str] = None,
1212        level: Optional[SpanLevel] = None,
1213        status_message: Optional[str] = None,
1214    ) -> "LangfuseSpan":
1215        """Create a new child span.
1216
1217        This method creates a new child span with this span as the parent.
1218        Unlike start_as_current_span(), this method does not set the new span
1219        as the current span in the context.
1220
1221        Args:
1222            name: Name of the span (e.g., function or operation name)
1223            input: Input data for the operation
1224            output: Output data from the operation
1225            metadata: Additional metadata to associate with the span
1226            version: Version identifier for the code or component
1227            level: Importance level of the span (info, warning, error)
1228            status_message: Optional status message for the span
1229
1230        Returns:
1231            A new LangfuseSpan that must be ended with .end() when complete
1232
1233        Example:
1234            ```python
1235            parent_span = langfuse.start_span(name="process-request")
1236            try:
1237                # Create a child span
1238                child_span = parent_span.start_span(name="validate-input")
1239                try:
1240                    # Do validation work
1241                    validation_result = validate(request_data)
1242                    child_span.update(output=validation_result)
1243                finally:
1244                    child_span.end()
1245
1246                # Continue with parent span
1247                result = process_validated_data(validation_result)
1248                parent_span.update(output=result)
1249            finally:
1250                parent_span.end()
1251            ```
1252        """
1253        return self.start_observation(
1254            name=name,
1255            as_type="span",
1256            input=input,
1257            output=output,
1258            metadata=metadata,
1259            version=version,
1260            level=level,
1261            status_message=status_message,
1262        )

Create a new child span.

This method creates a new child span with this span as the parent. Unlike start_as_current_span(), this method does not set the new span as the current span in the context.

Arguments:
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
Returns:

A new LangfuseSpan that must be ended with .end() when complete

Example:
parent_span = langfuse.start_span(name="process-request")
try:
    # Create a child span
    child_span = parent_span.start_span(name="validate-input")
    try:
        # Do validation work
        validation_result = validate(request_data)
        child_span.update(output=validation_result)
    finally:
        child_span.end()

    # Continue with parent span
    result = process_validated_data(validation_result)
    parent_span.update(output=result)
finally:
    parent_span.end()
def start_as_current_span( self, *, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan]:
1264    def start_as_current_span(
1265        self,
1266        *,
1267        name: str,
1268        input: Optional[Any] = None,
1269        output: Optional[Any] = None,
1270        metadata: Optional[Any] = None,
1271        version: Optional[str] = None,
1272        level: Optional[SpanLevel] = None,
1273        status_message: Optional[str] = None,
1274    ) -> _AgnosticContextManager["LangfuseSpan"]:
1275        """[DEPRECATED] Create a new child span and set it as the current span in a context manager.
1276
1277        DEPRECATED: This method is deprecated and will be removed in a future version.
1278        Use start_as_current_observation(as_type='span') instead.
1279
1280        This method creates a new child span and sets it as the current span within
1281        a context manager. It should be used with a 'with' statement to automatically
1282        manage the span's lifecycle.
1283
1284        Args:
1285            name: Name of the span (e.g., function or operation name)
1286            input: Input data for the operation
1287            output: Output data from the operation
1288            metadata: Additional metadata to associate with the span
1289            version: Version identifier for the code or component
1290            level: Importance level of the span (info, warning, error)
1291            status_message: Optional status message for the span
1292
1293        Returns:
1294            A context manager that yields a new LangfuseSpan
1295
1296        Example:
1297            ```python
1298            with langfuse.start_as_current_span(name="process-request") as parent_span:
1299                # Parent span is active here
1300
1301                # Create a child span with context management
1302                with parent_span.start_as_current_span(name="validate-input") as child_span:
1303                    # Child span is active here
1304                    validation_result = validate(request_data)
1305                    child_span.update(output=validation_result)
1306
1307                # Back to parent span context
1308                result = process_validated_data(validation_result)
1309                parent_span.update(output=result)
1310            ```
1311        """
1312        warnings.warn(
1313            "start_as_current_span is deprecated and will be removed in a future version. "
1314            "Use start_as_current_observation(as_type='span') instead.",
1315            DeprecationWarning,
1316            stacklevel=2,
1317        )
1318        return self.start_as_current_observation(
1319            name=name,
1320            as_type="span",
1321            input=input,
1322            output=output,
1323            metadata=metadata,
1324            version=version,
1325            level=level,
1326            status_message=status_message,
1327        )

[DEPRECATED] Create a new child span and set it as the current span in a context manager.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_as_current_observation(as_type='span') instead.

This method creates a new child span and sets it as the current span within a context manager. It should be used with a 'with' statement to automatically manage the span's lifecycle.

Arguments:
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
Returns:

A context manager that yields a new LangfuseSpan

Example:
with langfuse.start_as_current_span(name="process-request") as parent_span:
    # Parent span is active here

    # Create a child span with context management
    with parent_span.start_as_current_span(name="validate-input") as child_span:
        # Child span is active here
        validation_result = validate(request_data)
        child_span.update(output=validation_result)

    # Back to parent span context
    result = process_validated_data(validation_result)
    parent_span.update(output=result)
def start_generation( self, *, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> LangfuseGeneration:
1329    def start_generation(
1330        self,
1331        *,
1332        name: str,
1333        input: Optional[Any] = None,
1334        output: Optional[Any] = None,
1335        metadata: Optional[Any] = None,
1336        version: Optional[str] = None,
1337        level: Optional[SpanLevel] = None,
1338        status_message: Optional[str] = None,
1339        completion_start_time: Optional[datetime] = None,
1340        model: Optional[str] = None,
1341        model_parameters: Optional[Dict[str, MapValue]] = None,
1342        usage_details: Optional[Dict[str, int]] = None,
1343        cost_details: Optional[Dict[str, float]] = None,
1344        prompt: Optional[PromptClient] = None,
1345    ) -> "LangfuseGeneration":
1346        """[DEPRECATED] Create a new child generation span.
1347
1348        DEPRECATED: This method is deprecated and will be removed in a future version.
1349        Use start_observation(as_type='generation') instead.
1350
1351        This method creates a new child generation span with this span as the parent.
1352        Generation spans are specialized for AI/LLM operations and include additional
1353        fields for model information, usage stats, and costs.
1354
1355        Unlike start_as_current_generation(), this method does not set the new span
1356        as the current span in the context.
1357
1358        Args:
1359            name: Name of the generation operation
1360            input: Input data for the model (e.g., prompts)
1361            output: Output from the model (e.g., completions)
1362            metadata: Additional metadata to associate with the generation
1363            version: Version identifier for the model or component
1364            level: Importance level of the generation (info, warning, error)
1365            status_message: Optional status message for the generation
1366            completion_start_time: When the model started generating the response
1367            model: Name/identifier of the AI model used (e.g., "gpt-4")
1368            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1369            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1370            cost_details: Cost information for the model call
1371            prompt: Associated prompt template from Langfuse prompt management
1372
1373        Returns:
1374            A new LangfuseGeneration that must be ended with .end() when complete
1375
1376        Example:
1377            ```python
1378            span = langfuse.start_span(name="process-query")
1379            try:
1380                # Create a generation child span
1381                generation = span.start_generation(
1382                    name="generate-answer",
1383                    model="gpt-4",
1384                    input={"prompt": "Explain quantum computing"}
1385                )
1386                try:
1387                    # Call model API
1388                    response = llm.generate(...)
1389
1390                    generation.update(
1391                        output=response.text,
1392                        usage_details={
1393                            "prompt_tokens": response.usage.prompt_tokens,
1394                            "completion_tokens": response.usage.completion_tokens
1395                        }
1396                    )
1397                finally:
1398                    generation.end()
1399
1400                # Continue with parent span
1401                span.update(output={"answer": response.text, "source": "gpt-4"})
1402            finally:
1403                span.end()
1404            ```
1405        """
1406        warnings.warn(
1407            "start_generation is deprecated and will be removed in a future version. "
1408            "Use start_observation(as_type='generation') instead.",
1409            DeprecationWarning,
1410            stacklevel=2,
1411        )
1412        return self.start_observation(
1413            name=name,
1414            as_type="generation",
1415            input=input,
1416            output=output,
1417            metadata=metadata,
1418            version=version,
1419            level=level,
1420            status_message=status_message,
1421            completion_start_time=completion_start_time,
1422            model=model,
1423            model_parameters=model_parameters,
1424            usage_details=usage_details,
1425            cost_details=cost_details,
1426            prompt=prompt,
1427        )

[DEPRECATED] Create a new child generation span.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_observation(as_type='generation') instead.

This method creates a new child generation span with this span as the parent. Generation spans are specialized for AI/LLM operations and include additional fields for model information, usage stats, and costs.

Unlike start_as_current_generation(), this method does not set the new span as the current span in the context.

Arguments:
  • name: Name of the generation operation
  • input: Input data for the model (e.g., prompts)
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (info, warning, error)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A new LangfuseGeneration that must be ended with .end() when complete

Example:
span = langfuse.start_span(name="process-query")
try:
    # Create a generation child span
    generation = span.start_generation(
        name="generate-answer",
        model="gpt-4",
        input={"prompt": "Explain quantum computing"}
    )
    try:
        # Call model API
        response = llm.generate(...)

        generation.update(
            output=response.text,
            usage_details={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens
            }
        )
    finally:
        generation.end()

    # Continue with parent span
    span.update(output={"answer": response.text, "source": "gpt-4"})
finally:
    span.end()
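
For new code, the deprecation notice points to start_observation(as_type='generation'). A minimal sketch of the non-deprecated equivalent; the model call is replaced by a placeholder string:

```python
from langfuse import get_client

langfuse = get_client()

span = langfuse.start_span(name="process-query")
try:
    # Recommended replacement for span.start_generation(...)
    generation = span.start_observation(
        name="generate-answer",
        as_type="generation",
        model="gpt-4",
        input={"prompt": "Explain quantum computing"},
    )
    try:
        answer = "Quantum computers use qubits..."  # placeholder for a real model call
        generation.update(output=answer, usage_details={"prompt_tokens": 12})
    finally:
        generation.end()
finally:
    span.end()
```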
def start_as_current_generation( self, *, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration]:
1429    def start_as_current_generation(
1430        self,
1431        *,
1432        name: str,
1433        input: Optional[Any] = None,
1434        output: Optional[Any] = None,
1435        metadata: Optional[Any] = None,
1436        version: Optional[str] = None,
1437        level: Optional[SpanLevel] = None,
1438        status_message: Optional[str] = None,
1439        completion_start_time: Optional[datetime] = None,
1440        model: Optional[str] = None,
1441        model_parameters: Optional[Dict[str, MapValue]] = None,
1442        usage_details: Optional[Dict[str, int]] = None,
1443        cost_details: Optional[Dict[str, float]] = None,
1444        prompt: Optional[PromptClient] = None,
1445    ) -> _AgnosticContextManager["LangfuseGeneration"]:
1446        """[DEPRECATED] Create a new child generation span and set it as the current span in a context manager.
1447
1448        DEPRECATED: This method is deprecated and will be removed in a future version.
1449        Use start_as_current_observation(as_type='generation') instead.
1450
1451        This method creates a new child generation span and sets it as the current span
1452        within a context manager. Generation spans are specialized for AI/LLM operations
1453        and include additional fields for model information, usage stats, and costs.
1454
1455        Args:
1456            name: Name of the generation operation
1457            input: Input data for the model (e.g., prompts)
1458            output: Output from the model (e.g., completions)
1459            metadata: Additional metadata to associate with the generation
1460            version: Version identifier for the model or component
1461            level: Importance level of the generation (info, warning, error)
1462            status_message: Optional status message for the generation
1463            completion_start_time: When the model started generating the response
1464            model: Name/identifier of the AI model used (e.g., "gpt-4")
1465            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1466            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1467            cost_details: Cost information for the model call
1468            prompt: Associated prompt template from Langfuse prompt management
1469
1470        Returns:
1471            A context manager that yields a new LangfuseGeneration
1472
1473        Example:
1474            ```python
1475            with langfuse.start_as_current_span(name="process-request") as span:
1476                # Prepare data
1477                query = preprocess_user_query(user_input)
1478
1479                # Create a generation span with context management
1480                with span.start_as_current_generation(
1481                    name="generate-answer",
1482                    model="gpt-4",
1483                    input={"query": query}
1484                ) as generation:
1485                    # Generation span is active here
1486                    response = llm.generate(query)
1487
1488                    # Update with results
1489                    generation.update(
1490                        output=response.text,
1491                        usage_details={
1492                            "prompt_tokens": response.usage.prompt_tokens,
1493                            "completion_tokens": response.usage.completion_tokens
1494                        }
1495                    )
1496
1497                # Back to parent span context
1498                span.update(output={"answer": response.text, "source": "gpt-4"})
1499            ```
1500        """
1501        warnings.warn(
1502            "start_as_current_generation is deprecated and will be removed in a future version. "
1503            "Use start_as_current_observation(as_type='generation') instead.",
1504            DeprecationWarning,
1505            stacklevel=2,
1506        )
1507        return self.start_as_current_observation(
1508            name=name,
1509            as_type="generation",
1510            input=input,
1511            output=output,
1512            metadata=metadata,
1513            version=version,
1514            level=level,
1515            status_message=status_message,
1516            completion_start_time=completion_start_time,
1517            model=model,
1518            model_parameters=model_parameters,
1519            usage_details=usage_details,
1520            cost_details=cost_details,
1521            prompt=prompt,
1522        )

[DEPRECATED] Create a new child generation span and set it as the current span in a context manager.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_as_current_observation(as_type='generation') instead.

This method creates a new child generation span and sets it as the current span within a context manager. Generation spans are specialized for AI/LLM operations and include additional fields for model information, usage stats, and costs.

Arguments:
  • name: Name of the generation operation
  • input: Input data for the model (e.g., prompts)
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (info, warning, error)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A context manager that yields a new LangfuseGeneration

Example:
with langfuse.start_as_current_span(name="process-request") as span:
    # Prepare data
    query = preprocess_user_query(user_input)

    # Create a generation span with context management
    with span.start_as_current_generation(
        name="generate-answer",
        model="gpt-4",
        input={"query": query}
    ) as generation:
        # Generation span is active here
        response = llm.generate(query)

        # Update with results
        generation.update(
            output=response.text,
            usage_details={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens
            }
        )

    # Back to parent span context
    span.update(output={"answer": response.text, "source": "gpt-4"})
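
The same flow with the recommended, non-deprecated start_as_current_observation(as_type='generation'); this sketch uses a placeholder response instead of a real LLM call:

```python
from langfuse import get_client

langfuse = get_client()

with langfuse.start_as_current_span(name="process-request") as span:
    # Recommended replacement for span.start_as_current_generation(...)
    with span.start_as_current_observation(
        name="generate-answer",
        as_type="generation",
        model="gpt-4",
        input={"query": "What is Langfuse?"},
    ) as generation:
        answer = "Langfuse is an LLM engineering platform."  # placeholder output
        generation.update(output=answer, usage_details={"completion_tokens": 9})

    span.update(output={"answer": answer, "source": "gpt-4"})
```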
def create_event( self, *, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseEvent:
1524    def create_event(
1525        self,
1526        *,
1527        name: str,
1528        input: Optional[Any] = None,
1529        output: Optional[Any] = None,
1530        metadata: Optional[Any] = None,
1531        version: Optional[str] = None,
1532        level: Optional[SpanLevel] = None,
1533        status_message: Optional[str] = None,
1534    ) -> "LangfuseEvent":
1535        """Create a new Langfuse observation of type 'EVENT'.
1536
1537        Args:
1538            name: Name of the span (e.g., function or operation name)
1539            input: Input data for the operation (can be any JSON-serializable object)
1540            output: Output data from the operation (can be any JSON-serializable object)
1541            metadata: Additional metadata to associate with the span
1542            version: Version identifier for the code or component
1543            level: Importance level of the span (info, warning, error)
1544            status_message: Optional status message for the span
1545
1546        Returns:
1547            The LangfuseEvent object
1548
1549        Example:
1550            ```python
1551            event = langfuse.create_event(name="process-event")
1552            ```
1553        """
1554        timestamp = time_ns()
1555
1556        with otel_trace_api.use_span(self._otel_span):
1557            new_otel_span = self._langfuse_client._otel_tracer.start_span(
1558                name=name, start_time=timestamp
1559            )
1560
1561        return cast(
1562            "LangfuseEvent",
1563            LangfuseEvent(
1564                otel_span=new_otel_span,
1565                langfuse_client=self._langfuse_client,
1566                input=input,
1567                output=output,
1568                metadata=metadata,
1569                environment=self._environment,
1570                version=version,
1571                level=level,
1572                status_message=status_message,
1573            ).end(end_time=timestamp),
1574        )

Create a new Langfuse observation of type 'EVENT'.

Arguments:
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
Returns:

The LangfuseEvent object

Example:
event = langfuse.create_event(name="process-event")
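
A slightly fuller sketch of create_event called on a span, with illustrative payloads (the field values are placeholders, not SDK requirements):

```python
from langfuse import get_client

langfuse = get_client()

with langfuse.start_as_current_span(name="handle-request") as span:
    # Events are point-in-time observations: they are ended at creation,
    # so there is no need to call .end() on the returned object.
    span.create_event(
        name="cache-miss",
        input={"key": "user:42"},      # illustrative payload
        metadata={"cache": "redis"},   # illustrative metadata
        level="DEBUG",
    )
```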
class LangfuseGeneration(langfuse._client.span.LangfuseObservationWrapper):
1577class LangfuseGeneration(LangfuseObservationWrapper):
1578    """Specialized span implementation for AI model generations in Langfuse.
1579
1580    This class represents a generation span specifically designed for tracking
1581    AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized
1582    attributes for model details, token usage, and costs.
1583    """
1584
1585    def __init__(
1586        self,
1587        *,
1588        otel_span: otel_trace_api.Span,
1589        langfuse_client: "Langfuse",
1590        input: Optional[Any] = None,
1591        output: Optional[Any] = None,
1592        metadata: Optional[Any] = None,
1593        environment: Optional[str] = None,
1594        version: Optional[str] = None,
1595        level: Optional[SpanLevel] = None,
1596        status_message: Optional[str] = None,
1597        completion_start_time: Optional[datetime] = None,
1598        model: Optional[str] = None,
1599        model_parameters: Optional[Dict[str, MapValue]] = None,
1600        usage_details: Optional[Dict[str, int]] = None,
1601        cost_details: Optional[Dict[str, float]] = None,
1602        prompt: Optional[PromptClient] = None,
1603    ):
1604        """Initialize a new LangfuseGeneration span.
1605
1606        Args:
1607            otel_span: The OpenTelemetry span to wrap
1608            langfuse_client: Reference to the parent Langfuse client
1609            input: Input data for the generation (e.g., prompts)
1610            output: Output from the generation (e.g., completions)
1611            metadata: Additional metadata to associate with the generation
1612            environment: The tracing environment
1613            version: Version identifier for the model or component
1614            level: Importance level of the generation (info, warning, error)
1615            status_message: Optional status message for the generation
1616            completion_start_time: When the model started generating the response
1617            model: Name/identifier of the AI model used (e.g., "gpt-4")
1618            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1619            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1620            cost_details: Cost information for the model call
1621            prompt: Associated prompt template from Langfuse prompt management
1622        """
1623        super().__init__(
1624            as_type="generation",
1625            otel_span=otel_span,
1626            langfuse_client=langfuse_client,
1627            input=input,
1628            output=output,
1629            metadata=metadata,
1630            environment=environment,
1631            version=version,
1632            level=level,
1633            status_message=status_message,
1634            completion_start_time=completion_start_time,
1635            model=model,
1636            model_parameters=model_parameters,
1637            usage_details=usage_details,
1638            cost_details=cost_details,
1639            prompt=prompt,
1640        )

Specialized span implementation for AI model generations in Langfuse.

This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.

LangfuseGeneration( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None)
1585    def __init__(
1586        self,
1587        *,
1588        otel_span: otel_trace_api.Span,
1589        langfuse_client: "Langfuse",
1590        input: Optional[Any] = None,
1591        output: Optional[Any] = None,
1592        metadata: Optional[Any] = None,
1593        environment: Optional[str] = None,
1594        version: Optional[str] = None,
1595        level: Optional[SpanLevel] = None,
1596        status_message: Optional[str] = None,
1597        completion_start_time: Optional[datetime] = None,
1598        model: Optional[str] = None,
1599        model_parameters: Optional[Dict[str, MapValue]] = None,
1600        usage_details: Optional[Dict[str, int]] = None,
1601        cost_details: Optional[Dict[str, float]] = None,
1602        prompt: Optional[PromptClient] = None,
1603    ):
1604        """Initialize a new LangfuseGeneration span.
1605
1606        Args:
1607            otel_span: The OpenTelemetry span to wrap
1608            langfuse_client: Reference to the parent Langfuse client
1609            input: Input data for the generation (e.g., prompts)
1610            output: Output from the generation (e.g., completions)
1611            metadata: Additional metadata to associate with the generation
1612            environment: The tracing environment
1613            version: Version identifier for the model or component
1614            level: Importance level of the generation (info, warning, error)
1615            status_message: Optional status message for the generation
1616            completion_start_time: When the model started generating the response
1617            model: Name/identifier of the AI model used (e.g., "gpt-4")
1618            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1619            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1620            cost_details: Cost information for the model call
1621            prompt: Associated prompt template from Langfuse prompt management
1622        """
1623        super().__init__(
1624            as_type="generation",
1625            otel_span=otel_span,
1626            langfuse_client=langfuse_client,
1627            input=input,
1628            output=output,
1629            metadata=metadata,
1630            environment=environment,
1631            version=version,
1632            level=level,
1633            status_message=status_message,
1634            completion_start_time=completion_start_time,
1635            model=model,
1636            model_parameters=model_parameters,
1637            usage_details=usage_details,
1638            cost_details=cost_details,
1639            prompt=prompt,
1640        )

Initialize a new LangfuseGeneration span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the generation (e.g., prompts)
  • output: Output from the generation (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • environment: The tracing environment
  • version: Version identifier for the model or component
  • level: Importance level of the generation (info, warning, error)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
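
LangfuseGeneration objects are normally not constructed directly; they are returned by the observation helpers when as_type="generation" is used. A minimal sketch (the output values are placeholders, and the isinstance check reflects the typed signatures shown above):

```python
from langfuse import LangfuseGeneration, get_client

langfuse = get_client()

with langfuse.start_as_current_span(name="pipeline") as span:
    generation = span.start_observation(
        name="summarize", as_type="generation", model="gpt-4"
    )
    try:
        # The helper returns this wrapper class rather than a plain span
        assert isinstance(generation, LangfuseGeneration)
        generation.update(output="A short summary.")  # placeholder output
    finally:
        generation.end()
```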
class LangfuseEvent(langfuse._client.span.LangfuseObservationWrapper):
1643class LangfuseEvent(LangfuseObservationWrapper):
1644    """Specialized span implementation for Langfuse Events."""
1645
1646    def __init__(
1647        self,
1648        *,
1649        otel_span: otel_trace_api.Span,
1650        langfuse_client: "Langfuse",
1651        input: Optional[Any] = None,
1652        output: Optional[Any] = None,
1653        metadata: Optional[Any] = None,
1654        environment: Optional[str] = None,
1655        version: Optional[str] = None,
1656        level: Optional[SpanLevel] = None,
1657        status_message: Optional[str] = None,
1658    ):
1659        """Initialize a new LangfuseEvent span.
1660
1661        Args:
1662            otel_span: The OpenTelemetry span to wrap
1663            langfuse_client: Reference to the parent Langfuse client
1664            input: Input data for the event
1665            output: Output from the event
1666            metadata: Additional metadata to associate with the generation
1667            environment: The tracing environment
1668            version: Version identifier for the model or component
1669            level: Importance level of the generation (info, warning, error)
1670            status_message: Optional status message for the generation
1671        """
1672        super().__init__(
1673            otel_span=otel_span,
1674            as_type="event",
1675            langfuse_client=langfuse_client,
1676            input=input,
1677            output=output,
1678            metadata=metadata,
1679            environment=environment,
1680            version=version,
1681            level=level,
1682            status_message=status_message,
1683        )
1684
1685    def update(
1686        self,
1687        *,
1688        name: Optional[str] = None,
1689        input: Optional[Any] = None,
1690        output: Optional[Any] = None,
1691        metadata: Optional[Any] = None,
1692        version: Optional[str] = None,
1693        level: Optional[SpanLevel] = None,
1694        status_message: Optional[str] = None,
1695        completion_start_time: Optional[datetime] = None,
1696        model: Optional[str] = None,
1697        model_parameters: Optional[Dict[str, MapValue]] = None,
1698        usage_details: Optional[Dict[str, int]] = None,
1699        cost_details: Optional[Dict[str, float]] = None,
1700        prompt: Optional[PromptClient] = None,
1701        **kwargs: Any,
1702    ) -> "LangfuseEvent":
1703        """Update is not allowed for LangfuseEvent because events cannot be updated.
1704
1705        This method logs a warning and returns self without making changes.
1706
1707        Returns:
1708            self: Returns the unchanged LangfuseEvent instance
1709        """
1710        langfuse_logger.warning(
1711            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1712        )
1713        return self

Specialized span implementation for Langfuse Events.

LangfuseEvent( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1646    def __init__(
1647        self,
1648        *,
1649        otel_span: otel_trace_api.Span,
1650        langfuse_client: "Langfuse",
1651        input: Optional[Any] = None,
1652        output: Optional[Any] = None,
1653        metadata: Optional[Any] = None,
1654        environment: Optional[str] = None,
1655        version: Optional[str] = None,
1656        level: Optional[SpanLevel] = None,
1657        status_message: Optional[str] = None,
1658    ):
1659        """Initialize a new LangfuseEvent span.
1660
1661        Args:
1662            otel_span: The OpenTelemetry span to wrap
1663            langfuse_client: Reference to the parent Langfuse client
1664            input: Input data for the event
1665            output: Output from the event
1666            metadata: Additional metadata to associate with the generation
1667            environment: The tracing environment
1668            version: Version identifier for the model or component
1669            level: Importance level of the generation (info, warning, error)
1670            status_message: Optional status message for the generation
1671        """
1672        super().__init__(
1673            otel_span=otel_span,
1674            as_type="event",
1675            langfuse_client=langfuse_client,
1676            input=input,
1677            output=output,
1678            metadata=metadata,
1679            environment=environment,
1680            version=version,
1681            level=level,
1682            status_message=status_message,
1683        )

Initialize a new LangfuseEvent span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the event
  • output: Output from the event
  • metadata: Additional metadata to associate with the event
  • environment: The tracing environment
  • version: Version identifier for the code or component
  • level: Importance level of the event (info, warning, error)
  • status_message: Optional status message for the event
def update( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, **kwargs: Any) -> LangfuseEvent:
1685    def update(
1686        self,
1687        *,
1688        name: Optional[str] = None,
1689        input: Optional[Any] = None,
1690        output: Optional[Any] = None,
1691        metadata: Optional[Any] = None,
1692        version: Optional[str] = None,
1693        level: Optional[SpanLevel] = None,
1694        status_message: Optional[str] = None,
1695        completion_start_time: Optional[datetime] = None,
1696        model: Optional[str] = None,
1697        model_parameters: Optional[Dict[str, MapValue]] = None,
1698        usage_details: Optional[Dict[str, int]] = None,
1699        cost_details: Optional[Dict[str, float]] = None,
1700        prompt: Optional[PromptClient] = None,
1701        **kwargs: Any,
1702    ) -> "LangfuseEvent":
1703        """Update is not allowed for LangfuseEvent because events cannot be updated.
1704
1705        This method logs a warning and returns self without making changes.
1706
1707        Returns:
1708            self: Returns the unchanged LangfuseEvent instance
1709        """
1710        langfuse_logger.warning(
1711            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1712        )
1713        return self

Update is not allowed for LangfuseEvent because events cannot be updated.

This method logs a warning and returns self without making changes.

Returns:

self: Returns the unchanged LangfuseEvent instance
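
A small sketch illustrating the no-op behavior described above (payload values are placeholders):

```python
from langfuse import get_client

langfuse = get_client()

with langfuse.start_as_current_span(name="job") as span:
    event = span.create_event(name="checkpoint", input={"step": 1})

    # Events are immutable: update() only logs a warning and
    # returns the same, unchanged instance.
    same_event = event.update(output={"ignored": True})
    assert same_event is event
```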

class LangfuseOtelSpanAttributes:
27class LangfuseOtelSpanAttributes:
28    # Langfuse-Trace attributes
29    TRACE_NAME = "langfuse.trace.name"
30    TRACE_USER_ID = "user.id"
31    TRACE_SESSION_ID = "session.id"
32    TRACE_TAGS = "langfuse.trace.tags"
33    TRACE_PUBLIC = "langfuse.trace.public"
34    TRACE_METADATA = "langfuse.trace.metadata"
35    TRACE_INPUT = "langfuse.trace.input"
36    TRACE_OUTPUT = "langfuse.trace.output"
37
38    # Langfuse-observation attributes
39    OBSERVATION_TYPE = "langfuse.observation.type"
40    OBSERVATION_METADATA = "langfuse.observation.metadata"
41    OBSERVATION_LEVEL = "langfuse.observation.level"
42    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
43    OBSERVATION_INPUT = "langfuse.observation.input"
44    OBSERVATION_OUTPUT = "langfuse.observation.output"
45
46    # Langfuse-observation of type Generation attributes
47    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
48    OBSERVATION_MODEL = "langfuse.observation.model.name"
49    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
50    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
51    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
52    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
53    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"
54
55    # General
56    ENVIRONMENT = "langfuse.environment"
57    RELEASE = "langfuse.release"
58    VERSION = "langfuse.version"
59
60    # Internal
61    AS_ROOT = "langfuse.internal.as_root"
62
63    # Experiments
64    EXPERIMENT_ID = "langfuse.experiment.id"
65    EXPERIMENT_NAME = "langfuse.experiment.name"
66    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
67    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
68    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
69    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
70    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
71    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
72    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
TRACE_NAME = 'langfuse.trace.name'
TRACE_USER_ID = 'user.id'
TRACE_SESSION_ID = 'session.id'
TRACE_TAGS = 'langfuse.trace.tags'
TRACE_PUBLIC = 'langfuse.trace.public'
TRACE_METADATA = 'langfuse.trace.metadata'
TRACE_INPUT = 'langfuse.trace.input'
TRACE_OUTPUT = 'langfuse.trace.output'
OBSERVATION_TYPE = 'langfuse.observation.type'
OBSERVATION_METADATA = 'langfuse.observation.metadata'
OBSERVATION_LEVEL = 'langfuse.observation.level'
OBSERVATION_STATUS_MESSAGE = 'langfuse.observation.status_message'
OBSERVATION_INPUT = 'langfuse.observation.input'
OBSERVATION_OUTPUT = 'langfuse.observation.output'
OBSERVATION_COMPLETION_START_TIME = 'langfuse.observation.completion_start_time'
OBSERVATION_MODEL = 'langfuse.observation.model.name'
OBSERVATION_MODEL_PARAMETERS = 'langfuse.observation.model.parameters'
OBSERVATION_USAGE_DETAILS = 'langfuse.observation.usage_details'
OBSERVATION_COST_DETAILS = 'langfuse.observation.cost_details'
OBSERVATION_PROMPT_NAME = 'langfuse.observation.prompt.name'
OBSERVATION_PROMPT_VERSION = 'langfuse.observation.prompt.version'
ENVIRONMENT = 'langfuse.environment'
RELEASE = 'langfuse.release'
VERSION = 'langfuse.version'
AS_ROOT = 'langfuse.internal.as_root'
EXPERIMENT_ID = 'langfuse.experiment.id'
EXPERIMENT_NAME = 'langfuse.experiment.name'
EXPERIMENT_DESCRIPTION = 'langfuse.experiment.description'
EXPERIMENT_METADATA = 'langfuse.experiment.metadata'
EXPERIMENT_DATASET_ID = 'langfuse.experiment.dataset.id'
EXPERIMENT_ITEM_ID = 'langfuse.experiment.item.id'
EXPERIMENT_ITEM_EXPECTED_OUTPUT = 'langfuse.experiment.item.expected_output'
EXPERIMENT_ITEM_METADATA = 'langfuse.experiment.item.metadata'
EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = 'langfuse.experiment.item.root_observation_id'
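
These constants are the raw OpenTelemetry attribute keys the SDK writes onto spans. As a sketch, they can be read back from exported spans, for example inside a custom span processor (the processor wiring is omitted, and the "span" fallback is an assumption for illustration):

```python
from opentelemetry.sdk.trace import ReadableSpan

from langfuse import LangfuseOtelSpanAttributes


def observation_type_of(span: ReadableSpan) -> str:
    """Return the Langfuse observation type recorded on an exported span."""
    attributes = span.attributes or {}
    # Falls back to "span" if the attribute was never set (assumption).
    return str(attributes.get(LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"))
```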
class LangfuseAgent(langfuse._client.span.LangfuseObservationWrapper):
1716class LangfuseAgent(LangfuseObservationWrapper):
1717    """Agent observation for reasoning blocks that act on tools using LLM guidance."""
1718
1719    def __init__(self, **kwargs: Any) -> None:
1720        """Initialize a new LangfuseAgent span."""
1721        kwargs["as_type"] = "agent"
1722        super().__init__(**kwargs)

Agent observation for reasoning blocks that act on tools using LLM guidance.

LangfuseAgent(**kwargs: Any)
1719    def __init__(self, **kwargs: Any) -> None:
1720        """Initialize a new LangfuseAgent span."""
1721        kwargs["as_type"] = "agent"
1722        super().__init__(**kwargs)

Initialize a new LangfuseAgent span.

class LangfuseTool(langfuse._client.span.LangfuseObservationWrapper):
1725class LangfuseTool(LangfuseObservationWrapper):
1726    """Tool observation representing external tool calls, e.g., calling a weather API."""
1727
1728    def __init__(self, **kwargs: Any) -> None:
1729        """Initialize a new LangfuseTool span."""
1730        kwargs["as_type"] = "tool"
1731        super().__init__(**kwargs)

Tool observation representing external tool calls, e.g., calling a weather API.

LangfuseTool(**kwargs: Any)
1728    def __init__(self, **kwargs: Any) -> None:
1729        """Initialize a new LangfuseTool span."""
1730        kwargs["as_type"] = "tool"
1731        super().__init__(**kwargs)

Initialize a new LangfuseTool span.

class LangfuseChain(langfuse._client.span.LangfuseObservationWrapper):
1734class LangfuseChain(LangfuseObservationWrapper):
1735    """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM."""
1736
1737    def __init__(self, **kwargs: Any) -> None:
1738        """Initialize a new LangfuseChain span."""
1739        kwargs["as_type"] = "chain"
1740        super().__init__(**kwargs)

Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.

LangfuseChain(**kwargs: Any)
1737    def __init__(self, **kwargs: Any) -> None:
1738        """Initialize a new LangfuseChain span."""
1739        kwargs["as_type"] = "chain"
1740        super().__init__(**kwargs)

Initialize a new LangfuseChain span.

class LangfuseEmbedding(langfuse._client.span.LangfuseObservationWrapper):
1752class LangfuseEmbedding(LangfuseObservationWrapper):
1753    """Embedding observation for LLM embedding calls, typically used before retrieval."""
1754
1755    def __init__(self, **kwargs: Any) -> None:
1756        """Initialize a new LangfuseEmbedding span."""
1757        kwargs["as_type"] = "embedding"
1758        super().__init__(**kwargs)

Embedding observation for LLM embedding calls, typically used before retrieval.

LangfuseEmbedding(**kwargs: Any)
1755    def __init__(self, **kwargs: Any) -> None:
1756        """Initialize a new LangfuseEmbedding span."""
1757        kwargs["as_type"] = "embedding"
1758        super().__init__(**kwargs)

Initialize a new LangfuseEmbedding span.

class LangfuseEvaluator(langfuse._client.span.LangfuseObservationWrapper):
1761class LangfuseEvaluator(LangfuseObservationWrapper):
1762    """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs."""
1763
1764    def __init__(self, **kwargs: Any) -> None:
1765        """Initialize a new LangfuseEvaluator span."""
1766        kwargs["as_type"] = "evaluator"
1767        super().__init__(**kwargs)

Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.

LangfuseEvaluator(**kwargs: Any)
1764    def __init__(self, **kwargs: Any) -> None:
1765        """Initialize a new LangfuseEvaluator span."""
1766        kwargs["as_type"] = "evaluator"
1767        super().__init__(**kwargs)

Initialize a new LangfuseEvaluator span.

class LangfuseRetriever(langfuse._client.span.LangfuseObservationWrapper):
1743class LangfuseRetriever(LangfuseObservationWrapper):
1744    """Retriever observation for data retrieval steps, e.g. vector store or database queries."""
1745
1746    def __init__(self, **kwargs: Any) -> None:
1747        """Initialize a new LangfuseRetriever span."""
1748        kwargs["as_type"] = "retriever"
1749        super().__init__(**kwargs)

Retriever observation for data retrieval steps, e.g. vector store or database queries.

LangfuseRetriever(**kwargs: Any)
1746    def __init__(self, **kwargs: Any) -> None:
1747        """Initialize a new LangfuseRetriever span."""
1748        kwargs["as_type"] = "retriever"
1749        super().__init__(**kwargs)

Initialize a new LangfuseRetriever span.

class LangfuseGuardrail(langfuse._client.span.LangfuseObservationWrapper):
1770class LangfuseGuardrail(LangfuseObservationWrapper):
1771    """Guardrail observation for protection e.g. against jailbreaks or offensive content."""
1772
1773    def __init__(self, **kwargs: Any) -> None:
1774        """Initialize a new LangfuseGuardrail span."""
1775        kwargs["as_type"] = "guardrail"
1776        super().__init__(**kwargs)

Guardrail observation for protection e.g. against jailbreaks or offensive content.

LangfuseGuardrail(**kwargs: Any)
1773    def __init__(self, **kwargs: Any) -> None:
1774        """Initialize a new LangfuseGuardrail span."""
1775        kwargs["as_type"] = "guardrail"
1776        super().__init__(**kwargs)

Initialize a new LangfuseGuardrail span.
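
The typed wrappers above (agent, tool, chain, retriever, embedding, evaluator, guardrail) are normally not instantiated directly; they are returned when the matching as_type value is passed to the observation helpers. A minimal sketch, assuming the nested as_type values are accepted as shown (payloads are placeholders):

```python
from langfuse import get_client

langfuse = get_client()

with langfuse.start_as_current_span(name="agent-run") as root:
    # as_type selects which wrapper class the helper returns
    with root.start_as_current_observation(name="plan", as_type="agent") as agent:
        with agent.start_as_current_observation(
            name="get-weather", as_type="tool"
        ) as tool:
            tool.update(output={"temp_c": 21})  # illustrative payload
```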

class Evaluation:
 97class Evaluation:
 98    """Represents an evaluation result for an experiment item or an entire experiment run.
 99
100    This class provides a strongly-typed way to create evaluation results in evaluator functions.
101    Users must use keyword arguments when instantiating this class.
102
103    Attributes:
104        name: Unique identifier for the evaluation metric. Should be descriptive
105            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
106            Used for aggregation and comparison across experiment runs.
107        value: The evaluation score or result. Can be:
108            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
109            - String: For categorical results like "positive", "negative", "neutral"
110            - Boolean: For binary assessments like "passes_safety_check"
111        comment: Optional human-readable explanation of the evaluation result.
112            Useful for providing context, explaining scoring rationale, or noting
113            special conditions. Displayed in Langfuse UI for interpretability.
114        metadata: Optional structured metadata about the evaluation process.
115            Can include confidence scores, intermediate calculations, model versions,
116            or any other relevant technical details.
117        data_type: Optional score data type. Required if value is not NUMERIC.
118            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
119        config_id: Optional Langfuse score config ID.
120
121    Examples:
122        Basic accuracy evaluation:
123        ```python
124        from langfuse import Evaluation
125
126        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
127            if not expected_output:
128                return Evaluation(name="accuracy", value=None, comment="No expected output")
129
130            is_correct = output.strip().lower() == expected_output.strip().lower()
131            return Evaluation(
132                name="accuracy",
133                value=1.0 if is_correct else 0.0,
134                comment="Correct answer" if is_correct else "Incorrect answer"
135            )
136        ```
137
138        Multi-metric evaluator:
139        ```python
140        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
141            return [
142                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
143                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
144                Evaluation(
145                    name="quality",
146                    value=0.85,
147                    comment="High quality response",
148                    metadata={"confidence": 0.92, "model": "gpt-4"}
149                )
150            ]
151        ```
152
153        Categorical evaluation:
154        ```python
155        def sentiment_evaluator(*, input, output, **kwargs):
156            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
157            return Evaluation(
158                name="sentiment",
159                value=sentiment,
160                comment=f"Response expresses {sentiment} sentiment",
161                data_type="CATEGORICAL"
162            )
163        ```
164
165        Failed evaluation with error handling:
166        ```python
167        def external_api_evaluator(*, input, output, **kwargs):
168            try:
169                score = external_api.evaluate(output)
170                return Evaluation(name="external_score", value=score)
171            except Exception as e:
172                return Evaluation(
173                    name="external_score",
174                    value=None,
175                    comment=f"API unavailable: {e}",
176                    metadata={"error": str(e), "retry_count": 3}
177                )
178        ```
179
180    Note:
181        All arguments must be passed as keywords. Positional arguments are not allowed
182        to ensure code clarity and prevent errors from argument reordering.
183    """
184
185    def __init__(
186        self,
187        *,
188        name: str,
189        value: Union[int, float, str, bool],
190        comment: Optional[str] = None,
191        metadata: Optional[Dict[str, Any]] = None,
192        data_type: Optional[ScoreDataType] = None,
193        config_id: Optional[str] = None,
194    ):
195        """Initialize an Evaluation with the provided data.
196
197        Args:
198            name: Unique identifier for the evaluation metric.
199            value: The evaluation score or result.
200            comment: Optional human-readable explanation of the result.
201            metadata: Optional structured metadata about the evaluation process.
202            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
203            config_id: Optional Langfuse score config ID.
204
205        Note:
206            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
207        """
208        self.name = name
209        self.value = value
210        self.comment = comment
211        self.metadata = metadata
212        self.data_type = data_type
213        self.config_id = config_id

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=None, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=None,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[langfuse.api.ScoreDataType] = None, config_id: Optional[str] = None)
185    def __init__(
186        self,
187        *,
188        name: str,
189        value: Union[int, float, str, bool],
190        comment: Optional[str] = None,
191        metadata: Optional[Dict[str, Any]] = None,
192        data_type: Optional[ScoreDataType] = None,
193        config_id: Optional[str] = None,
194    ):
195        """Initialize an Evaluation with the provided data.
196
197        Args:
198            name: Unique identifier for the evaluation metric.
199            value: The evaluation score or result.
200            comment: Optional human-readable explanation of the result.
201            metadata: Optional structured metadata about the evaluation process.
202            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
203            config_id: Optional Langfuse score config ID.
204
205        Note:
206            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
207        """
208        self.name = name
209        self.value = value
210        self.comment = comment
211        self.metadata = metadata
212        self.data_type = data_type
213        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
class EvaluatorInputs:
 38class EvaluatorInputs:
 39    """Input data structure for evaluators, returned by mapper functions.
 40
 41    This class provides a strongly-typed container for transforming API response
 42    objects (traces, observations) into the standardized format expected
 43    by evaluator functions. It ensures consistent access to input, output, expected
 44    output, and metadata regardless of the source entity type.
 45
 46    Attributes:
 47        input: The input data that was provided to generate the output being evaluated.
 48            For traces, this might be the initial prompt or request. For observations,
 49            this could be the span's input. The exact meaning depends on your use case.
 50        output: The actual output that was produced and needs to be evaluated.
 51            For traces, this is typically the final response. For observations,
 52            this might be the generation output or span result.
 53        expected_output: Optional ground truth or expected result for comparison.
 54            Used by evaluators to assess correctness. May be None if no ground truth
 55            is available for the entity being evaluated.
 56        metadata: Optional structured metadata providing additional context for evaluation.
 57            Can include information about the entity, execution context, user attributes,
 58            or any other relevant data that evaluators might use.
 59
 60    Examples:
 61        Simple mapper for traces:
 62        ```python
 63        from langfuse import EvaluatorInputs
 64
 65        def trace_mapper(trace):
 66            return EvaluatorInputs(
 67                input=trace.input,
 68                output=trace.output,
 69                expected_output=None,  # No ground truth available
 70                metadata={"user_id": trace.user_id, "tags": trace.tags}
 71            )
 72        ```
 73
 74        Mapper for observations extracting specific fields:
 75        ```python
 76        def observation_mapper(observation):
 77            # Extract input/output from observation's data
 78            input_data = observation.input if hasattr(observation, 'input') else None
 79            output_data = observation.output if hasattr(observation, 'output') else None
 80
 81            return EvaluatorInputs(
 82                input=input_data,
 83                output=output_data,
 84                expected_output=None,
 85                metadata={
 86                    "observation_type": observation.type,
 87                    "model": observation.model,
 88                    "latency_ms": observation.end_time - observation.start_time
 89                }
 90            )
 91        ```
 92
 93
 94    Note:
 95        All arguments must be passed as keywords when instantiating this class.
 96    """
 97
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Input data structure for evaluators, returned by mapper functions.

This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.

Attributes:
  • input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
  • output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
  • expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
  • metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:

Simple mapper for traces:

from langfuse import EvaluatorInputs

def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,  # No ground truth available
        metadata={"user_id": trace.user_id, "tags": trace.tags}
    )

Mapper for observations extracting specific fields:

def observation_mapper(observation):
    # Extract input/output from observation's data
    input_data = observation.input if hasattr(observation, 'input') else None
    output_data = observation.output if hasattr(observation, 'output') else None

    return EvaluatorInputs(
        input=input_data,
        output=output_data,
        expected_output=None,
        metadata={
            "observation_type": observation.type,
            "model": observation.model,
            "latency_ms": observation.end_time - observation.start_time
        }
    )


Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorInputs( *, input: Any, output: Any, expected_output: Any = None, metadata: Optional[Dict[str, Any]] = None)
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Initialize EvaluatorInputs with the provided data.

Arguments:
  • input: The input data for evaluation.
  • output: The output data to be evaluated.
  • expected_output: Optional ground truth for comparison.
  • metadata: Optional additional context for evaluation.
Note:

All arguments must be provided as keywords.

input
output
expected_output
metadata
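A minimal construction sketch for EvaluatorInputs; the field values below are illustrative only, and any JSON-serializable data can be passed:

```python
from langfuse import EvaluatorInputs

inputs = EvaluatorInputs(
    input={"question": "What is the capital of France?"},  # illustrative input
    output="Paris",
    expected_output="Paris",
    metadata={"source": "manual-test"},
)

print(inputs.output, inputs.expected_output)  # -> Paris Paris
```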
class MapperFunction(typing.Protocol):
123class MapperFunction(Protocol):
124    """Protocol defining the interface for mapper functions in batch evaluation.
125
126    Mapper functions transform API response objects (traces or observations)
127    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
128    allows you to define how to extract and structure evaluation data from different
129    entity types.
130
131    Mapper functions:
132    - Must accept a single item parameter (a trace or an observation)
133    - Must return an EvaluatorInputs instance with input, output, expected_output, and metadata
134    - Can be either synchronous or asynchronous
135    - Should handle missing or malformed data gracefully
136    """
137
138    def __call__(
139        self,
140        *,
141        item: Union["TraceWithFullDetails", "ObservationsView"],
142        **kwargs: Dict[str, Any],
143    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
144        """Transform an API response object into evaluator inputs.
145
146        This method defines how to extract evaluation-relevant data from the raw
147        API response object. The implementation should map entity-specific fields
148        to the standardized input/output/expected_output/metadata structure.
149
150        Args:
151            item: The API response object to transform. The type depends on the scope:
152                - TraceWithFullDetails: When evaluating traces
153                - ObservationsView: When evaluating observations
154
155        Returns:
156            EvaluatorInputs: A structured container with:
157                - input: The input data that generated the output
158                - output: The output to be evaluated
159                - expected_output: Optional ground truth for comparison
160                - metadata: Optional additional context
161
162            Can return either a direct EvaluatorInputs instance or an awaitable
163            (for async mappers that need to fetch additional data).
164
165        Examples:
166            Basic trace mapper:
167            ```python
168            def map_trace(trace):
169                return EvaluatorInputs(
170                    input=trace.input,
171                    output=trace.output,
172                    expected_output=None,
173                    metadata={"trace_id": trace.id, "user": trace.user_id}
174                )
175            ```
176
177            Observation mapper with conditional logic:
178            ```python
179            def map_observation(observation):
180                # Extract fields based on observation type
181                if observation.type == "GENERATION":
182                    input_data = observation.input
183                    output_data = observation.output
184                else:
185                    # For other types, use different fields
186                    input_data = observation.metadata.get("input")
187                    output_data = observation.metadata.get("output")
188
189                return EvaluatorInputs(
190                    input=input_data,
191                    output=output_data,
192                    expected_output=None,
193                    metadata={"obs_id": observation.id, "type": observation.type}
194                )
195            ```
196
197            Async mapper (if additional processing needed):
198            ```python
199            async def map_trace_async(trace):
200                # Could do async processing here if needed
201                processed_output = await some_async_transformation(trace.output)
202
203                return EvaluatorInputs(
204                    input=trace.input,
205                    output=processed_output,
206                    expected_output=None,
207                    metadata={"trace_id": trace.id}
208                )
209            ```
210        """
211        ...

Protocol defining the interface for mapper functions in batch evaluation.

Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.

Mapper functions:

  • Must accept a single item parameter (a trace or an observation)
  • Must return an EvaluatorInputs instance with input, output, expected_output, and metadata
  • Can be either synchronous or asynchronous
  • Should handle missing or malformed data gracefully (see the minimal sketch after this list)
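A minimal sketch of a conforming mapper, assuming the keyword-only item parameter from the __call__ signature below (the docstring examples also show a single positional parameter); the getattr guards are illustrative defensive handling for missing fields:

```python
from langfuse import EvaluatorInputs

def simple_mapper(*, item, **kwargs):
    # item is a TraceWithFullDetails or ObservationsView, depending on scope
    return EvaluatorInputs(
        input=getattr(item, "input", None),
        output=getattr(item, "output", None),
        expected_output=None,  # no ground truth available for live traces
        metadata={"id": getattr(item, "id", None)},
    )
```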
MapperFunction(*args, **kwargs)
class CompositeEvaluatorFunction(typing.Protocol):
214class CompositeEvaluatorFunction(Protocol):
215    """Protocol defining the interface for composite evaluator functions.
216
217    Composite evaluators create aggregate scores from multiple item-level evaluations.
218    This is commonly used to compute weighted averages, combined metrics, or other
219    composite assessments based on individual evaluation results.
220
221    Composite evaluators:
222    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
223      plus the list of evaluations
224    - Return either a single Evaluation, a list of Evaluations, or a dict
225    - Can be either synchronous or asynchronous
226    - Have access to both raw item data and evaluation results
227    """
228
229    def __call__(
230        self,
231        *,
232        input: Optional[Any] = None,
233        output: Optional[Any] = None,
234        expected_output: Optional[Any] = None,
235        metadata: Optional[Dict[str, Any]] = None,
236        evaluations: List[Evaluation],
237        **kwargs: Dict[str, Any],
238    ) -> Union[
239        Evaluation,
240        List[Evaluation],
241        Dict[str, Any],
242        Awaitable[Evaluation],
243        Awaitable[List[Evaluation]],
244        Awaitable[Dict[str, Any]],
245    ]:
246        r"""Create a composite evaluation from item-level evaluation results.
247
248        This method combines multiple evaluation scores into a single composite metric.
249        Common use cases include weighted averages, pass/fail decisions based on multiple
250        criteria, or custom scoring logic that considers multiple dimensions.
251
252        Args:
253            input: The input data that was provided to the system being evaluated.
254            output: The output generated by the system being evaluated.
255            expected_output: The expected/reference output for comparison (if available).
256            metadata: Additional metadata about the evaluation context.
257            evaluations: List of evaluation results from item-level evaluators.
258                Each evaluation contains name, value, comment, and metadata.
259
260        Returns:
261            Can return any of:
262            - Evaluation: A single composite evaluation result
263            - List[Evaluation]: Multiple composite evaluations
264            - Dict: A dict that will be converted to an Evaluation
265                - name: Identifier for the composite metric (e.g., "composite_score")
266                - value: The computed composite value
267                - comment: Optional explanation of how the score was computed
268                - metadata: Optional details about the composition logic
269
270            Can return either a direct Evaluation instance or an awaitable
271            (for async composite evaluators).
272
273        Examples:
274            Simple weighted average:
275            ```python
276            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
277                weights = {
278                    "accuracy": 0.5,
279                    "relevance": 0.3,
280                    "safety": 0.2
281                }
282
283                total_score = 0.0
284                total_weight = 0.0
285
286                for eval in evaluations:
287                    if eval.name in weights and isinstance(eval.value, (int, float)):
288                        total_score += eval.value * weights[eval.name]
289                        total_weight += weights[eval.name]
290
291                final_score = total_score / total_weight if total_weight > 0 else 0.0
292
293                return Evaluation(
294                    name="composite_score",
295                    value=final_score,
296                    comment=f"Weighted average of {len(evaluations)} metrics"
297                )
298            ```
299
300            Pass/fail composite based on thresholds:
301            ```python
302            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
303                # Must pass all criteria
304                thresholds = {
305                    "accuracy": 0.7,
306                    "safety": 0.9,
307                    "relevance": 0.6
308                }
309
310                passes = True
311                failing_metrics = []
312
313                for metric, threshold in thresholds.items():
314                    eval_result = next((e for e in evaluations if e.name == metric), None)
315                    if eval_result and isinstance(eval_result.value, (int, float)):
316                        if eval_result.value < threshold:
317                            passes = False
318                            failing_metrics.append(metric)
319
320                return Evaluation(
321                    name="passes_all_checks",
322                    value=passes,
323                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
324                    data_type="BOOLEAN"
325                )
326            ```
327
328            Async composite with external scoring:
329            ```python
330            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
331                # Use LLM to synthesize multiple evaluation results
332                eval_summary = "\n".join(
333                    f"- {e.name}: {e.value}" for e in evaluations
334                )
335
336                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
337                prompt += f"For the output: {output}\n"
338                prompt += "Provide an overall quality score from 0-1."
339
340                response = await openai.chat.completions.create(
341                    model="gpt-4",
342                    messages=[{"role": "user", "content": prompt}]
343                )
344
345                score = float(response.choices[0].message.content.strip())
346
347                return Evaluation(
348                    name="llm_composite_score",
349                    value=score,
350                    comment="LLM-synthesized composite score"
351                )
352            ```
353
354            Context-aware composite:
355            ```python
356            def context_composite(*, input, output, expected_output, metadata, evaluations):
357                # Adjust weighting based on metadata
358                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}
359
360                # If metadata indicates high importance, prioritize accuracy
361                if metadata and metadata.get('importance') == 'high':
362                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
363                else:
364                    weights = base_weights
365
366                total = sum(
367                    e.value * weights.get(e.name, 0)
368                    for e in evaluations
369                    if isinstance(e.value, (int, float))
370                )
371
372                return Evaluation(
373                    name="weighted_composite",
374                    value=total,
375                    comment="Context-aware weighted composite"
376                )
377            ```
378        """
379        ...

Protocol defining the interface for composite evaluator functions.

Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.

Composite evaluators:

  • Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
  • Return either a single Evaluation, a list of Evaluations, or a dict
  • Can be either synchronous or asynchronous
  • Have access to both raw item data and evaluation results (see the minimal sketch after this list)
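A minimal composite sketch under those constraints, averaging all numeric item-level scores; the metric name is illustrative:

```python
from langfuse import Evaluation

def mean_composite(*, input=None, output=None, expected_output=None,
                   metadata=None, evaluations, **kwargs):
    # Average every numeric item-level score into a single composite value
    numeric = [e.value for e in evaluations if isinstance(e.value, (int, float))]
    mean = sum(numeric) / len(numeric) if numeric else 0.0
    return Evaluation(
        name="mean_score",  # illustrative metric name
        value=mean,
        comment=f"Mean of {len(numeric)} numeric scores",
    )
```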
CompositeEvaluatorFunction(*args, **kwargs)
class EvaluatorStats:
382class EvaluatorStats:
383    """Statistics for a single evaluator's performance during batch evaluation.
384
385    This class tracks detailed metrics about how a specific evaluator performed
386    across all items in a batch evaluation run. It helps identify evaluator issues,
387    understand reliability, and optimize evaluation pipelines.
388
389    Attributes:
390        name: The name of the evaluator function (extracted from __name__).
391        total_runs: Total number of times the evaluator was invoked.
392        successful_runs: Number of times the evaluator completed successfully.
393        failed_runs: Number of times the evaluator raised an exception or failed.
394        total_scores_created: Total number of evaluation scores created by this evaluator.
395            Can be higher than successful_runs if the evaluator returns multiple scores.
396
397    Examples:
398        Accessing evaluator stats from batch evaluation result:
399        ```python
400        result = client.run_batched_evaluation(...)
401
402        for stats in result.evaluator_stats:
403            print(f"Evaluator: {stats.name}")
404            print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
405            print(f"  Scores created: {stats.total_scores_created}")
406
407            if stats.failed_runs > 0:
408                print(f"  ⚠️  Failed {stats.failed_runs} times")
409        ```
410
411        Identifying problematic evaluators:
412        ```python
413        result = client.run_batched_evaluation(...)
414
415        # Find evaluators with high failure rates
416        for stats in result.evaluator_stats:
417            failure_rate = stats.failed_runs / stats.total_runs
418            if failure_rate > 0.1:  # More than 10% failures
419                print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
420                print(f"    Consider debugging or removing this evaluator")
421        ```
422
423    Note:
424        All arguments must be passed as keywords when instantiating this class.
425    """
426
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Statistics for a single evaluator's performance during batch evaluation.

This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.

Attributes:
  • name: The name of the evaluator function (extracted from __name__).
  • total_runs: Total number of times the evaluator was invoked.
  • successful_runs: Number of times the evaluator completed successfully.
  • failed_runs: Number of times the evaluator raised an exception or failed.
  • total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:

Accessing evaluator stats from batch evaluation result:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    print(f"Evaluator: {stats.name}")
    print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  âš ī¸  Failed {stats.failed_runs} times")

Identifying problematic evaluators:

result = client.run_batched_evaluation(...)

# Find evaluators with high failure rates
for stats in result.evaluator_stats:
    failure_rate = stats.failed_runs / stats.total_runs
    if failure_rate > 0.1:  # More than 10% failures
        print(f"âš ī¸  {stats.name} has {failure_rate:.1%} failure rate")
        print(f"    Consider debugging or removing this evaluator")
Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorStats( *, name: str, total_runs: int = 0, successful_runs: int = 0, failed_runs: int = 0, total_scores_created: int = 0)
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Initialize EvaluatorStats with the provided metrics.

Arguments:
  • name: The evaluator function name.
  • total_runs: Total number of evaluator invocations.
  • successful_runs: Number of successful completions.
  • failed_runs: Number of failures.
  • total_scores_created: Total scores created by this evaluator.
Note:

All arguments must be provided as keywords.

name
total_runs
successful_runs
failed_runs
total_scores_created
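A small sketch aggregating these per-evaluator counters across a whole run, assuming result was returned by client.run_batched_evaluation as in the examples above:

```python
total_runs = sum(s.total_runs for s in result.evaluator_stats)
total_failures = sum(s.failed_runs for s in result.evaluator_stats)
total_scores = sum(s.total_scores_created for s in result.evaluator_stats)

print(f"Evaluator invocations: {total_runs} ({total_failures} failed)")
print(f"Scores created by item-level evaluators: {total_scores}")
```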
class BatchEvaluationResumeToken:
455class BatchEvaluationResumeToken:
456    """Token for resuming a failed batch evaluation run.
457
458    This class encapsulates all the information needed to resume a batch evaluation
459    that was interrupted or failed partway through. It uses timestamp-based filtering
460    to avoid re-processing items that were already evaluated, even if the underlying
461    dataset changed between runs.
462
463    Attributes:
464        scope: The type of items being evaluated ("traces", "observations").
465        filter: The original JSON filter string used to query items.
466        last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item.
467            Used to construct a filter that only fetches items after this timestamp.
468        last_processed_id: The ID of the last successfully processed item, for reference.
469        items_processed: Count of items successfully processed before interruption.
470
471    Examples:
472        Resuming a failed batch evaluation:
473        ```python
474        # Initial run that fails partway through
475        try:
476            result = client.run_batched_evaluation(
477                scope="traces",
478                mapper=my_mapper,
479                evaluators=[evaluator1, evaluator2],
480                filter='{"tags": ["production"]}',
481                max_items=10000
482            )
483        except Exception as e:
484            print(f"Evaluation failed: {e}")
485
486            # Save the resume token
487            if result.resume_token:
488                # Store resume token for later (e.g., in a file or database)
489                import json
490                with open("resume_token.json", "w") as f:
491                    json.dump({
492                        "scope": result.resume_token.scope,
493                        "filter": result.resume_token.filter,
494                        "last_timestamp": result.resume_token.last_processed_timestamp,
495                        "last_id": result.resume_token.last_processed_id,
496                        "items_done": result.resume_token.items_processed
497                    }, f)
498
499        # Later, resume from where it left off
500        with open("resume_token.json") as f:
501            token_data = json.load(f)
502
503        resume_token = BatchEvaluationResumeToken(
504            scope=token_data["scope"],
505            filter=token_data["filter"],
506            last_processed_timestamp=token_data["last_timestamp"],
507            last_processed_id=token_data["last_id"],
508            items_processed=token_data["items_done"]
509        )
510
511        # Resume the evaluation
512        result = client.run_batched_evaluation(
513            scope="traces",
514            mapper=my_mapper,
515            evaluators=[evaluator1, evaluator2],
516            resume_from=resume_token
517        )
518
519        print(f"Processed {result.total_items_processed} additional items")
520        ```
521
522        Handling partial completion:
523        ```python
524        result = client.run_batched_evaluation(...)
525
526        if not result.completed:
527            print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
528            print(f"Last item: {result.resume_token.last_processed_id}")
529            print(f"Resume from: {result.resume_token.last_processed_timestamp}")
530
531            # Optionally retry automatically
532            if result.resume_token:
533                print("Retrying...")
534                result = client.run_batched_evaluation(
535                    scope=result.resume_token.scope,
536                    mapper=my_mapper,
537                    evaluators=my_evaluators,
538                    resume_from=result.resume_token
539                )
540        ```
541
542    Note:
543        All arguments must be passed as keywords when instantiating this class.
544        The timestamp-based approach means that items whose timestamps fall before
545        the resume point are skipped, even if they were ingested after the initial run.
546        This is intentional to avoid duplicates and ensure consistent evaluation.
547    """
548
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Token for resuming a failed batch evaluation run.

This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.

Attributes:
  • scope: The type of items being evaluated ("traces", "observations").
  • filter: The original JSON filter string used to query items.
  • last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
  • last_processed_id: The ID of the last successfully processed item, for reference.
  • items_processed: Count of items successfully processed before interruption.
Examples:

Resuming a failed batch evaluation:

# Initial run that fails partway through
try:
    result = client.run_batched_evaluation(
        scope="traces",
        mapper=my_mapper,
        evaluators=[evaluator1, evaluator2],
        filter='{"tags": ["production"]}',
        max_items=10000
    )
except Exception as e:
    print(f"Evaluation failed: {e}")

    # Save the resume token
    if result.resume_token:
        # Store resume token for later (e.g., in a file or database)
        import json
        with open("resume_token.json", "w") as f:
            json.dump({
                "scope": result.resume_token.scope,
                "filter": result.resume_token.filter,
                "last_timestamp": result.resume_token.last_processed_timestamp,
                "last_id": result.resume_token.last_processed_id,
                "items_done": result.resume_token.items_processed
            }, f)

# Later, resume from where it left off
with open("resume_token.json") as f:
    token_data = json.load(f)

resume_token = BatchEvaluationResumeToken(
    scope=token_data["scope"],
    filter=token_data["filter"],
    last_processed_timestamp=token_data["last_timestamp"],
    last_processed_id=token_data["last_id"],
    items_processed=token_data["items_done"]
)

# Resume the evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    resume_from=resume_token
)

print(f"Processed {result.total_items_processed} additional items")

Handling partial completion:

result = client.run_batched_evaluation(...)

if not result.completed:
    print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
    print(f"Last item: {result.resume_token.last_processed_id}")
    print(f"Resume from: {result.resume_token.last_processed_timestamp}")

    # Optionally retry automatically
    if result.resume_token:
        print("Retrying...")
        result = client.run_batched_evaluation(
            scope=result.resume_token.scope,
            mapper=my_mapper,
            evaluators=my_evaluators,
            resume_from=result.resume_token
        )
Note:

All arguments must be passed as keywords when instantiating this class. The timestamp-based approach means that items whose timestamps fall before the resume point are skipped, even if they were ingested after the initial run. This is intentional to avoid duplicates and ensure consistent evaluation.

BatchEvaluationResumeToken( *, scope: str, filter: Optional[str], last_processed_timestamp: str, last_processed_id: str, items_processed: int)
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Initialize BatchEvaluationResumeToken with the provided state.

Arguments:
  • scope: The scope type ("traces", "observations").
  • filter: The original JSON filter string.
  • last_processed_timestamp: ISO 8601 timestamp of last processed item.
  • last_processed_id: ID of last processed item.
  • items_processed: Count of items processed before interruption.
Note:

All arguments must be provided as keywords.

scope
filter
last_processed_timestamp
last_processed_id
items_processed
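Because the attribute names mirror the keyword-only constructor parameters, the token can be round-tripped through JSON more compactly than the manual dict shown in the examples above; a sketch (the file name is illustrative):

```python
import json

from langfuse import BatchEvaluationResumeToken

def save_token(token: BatchEvaluationResumeToken, path: str = "resume_token.json") -> None:
    with open(path, "w") as f:
        json.dump(vars(token), f)  # attribute names match the constructor keywords

def load_token(path: str = "resume_token.json") -> BatchEvaluationResumeToken:
    with open(path) as f:
        return BatchEvaluationResumeToken(**json.load(f))
```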
class BatchEvaluationResult:
577class BatchEvaluationResult:
578    r"""Complete result structure for batch evaluation execution.
579
580    This class encapsulates comprehensive statistics and metadata about a batch
581    evaluation run, including counts, evaluator-specific metrics, timing information,
582    error details, and resume capability.
583
584    Attributes:
585        total_items_fetched: Total number of items fetched from the API.
586        total_items_processed: Number of items successfully evaluated.
587        total_items_failed: Number of items that failed during evaluation.
588        total_scores_created: Total scores created by all item-level evaluators.
589        total_composite_scores_created: Scores created by the composite evaluator.
590        total_evaluations_failed: Number of individual evaluator failures across all items.
591        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
592        resume_token: Token for resuming if evaluation was interrupted (None if completed).
593        completed: True if all items were processed, False if stopped early or failed.
594        duration_seconds: Total time taken to execute the batch evaluation.
595        failed_item_ids: List of IDs for items that failed evaluation.
596        error_summary: Dictionary mapping error types to occurrence counts.
597        has_more_items: True if max_items limit was reached but more items exist.
598        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
599
600    Examples:
601        Basic result inspection:
602        ```python
603        result = client.run_batched_evaluation(...)
604
605        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
606        print(f"Scores created: {result.total_scores_created}")
607        print(f"Duration: {result.duration_seconds:.2f}s")
608        print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
609        ```
610
611        Detailed analysis with evaluator stats:
612        ```python
613        result = client.run_batched_evaluation(...)
614
615        print(f"\n📊 Batch Evaluation Results")
616        print(f"{'='*50}")
617        print(f"Items processed: {result.total_items_processed}")
618        print(f"Items failed: {result.total_items_failed}")
619        print(f"Scores created: {result.total_scores_created}")
620
621        if result.total_composite_scores_created > 0:
622            print(f"Composite scores: {result.total_composite_scores_created}")
623
624        print(f"\n📈 Evaluator Performance:")
625        for stats in result.evaluator_stats:
626            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
627            print(f"\n  {stats.name}:")
628            print(f"    Success rate: {success_rate:.1%}")
629            print(f"    Scores created: {stats.total_scores_created}")
630            if stats.failed_runs > 0:
631                print(f"    ⚠️  Failures: {stats.failed_runs}")
632
633        if result.error_summary:
634            print(f"\n⚠️  Errors encountered:")
635            for error_type, count in result.error_summary.items():
636                print(f"    {error_type}: {count}")
637        ```
638
639        Handling incomplete runs:
640        ```python
641        result = client.run_batched_evaluation(...)
642
643        if not result.completed:
644            print("⚠️  Evaluation incomplete!")
645
646            if result.resume_token:
647                print(f"Processed {result.resume_token.items_processed} items before failure")
648                print(f"Use resume_from parameter to continue from:")
649                print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
650                print(f"  Last ID: {result.resume_token.last_processed_id}")
651
652        if result.has_more_items:
653            print(f"ℹ️  More items available beyond max_items limit")
654        ```
655
656        Performance monitoring:
657        ```python
658        result = client.run_batched_evaluation(...)
659
660        items_per_second = result.total_items_processed / result.duration_seconds
661        avg_scores_per_item = result.total_scores_created / result.total_items_processed
662
663        print(f"Performance metrics:")
664        print(f"  Throughput: {items_per_second:.2f} items/second")
665        print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
666        print(f"  Total duration: {result.duration_seconds:.2f}s")
667
668        if result.total_evaluations_failed > 0:
669            failure_rate = result.total_evaluations_failed / (
670                result.total_items_processed * len(result.evaluator_stats)
671            )
672            print(f"  Evaluation failure rate: {failure_rate:.1%}")
673        ```
674
675    Note:
676        All arguments must be passed as keywords when instantiating this class.
677    """
678
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations
732
733    def __str__(self) -> str:
734        """Return a formatted string representation of the batch evaluation results.
735
736        Returns:
737            A multi-line string with a summary of the evaluation results.
738        """
739        lines = []
740        lines.append("=" * 60)
741        lines.append("Batch Evaluation Results")
742        lines.append("=" * 60)
743
744        # Summary statistics
745        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
746        lines.append(f"Duration: {self.duration_seconds:.2f}s")
747        lines.append(f"\nItems fetched: {self.total_items_fetched}")
748        lines.append(f"Items processed: {self.total_items_processed}")
749
750        if self.total_items_failed > 0:
751            lines.append(f"Items failed: {self.total_items_failed}")
752
753        # Success rate
754        if self.total_items_fetched > 0:
755            success_rate = self.total_items_processed / self.total_items_fetched * 100
756            lines.append(f"Success rate: {success_rate:.1f}%")
757
758        # Scores created
759        lines.append(f"\nScores created: {self.total_scores_created}")
760        if self.total_composite_scores_created > 0:
761            lines.append(f"Composite scores: {self.total_composite_scores_created}")
762
763        total_scores = self.total_scores_created + self.total_composite_scores_created
764        lines.append(f"Total scores: {total_scores}")
765
766        # Evaluator statistics
767        if self.evaluator_stats:
768            lines.append("\nEvaluator Performance:")
769            for stats in self.evaluator_stats:
770                lines.append(f"  {stats.name}:")
771                if stats.total_runs > 0:
772                    success_rate = (
773                        stats.successful_runs / stats.total_runs * 100
774                        if stats.total_runs > 0
775                        else 0
776                    )
777                    lines.append(
778                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
779                        f"({success_rate:.1f}% success)"
780                    )
781                    lines.append(f"    Scores created: {stats.total_scores_created}")
782                    if stats.failed_runs > 0:
783                        lines.append(f"    Failed runs: {stats.failed_runs}")
784
785        # Performance metrics
786        if self.total_items_processed > 0 and self.duration_seconds > 0:
787            items_per_sec = self.total_items_processed / self.duration_seconds
788            lines.append("\nPerformance:")
789            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
790            if self.total_scores_created > 0:
791                avg_scores = self.total_scores_created / self.total_items_processed
792                lines.append(f"  Avg scores per item: {avg_scores:.2f}")
793
794        # Errors and warnings
795        if self.error_summary:
796            lines.append("\nErrors encountered:")
797            for error_type, count in self.error_summary.items():
798                lines.append(f"  {error_type}: {count}")
799
800        # Incomplete run information
801        if not self.completed:
802            lines.append("\nWarning: Evaluation incomplete")
803            if self.resume_token:
804                lines.append(
805                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
806                )
807                lines.append(f"  Items processed: {self.resume_token.items_processed}")
808                lines.append("  Use resume_from parameter to continue")
809
810        if self.has_more_items:
811            lines.append("\nNote: More items available beyond max_items limit")
812
813        lines.append("=" * 60)
814        return "\n".join(lines)

Complete result structure for batch evaluation execution.

This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.

Attributes:
  • total_items_fetched: Total number of items fetched from the API.
  • total_items_processed: Number of items successfully evaluated.
  • total_items_failed: Number of items that failed during evaluation.
  • total_scores_created: Total scores created by all item-level evaluators.
  • total_composite_scores_created: Scores created by the composite evaluator.
  • total_evaluations_failed: Number of individual evaluator failures across all items.
  • evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
  • resume_token: Token for resuming if evaluation was interrupted (None if completed).
  • completed: True if all items were processed, False if stopped early or failed.
  • duration_seconds: Total time taken to execute the batch evaluation.
  • failed_item_ids: List of IDs for items that failed evaluation.
  • error_summary: Dictionary mapping error types to occurrence counts.
  • has_more_items: True if max_items limit was reached but more items exist.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:

Basic result inspection:

result = client.run_batched_evaluation(...)

print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
print(f"Scores created: {result.total_scores_created}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")

Detailed analysis with evaluator stats:

result = client.run_batched_evaluation(...)

print(f"\n📊 Batch Evaluation Results")
print(f"{'='*50}")
print(f"Items processed: {result.total_items_processed}")
print(f"Items failed: {result.total_items_failed}")
print(f"Scores created: {result.total_scores_created}")

if result.total_composite_scores_created > 0:
    print(f"Composite scores: {result.total_composite_scores_created}")

print(f"\n📈 Evaluator Performance:")
for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
    print(f"\n  {stats.name}:")
    print(f"    Success rate: {success_rate:.1%}")
    print(f"    Scores created: {stats.total_scores_created}")
    if stats.failed_runs > 0:
        print(f"    âš ī¸  Failures: {stats.failed_runs}")

if result.error_summary:
    print(f"\nâš ī¸  Errors encountered:")
    for error_type, count in result.error_summary.items():
        print(f"    {error_type}: {count}")

Handling incomplete runs:

result = client.run_batched_evaluation(...)

if not result.completed:
    print("âš ī¸  Evaluation incomplete!")

    if result.resume_token:
        print(f"Processed {result.resume_token.items_processed} items before failure")
        print(f"Use resume_from parameter to continue from:")
        print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
        print(f"  Last ID: {result.resume_token.last_processed_id}")

if result.has_more_items:
    print(f"â„šī¸  More items available beyond max_items limit")

Performance monitoring:

result = client.run_batched_evaluation(...)

items_per_second = result.total_items_processed / result.duration_seconds
avg_scores_per_item = result.total_scores_created / result.total_items_processed

print(f"Performance metrics:")
print(f"  Throughput: {items_per_second:.2f} items/second")
print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
print(f"  Total duration: {result.duration_seconds:.2f}s")

if result.total_evaluations_failed > 0:
    failure_rate = result.total_evaluations_failed / (
        result.total_items_processed * len(result.evaluator_stats)
    )
    print(f"  Evaluation failure rate: {failure_rate:.1%}")
Note:

All arguments must be passed as keywords when instantiating this class.

BatchEvaluationResult( *, total_items_fetched: int, total_items_processed: int, total_items_failed: int, total_scores_created: int, total_composite_scores_created: int, total_evaluations_failed: int, evaluator_stats: List[EvaluatorStats], resume_token: Optional[BatchEvaluationResumeToken], completed: bool, duration_seconds: float, failed_item_ids: List[str], error_summary: Dict[str, int], has_more_items: bool, item_evaluations: Dict[str, List[Evaluation]])
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations

Initialize BatchEvaluationResult with comprehensive statistics.

Arguments:
  • total_items_fetched: Total items fetched from API.
  • total_items_processed: Items successfully evaluated.
  • total_items_failed: Items that failed evaluation.
  • total_scores_created: Scores from item-level evaluators.
  • total_composite_scores_created: Scores from composite evaluator.
  • total_evaluations_failed: Individual evaluator failures.
  • evaluator_stats: Per-evaluator statistics.
  • resume_token: Token for resuming (None if completed).
  • completed: Whether all items were processed.
  • duration_seconds: Total execution time.
  • failed_item_ids: IDs of failed items.
  • error_summary: Error types and counts.
  • has_more_items: Whether more items exist beyond max_items.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:

All arguments must be provided as keywords.

total_items_fetched
total_items_processed
total_items_failed
total_scores_created
total_composite_scores_created
total_evaluations_failed
evaluator_stats
resume_token
completed
duration_seconds
failed_item_ids
error_summary
has_more_items
item_evaluations
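Since BatchEvaluationResult defines a __str__ method (shown in the source listing above), printing the result yields the formatted multi-line summary directly; a brief sketch:

```python
result = client.run_batched_evaluation(...)  # as in the examples above

# __str__ renders the multi-line report, so printing gives the summary directly
print(result)

# or capture it as text, e.g. for logging
summary_text = str(result)
```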