langfuse

 1""".. include:: ../README.md"""
 2
 3from langfuse.batch_evaluation import (
 4    BatchEvaluationResult,
 5    BatchEvaluationResumeToken,
 6    CompositeEvaluatorFunction,
 7    EvaluatorInputs,
 8    EvaluatorStats,
 9    MapperFunction,
10)
11from langfuse.experiment import Evaluation, RegressionError, RunnerContext
12
13from ._client import client as _client_module
14from ._client.attributes import LangfuseOtelSpanAttributes
15from ._client.constants import ObservationTypeLiteral
16from ._client.get_client import get_client
17from ._client.observe import observe
18from ._client.propagation import propagate_attributes
19from ._client.span import (
20    LangfuseAgent,
21    LangfuseChain,
22    LangfuseEmbedding,
23    LangfuseEvaluator,
24    LangfuseEvent,
25    LangfuseGeneration,
26    LangfuseGuardrail,
27    LangfuseRetriever,
28    LangfuseSpan,
29    LangfuseTool,
30)
31from ._version import __version__
32from .media import LangfuseMedia, LangfuseMediaReference
33from .span_filter import (
34    KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES,
35    is_default_export_span,
36    is_genai_span,
37    is_known_llm_instrumentor,
38    is_langfuse_span,
39)
40from .types import (
41    MaskOtelSpansFunction,
42    MaskOtelSpansParams,
43    MaskOtelSpansResult,
44    OtelSpanData,
45    OtelSpanIdentifier,
46    OtelSpanPatch,
47)
48
# Bind the client class at package level so `from langfuse import Langfuse` works.
Langfuse = _client_module.Langfuse

# Names exported by `from langfuse import *`. The order is kept as-is because
# tooling that reads `__all__` may rely on it; comments only mark the groups.
__all__ = [
    # Client class, media helpers and tracing entry points
    "Langfuse",
    "LangfuseMedia",
    "LangfuseMediaReference",
    "get_client",
    "observe",
    "propagate_attributes",
    "ObservationTypeLiteral",
    # Observation wrapper classes (._client.span) and span attribute names
    "LangfuseSpan",
    "LangfuseGeneration",
    "LangfuseEvent",
    "LangfuseOtelSpanAttributes",
    "LangfuseAgent",
    "LangfuseTool",
    "LangfuseChain",
    "LangfuseEmbedding",
    "LangfuseEvaluator",
    "LangfuseRetriever",
    "LangfuseGuardrail",
    # Names imported from langfuse.experiment and langfuse.batch_evaluation
    "Evaluation",
    "EvaluatorInputs",
    "MapperFunction",
    "CompositeEvaluatorFunction",
    "EvaluatorStats",
    "BatchEvaluationResumeToken",
    "BatchEvaluationResult",
    "RunnerContext",
    "RegressionError",
    # Package version
    "__version__",
    # Span export filters (.span_filter)
    "is_default_export_span",
    "is_langfuse_span",
    "is_genai_span",
    "is_known_llm_instrumentor",
    "KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES",
    # OTel span masking types (.types)
    "MaskOtelSpansFunction",
    "MaskOtelSpansParams",
    "MaskOtelSpansResult",
    "OtelSpanData",
    "OtelSpanIdentifier",
    "OtelSpanPatch",
    # Not bound by a `from ... import name` statement in this module;
    # presumably submodules of the package -- verify.
    "experiment",
    "api",
]
class Langfuse:
 154class Langfuse:
 155    """Main client for Langfuse tracing and platform features.
 156
 157    This class provides an interface for creating and managing traces, spans,
 158    and generations in Langfuse as well as interacting with the Langfuse API.
 159
 160    The client features a thread-safe singleton pattern for each unique public API key,
 161    ensuring consistent trace context propagation across your application. It implements
 162    efficient batching of spans with configurable flush settings and includes background
 163    thread management for media uploads and score ingestion.
 164
 165    Configuration is flexible through either direct parameters or environment variables,
 166    with graceful fallbacks and runtime configuration updates.
 167
 168    Attributes:
 169        api: Synchronous API client for Langfuse backend communication
 170        async_api: Asynchronous API client for Langfuse backend communication
 171        _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
 172
 173    Parameters:
 174        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
 175        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
 176        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
 177        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
 178        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
 179        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
 180        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
 181        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
 182        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
 183        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
 184        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
 185        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
 186        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
 187        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
 188        mask (Optional[MaskFunction]): Function to mask sensitive data synchronously when Langfuse SDK attributes are created. This applies only to data set through Langfuse SDK APIs such as `start_observation()`, `update()`, and `set_trace_io()`.
 189        mask_otel_spans (Optional[MaskOtelSpansFunction]): Synchronous export-stage hook for masking raw OpenTelemetry span attributes before this Langfuse client sends them to Langfuse. Use this for spans created by third-party OpenTelemetry instrumentations, or when you need to inspect final span attributes after export filtering and Langfuse media handling. It does not modify spans already exported through other OpenTelemetry exporters.
 190
 191            The hook receives one OpenTelemetry export batch. A batch is not guaranteed to contain a complete trace, request, or Langfuse observation tree. The hook usually runs on the OpenTelemetry batch span processor worker thread; during `flush()` and shutdown it may run on the caller thread. Keep it synchronous, deterministic, and fast.
 192
 193            Return `None` to leave the batch unchanged. Return `MaskOtelSpansResult` with `OtelSpanPatch` values to delete or replace attributes on selected spans. If the hook raises or returns an invalid batch result, Langfuse drops the whole export batch. If one returned span patch is invalid, Langfuse drops only that span from the Langfuse export.
 194
 195            Example:
 196                ```python
 197                from typing import Optional
 198
 199                from langfuse import Langfuse
 200                from langfuse.types import (
 201                    MaskOtelSpansParams,
 202                    MaskOtelSpansResult,
 203                    OtelSpanPatch,
 204                )
 205
 206                def mask_otel_spans(
 207                    *, params: MaskOtelSpansParams
 208                ) -> Optional[MaskOtelSpansResult]:
 209                    patches = {}
 210
 211                    for identifier, span in params.spans.items():
 212                        if "gen_ai.prompt.0.content" in span.attributes:
 213                            patches[identifier] = OtelSpanPatch(
 214                                delete_attributes=("gen_ai.prompt.0.content",),
 215                                set_attributes={"masking.applied": True},
 216                            )
 217
 218                    return MaskOtelSpansResult(span_patches=patches)
 219
 220                langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
 221                ```
 222        blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:
 223            ```python
 224            from langfuse.span_filter import is_default_export_span
 225            blocked = {"sqlite", "requests"}
 226
 227            should_export_span = lambda span: (
 228                is_default_export_span(span)
 229                and (
 230                    span.instrumentation_scope is None
 231                    or span.instrumentation_scope.name not in blocked
 232                )
 233            )
 234            ```
 235        should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
 236        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If `span_exporter` is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
 237        tracer_provider(Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
 238        id_generator (Optional[IdGenerator]): OpenTelemetry ID generator to use when Langfuse creates its own TracerProvider. If omitted, the OpenTelemetry SDK default is used. If `tracer_provider` is provided, or an OpenTelemetry TracerProvider is already registered globally, configure the ID generator on that provider instead.
 239        span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire `base_url`, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include `x-langfuse-ingestion-version=4` on the exporter to enable real time processing of exported spans.
 240
 241    Example:
 242        ```python
 243        from langfuse import Langfuse
 244
 245        # Initialize the client (reads from env vars if not provided)
 246        langfuse = Langfuse(
 247            public_key="your-public-key",
 248            secret_key="your-secret-key",
 249            base_url="https://cloud.langfuse.com",  # Optional, default shown
 250        )
 251
 252        # Create a trace span
 253        with langfuse.start_as_current_observation(name="process-query") as span:
 254            # Your application code here
 255
 256            # Create a nested generation span for an LLM call
 257            with span.start_as_current_generation(
 258                name="generate-response",
 259                model="gpt-4",
 260                input={"query": "Tell me about AI"},
 261                model_parameters={"temperature": 0.7, "max_tokens": 500}
 262            ) as generation:
 263                # Generate response here
 264                response = "AI is a field of computer science..."
 265
 266                generation.update(
 267                    output=response,
 268                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
 269                    cost_details={"total_cost": 0.0023}
 270                )
 271
 272                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
 273                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
 274        ```
 275    """
 276
 277    _resources: Optional[LangfuseResourceManager] = None
 278    _mask: Optional[MaskFunction] = None
 279    _otel_tracer: otel_trace_api.Tracer
 280
 281    def __init__(
 282        self,
 283        *,
 284        public_key: Optional[str] = None,
 285        secret_key: Optional[str] = None,
 286        base_url: Optional[str] = None,
 287        host: Optional[str] = None,
 288        timeout: Optional[int] = None,
 289        httpx_client: Optional[httpx.Client] = None,
 290        debug: bool = False,
 291        tracing_enabled: Optional[bool] = True,
 292        flush_at: Optional[int] = None,
 293        flush_interval: Optional[float] = None,
 294        environment: Optional[str] = None,
 295        release: Optional[str] = None,
 296        media_upload_thread_count: Optional[int] = None,
 297        sample_rate: Optional[float] = None,
 298        mask: Optional[MaskFunction] = None,
 299        mask_otel_spans: Optional[MaskOtelSpansFunction] = None,
 300        blocked_instrumentation_scopes: Optional[List[str]] = None,
 301        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
 302        additional_headers: Optional[Dict[str, str]] = None,
 303        tracer_provider: Optional[TracerProvider] = None,
 304        id_generator: Optional[IdGenerator] = None,
 305        span_exporter: Optional[SpanExporter] = None,
 306    ):
 307        self._base_url = (
 308            base_url
 309            or os.environ.get(LANGFUSE_BASE_URL)
 310            or host
 311            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
 312        )
 313        self._environment = environment or cast(
 314            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
 315        )
 316        self._release = (
 317            release
 318            or os.environ.get(LANGFUSE_RELEASE, None)
 319            or get_common_release_envs()
 320        )
 321        self._project_id: Optional[str] = None
 322        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
 323        if not 0.0 <= sample_rate <= 1.0:
 324            raise ValueError(
 325                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
 326            )
 327
 328        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
 329
 330        self._tracing_enabled = (
 331            tracing_enabled
 332            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
 333        )
 334        if not self._tracing_enabled:
 335            langfuse_logger.info(
 336                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
 337            )
 338
 339        debug = (
 340            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
 341        )
 342        if debug:
 343            logging.basicConfig(
 344                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 345            )
 346            langfuse_logger.setLevel(logging.DEBUG)
 347
 348        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
 349        if public_key is None:
 350            langfuse_logger.warning(
 351                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
 352                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
 353            )
 354            self._otel_tracer = otel_trace_api.NoOpTracer()
 355            return
 356
 357        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
 358        if secret_key is None:
 359            langfuse_logger.warning(
 360                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
 361                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
 362            )
 363            self._otel_tracer = otel_trace_api.NoOpTracer()
 364            return
 365
 366        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
 367            langfuse_logger.warning(
 368                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
 369            )
 370
 371        if blocked_instrumentation_scopes is not None:
 372            warnings.warn(
 373                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
 374                "Use `should_export_span` instead. Example: "
 375                "from langfuse.span_filter import is_default_export_span; "
 376                'blocked={"scope"}; should_export_span=lambda span: '
 377                "is_default_export_span(span) and (span.instrumentation_scope is None or "
 378                "span.instrumentation_scope.name not in blocked).",
 379                DeprecationWarning,
 380                stacklevel=2,
 381            )
 382
 383        # Initialize api and tracer if requirements are met
 384        self._resources = LangfuseResourceManager(
 385            public_key=public_key,
 386            secret_key=secret_key,
 387            base_url=self._base_url,
 388            timeout=timeout,
 389            environment=self._environment,
 390            release=release,
 391            flush_at=flush_at,
 392            flush_interval=flush_interval,
 393            httpx_client=httpx_client,
 394            media_upload_thread_count=media_upload_thread_count,
 395            sample_rate=sample_rate,
 396            mask=mask,
 397            mask_otel_spans=mask_otel_spans,
 398            tracing_enabled=self._tracing_enabled,
 399            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
 400            should_export_span=should_export_span,
 401            additional_headers=additional_headers,
 402            tracer_provider=tracer_provider,
 403            id_generator=id_generator,
 404            span_exporter=span_exporter,
 405        )
 406        self._mask = self._resources.mask
 407
 408        self._otel_tracer = (
 409            self._resources.tracer
 410            if self._tracing_enabled and self._resources.tracer is not None
 411            else otel_trace_api.NoOpTracer()
 412        )
 413        self.api = self._resources.api
 414        self.async_api = self._resources.async_api
 415
 416    @overload
 417    def start_observation(
 418        self,
 419        *,
 420        trace_context: Optional[TraceContext] = None,
 421        name: str,
 422        as_type: Literal["generation"],
 423        input: Optional[Any] = None,
 424        output: Optional[Any] = None,
 425        metadata: Optional[Any] = None,
 426        version: Optional[str] = None,
 427        level: Optional[SpanLevel] = None,
 428        status_message: Optional[str] = None,
 429        completion_start_time: Optional[datetime] = None,
 430        model: Optional[str] = None,
 431        model_parameters: Optional[Dict[str, MapValue]] = None,
 432        usage_details: Optional[Dict[str, int]] = None,
 433        cost_details: Optional[Dict[str, float]] = None,
 434        prompt: Optional[PromptClient] = None,
 435    ) -> LangfuseGeneration: ...
 436
 437    @overload
 438    def start_observation(
 439        self,
 440        *,
 441        trace_context: Optional[TraceContext] = None,
 442        name: str,
 443        as_type: Literal["span"] = "span",
 444        input: Optional[Any] = None,
 445        output: Optional[Any] = None,
 446        metadata: Optional[Any] = None,
 447        version: Optional[str] = None,
 448        level: Optional[SpanLevel] = None,
 449        status_message: Optional[str] = None,
 450    ) -> LangfuseSpan: ...
 451
 452    @overload
 453    def start_observation(
 454        self,
 455        *,
 456        trace_context: Optional[TraceContext] = None,
 457        name: str,
 458        as_type: Literal["agent"],
 459        input: Optional[Any] = None,
 460        output: Optional[Any] = None,
 461        metadata: Optional[Any] = None,
 462        version: Optional[str] = None,
 463        level: Optional[SpanLevel] = None,
 464        status_message: Optional[str] = None,
 465    ) -> LangfuseAgent: ...
 466
 467    @overload
 468    def start_observation(
 469        self,
 470        *,
 471        trace_context: Optional[TraceContext] = None,
 472        name: str,
 473        as_type: Literal["tool"],
 474        input: Optional[Any] = None,
 475        output: Optional[Any] = None,
 476        metadata: Optional[Any] = None,
 477        version: Optional[str] = None,
 478        level: Optional[SpanLevel] = None,
 479        status_message: Optional[str] = None,
 480    ) -> LangfuseTool: ...
 481
 482    @overload
 483    def start_observation(
 484        self,
 485        *,
 486        trace_context: Optional[TraceContext] = None,
 487        name: str,
 488        as_type: Literal["chain"],
 489        input: Optional[Any] = None,
 490        output: Optional[Any] = None,
 491        metadata: Optional[Any] = None,
 492        version: Optional[str] = None,
 493        level: Optional[SpanLevel] = None,
 494        status_message: Optional[str] = None,
 495    ) -> LangfuseChain: ...
 496
 497    @overload
 498    def start_observation(
 499        self,
 500        *,
 501        trace_context: Optional[TraceContext] = None,
 502        name: str,
 503        as_type: Literal["retriever"],
 504        input: Optional[Any] = None,
 505        output: Optional[Any] = None,
 506        metadata: Optional[Any] = None,
 507        version: Optional[str] = None,
 508        level: Optional[SpanLevel] = None,
 509        status_message: Optional[str] = None,
 510    ) -> LangfuseRetriever: ...
 511
 512    @overload
 513    def start_observation(
 514        self,
 515        *,
 516        trace_context: Optional[TraceContext] = None,
 517        name: str,
 518        as_type: Literal["evaluator"],
 519        input: Optional[Any] = None,
 520        output: Optional[Any] = None,
 521        metadata: Optional[Any] = None,
 522        version: Optional[str] = None,
 523        level: Optional[SpanLevel] = None,
 524        status_message: Optional[str] = None,
 525    ) -> LangfuseEvaluator: ...
 526
 527    @overload
 528    def start_observation(
 529        self,
 530        *,
 531        trace_context: Optional[TraceContext] = None,
 532        name: str,
 533        as_type: Literal["embedding"],
 534        input: Optional[Any] = None,
 535        output: Optional[Any] = None,
 536        metadata: Optional[Any] = None,
 537        version: Optional[str] = None,
 538        level: Optional[SpanLevel] = None,
 539        status_message: Optional[str] = None,
 540        completion_start_time: Optional[datetime] = None,
 541        model: Optional[str] = None,
 542        model_parameters: Optional[Dict[str, MapValue]] = None,
 543        usage_details: Optional[Dict[str, int]] = None,
 544        cost_details: Optional[Dict[str, float]] = None,
 545        prompt: Optional[PromptClient] = None,
 546    ) -> LangfuseEmbedding: ...
 547
 548    @overload
 549    def start_observation(
 550        self,
 551        *,
 552        trace_context: Optional[TraceContext] = None,
 553        name: str,
 554        as_type: Literal["guardrail"],
 555        input: Optional[Any] = None,
 556        output: Optional[Any] = None,
 557        metadata: Optional[Any] = None,
 558        version: Optional[str] = None,
 559        level: Optional[SpanLevel] = None,
 560        status_message: Optional[str] = None,
 561    ) -> LangfuseGuardrail: ...
 562
 563    def start_observation(
 564        self,
 565        *,
 566        trace_context: Optional[TraceContext] = None,
 567        name: str,
 568        as_type: ObservationTypeLiteralNoEvent = "span",
 569        input: Optional[Any] = None,
 570        output: Optional[Any] = None,
 571        metadata: Optional[Any] = None,
 572        version: Optional[str] = None,
 573        level: Optional[SpanLevel] = None,
 574        status_message: Optional[str] = None,
 575        completion_start_time: Optional[datetime] = None,
 576        model: Optional[str] = None,
 577        model_parameters: Optional[Dict[str, MapValue]] = None,
 578        usage_details: Optional[Dict[str, int]] = None,
 579        cost_details: Optional[Dict[str, float]] = None,
 580        prompt: Optional[PromptClient] = None,
 581    ) -> Union[
 582        LangfuseSpan,
 583        LangfuseGeneration,
 584        LangfuseAgent,
 585        LangfuseTool,
 586        LangfuseChain,
 587        LangfuseRetriever,
 588        LangfuseEvaluator,
 589        LangfuseEmbedding,
 590        LangfuseGuardrail,
 591    ]:
 592        """Create a new observation of the specified type.
 593
 594        This method creates a new observation but does not set it as the current span in the
 595        context. To create and use an observation within a context, use start_as_current_observation().
 596
 597        Args:
 598            trace_context: Optional context for connecting to an existing trace
 599            name: Name of the observation
 600            as_type: Type of observation to create (defaults to "span")
 601            input: Input data for the operation
 602            output: Output data from the operation
 603            metadata: Additional metadata to associate with the observation
 604            version: Version identifier for the code or component
 605            level: Importance level of the observation
 606            status_message: Optional status message for the observation
 607            completion_start_time: When the model started generating (for generation types)
 608            model: Name/identifier of the AI model used (for generation types)
 609            model_parameters: Parameters used for the model (for generation types)
 610            usage_details: Token usage information (for generation types)
 611            cost_details: Cost information (for generation types)
 612            prompt: Associated prompt template (for generation types)
 613
 614        Returns:
 615            An observation object of the appropriate type that must be ended with .end()
 616        """
 617        if trace_context:
 618            trace_id = trace_context.get("trace_id", None)
 619            parent_span_id = trace_context.get("parent_span_id", None)
 620
 621            if trace_id:
 622                remote_parent_span = self._create_remote_parent_span(
 623                    trace_id=trace_id, parent_span_id=parent_span_id
 624                )
 625
 626                with otel_trace_api.use_span(
 627                    cast(otel_trace_api.Span, remote_parent_span)
 628                ):
 629                    otel_span = self._otel_tracer.start_span(name=name)
 630                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
 631
 632                    return self._create_observation_from_otel_span(
 633                        otel_span=otel_span,
 634                        as_type=as_type,
 635                        input=input,
 636                        output=output,
 637                        metadata=metadata,
 638                        version=version,
 639                        level=level,
 640                        status_message=status_message,
 641                        completion_start_time=completion_start_time,
 642                        model=model,
 643                        model_parameters=model_parameters,
 644                        usage_details=usage_details,
 645                        cost_details=cost_details,
 646                        prompt=prompt,
 647                    )
 648
 649        otel_span = self._otel_tracer.start_span(name=name)
 650
 651        return self._create_observation_from_otel_span(
 652            otel_span=otel_span,
 653            as_type=as_type,
 654            input=input,
 655            output=output,
 656            metadata=metadata,
 657            version=version,
 658            level=level,
 659            status_message=status_message,
 660            completion_start_time=completion_start_time,
 661            model=model,
 662            model_parameters=model_parameters,
 663            usage_details=usage_details,
 664            cost_details=cost_details,
 665            prompt=prompt,
 666        )
 667
 668    def _create_observation_from_otel_span(
 669        self,
 670        *,
 671        otel_span: otel_trace_api.Span,
 672        as_type: ObservationTypeLiteralNoEvent,
 673        input: Optional[Any] = None,
 674        output: Optional[Any] = None,
 675        metadata: Optional[Any] = None,
 676        version: Optional[str] = None,
 677        level: Optional[SpanLevel] = None,
 678        status_message: Optional[str] = None,
 679        completion_start_time: Optional[datetime] = None,
 680        model: Optional[str] = None,
 681        model_parameters: Optional[Dict[str, MapValue]] = None,
 682        usage_details: Optional[Dict[str, int]] = None,
 683        cost_details: Optional[Dict[str, float]] = None,
 684        prompt: Optional[PromptClient] = None,
 685    ) -> Union[
 686        LangfuseSpan,
 687        LangfuseGeneration,
 688        LangfuseAgent,
 689        LangfuseTool,
 690        LangfuseChain,
 691        LangfuseRetriever,
 692        LangfuseEvaluator,
 693        LangfuseEmbedding,
 694        LangfuseGuardrail,
 695    ]:
 696        """Create the appropriate observation type from an OTEL span."""
 697        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 698            observation_class = self._get_span_class(as_type)
 699            # Type ignore to prevent overloads of internal _get_span_class function,
 700            # issue is that LangfuseEvent could be returned and that classes have diff. args
 701            return observation_class(  # type: ignore[return-value,call-arg]
 702                otel_span=otel_span,
 703                langfuse_client=self,
 704                environment=self._environment,
 705                release=self._release,
 706                input=input,
 707                output=output,
 708                metadata=metadata,
 709                version=version,
 710                level=level,
 711                status_message=status_message,
 712                completion_start_time=completion_start_time,
 713                model=model,
 714                model_parameters=model_parameters,
 715                usage_details=usage_details,
 716                cost_details=cost_details,
 717                prompt=prompt,
 718            )
 719        else:
 720            # For other types (e.g. span, guardrail), create appropriate class without generation properties
 721            observation_class = self._get_span_class(as_type)
 722            # Type ignore to prevent overloads of internal _get_span_class function,
 723            # issue is that LangfuseEvent could be returned and that classes have diff. args
 724            return observation_class(  # type: ignore[return-value,call-arg]
 725                otel_span=otel_span,
 726                langfuse_client=self,
 727                environment=self._environment,
 728                release=self._release,
 729                input=input,
 730                output=output,
 731                metadata=metadata,
 732                version=version,
 733                level=level,
 734                status_message=status_message,
 735            )
 736            # span._observation_type = as_type
 737            # span._otel_span.set_attribute("langfuse.observation.type", as_type)
 738            # return span
 739
 740    @overload
 741    def start_as_current_observation(
 742        self,
 743        *,
 744        trace_context: Optional[TraceContext] = None,
 745        name: str,
 746        as_type: Literal["generation"],
 747        input: Optional[Any] = None,
 748        output: Optional[Any] = None,
 749        metadata: Optional[Any] = None,
 750        version: Optional[str] = None,
 751        level: Optional[SpanLevel] = None,
 752        status_message: Optional[str] = None,
 753        completion_start_time: Optional[datetime] = None,
 754        model: Optional[str] = None,
 755        model_parameters: Optional[Dict[str, MapValue]] = None,
 756        usage_details: Optional[Dict[str, int]] = None,
 757        cost_details: Optional[Dict[str, float]] = None,
 758        prompt: Optional[PromptClient] = None,
 759        end_on_exit: Optional[bool] = None,
 760    ) -> _AgnosticContextManager[LangfuseGeneration]: ...
 761
 762    @overload
 763    def start_as_current_observation(
 764        self,
 765        *,
 766        trace_context: Optional[TraceContext] = None,
 767        name: str,
 768        as_type: Literal["span"] = "span",
 769        input: Optional[Any] = None,
 770        output: Optional[Any] = None,
 771        metadata: Optional[Any] = None,
 772        version: Optional[str] = None,
 773        level: Optional[SpanLevel] = None,
 774        status_message: Optional[str] = None,
 775        end_on_exit: Optional[bool] = None,
 776    ) -> _AgnosticContextManager[LangfuseSpan]: ...
 777
 778    @overload
 779    def start_as_current_observation(
 780        self,
 781        *,
 782        trace_context: Optional[TraceContext] = None,
 783        name: str,
 784        as_type: Literal["agent"],
 785        input: Optional[Any] = None,
 786        output: Optional[Any] = None,
 787        metadata: Optional[Any] = None,
 788        version: Optional[str] = None,
 789        level: Optional[SpanLevel] = None,
 790        status_message: Optional[str] = None,
 791        end_on_exit: Optional[bool] = None,
 792    ) -> _AgnosticContextManager[LangfuseAgent]: ...
 793
 794    @overload
 795    def start_as_current_observation(
 796        self,
 797        *,
 798        trace_context: Optional[TraceContext] = None,
 799        name: str,
 800        as_type: Literal["tool"],
 801        input: Optional[Any] = None,
 802        output: Optional[Any] = None,
 803        metadata: Optional[Any] = None,
 804        version: Optional[str] = None,
 805        level: Optional[SpanLevel] = None,
 806        status_message: Optional[str] = None,
 807        end_on_exit: Optional[bool] = None,
 808    ) -> _AgnosticContextManager[LangfuseTool]: ...
 809
 810    @overload
 811    def start_as_current_observation(
 812        self,
 813        *,
 814        trace_context: Optional[TraceContext] = None,
 815        name: str,
 816        as_type: Literal["chain"],
 817        input: Optional[Any] = None,
 818        output: Optional[Any] = None,
 819        metadata: Optional[Any] = None,
 820        version: Optional[str] = None,
 821        level: Optional[SpanLevel] = None,
 822        status_message: Optional[str] = None,
 823        end_on_exit: Optional[bool] = None,
 824    ) -> _AgnosticContextManager[LangfuseChain]: ...
 825
 826    @overload
 827    def start_as_current_observation(
 828        self,
 829        *,
 830        trace_context: Optional[TraceContext] = None,
 831        name: str,
 832        as_type: Literal["retriever"],
 833        input: Optional[Any] = None,
 834        output: Optional[Any] = None,
 835        metadata: Optional[Any] = None,
 836        version: Optional[str] = None,
 837        level: Optional[SpanLevel] = None,
 838        status_message: Optional[str] = None,
 839        end_on_exit: Optional[bool] = None,
 840    ) -> _AgnosticContextManager[LangfuseRetriever]: ...
 841
 842    @overload
 843    def start_as_current_observation(
 844        self,
 845        *,
 846        trace_context: Optional[TraceContext] = None,
 847        name: str,
 848        as_type: Literal["evaluator"],
 849        input: Optional[Any] = None,
 850        output: Optional[Any] = None,
 851        metadata: Optional[Any] = None,
 852        version: Optional[str] = None,
 853        level: Optional[SpanLevel] = None,
 854        status_message: Optional[str] = None,
 855        end_on_exit: Optional[bool] = None,
 856    ) -> _AgnosticContextManager[LangfuseEvaluator]: ...
 857
 858    @overload
 859    def start_as_current_observation(
 860        self,
 861        *,
 862        trace_context: Optional[TraceContext] = None,
 863        name: str,
 864        as_type: Literal["embedding"],
 865        input: Optional[Any] = None,
 866        output: Optional[Any] = None,
 867        metadata: Optional[Any] = None,
 868        version: Optional[str] = None,
 869        level: Optional[SpanLevel] = None,
 870        status_message: Optional[str] = None,
 871        completion_start_time: Optional[datetime] = None,
 872        model: Optional[str] = None,
 873        model_parameters: Optional[Dict[str, MapValue]] = None,
 874        usage_details: Optional[Dict[str, int]] = None,
 875        cost_details: Optional[Dict[str, float]] = None,
 876        prompt: Optional[PromptClient] = None,
 877        end_on_exit: Optional[bool] = None,
 878    ) -> _AgnosticContextManager[LangfuseEmbedding]: ...
 879
 880    @overload
 881    def start_as_current_observation(
 882        self,
 883        *,
 884        trace_context: Optional[TraceContext] = None,
 885        name: str,
 886        as_type: Literal["guardrail"],
 887        input: Optional[Any] = None,
 888        output: Optional[Any] = None,
 889        metadata: Optional[Any] = None,
 890        version: Optional[str] = None,
 891        level: Optional[SpanLevel] = None,
 892        status_message: Optional[str] = None,
 893        end_on_exit: Optional[bool] = None,
 894    ) -> _AgnosticContextManager[LangfuseGuardrail]: ...
 895
 896    def start_as_current_observation(
 897        self,
 898        *,
 899        trace_context: Optional[TraceContext] = None,
 900        name: str,
 901        as_type: ObservationTypeLiteralNoEvent = "span",
 902        input: Optional[Any] = None,
 903        output: Optional[Any] = None,
 904        metadata: Optional[Any] = None,
 905        version: Optional[str] = None,
 906        level: Optional[SpanLevel] = None,
 907        status_message: Optional[str] = None,
 908        completion_start_time: Optional[datetime] = None,
 909        model: Optional[str] = None,
 910        model_parameters: Optional[Dict[str, MapValue]] = None,
 911        usage_details: Optional[Dict[str, int]] = None,
 912        cost_details: Optional[Dict[str, float]] = None,
 913        prompt: Optional[PromptClient] = None,
 914        end_on_exit: Optional[bool] = None,
 915    ) -> Union[
 916        _AgnosticContextManager[LangfuseGeneration],
 917        _AgnosticContextManager[LangfuseSpan],
 918        _AgnosticContextManager[LangfuseAgent],
 919        _AgnosticContextManager[LangfuseTool],
 920        _AgnosticContextManager[LangfuseChain],
 921        _AgnosticContextManager[LangfuseRetriever],
 922        _AgnosticContextManager[LangfuseEvaluator],
 923        _AgnosticContextManager[LangfuseEmbedding],
 924        _AgnosticContextManager[LangfuseGuardrail],
 925    ]:
 926        """Create a new observation and set it as the current span in a context manager.
 927
 928        This method creates a new observation of the specified type and sets it as the
 929        current span within a context manager. Use this method with a 'with' statement to
 930        automatically handle the observation lifecycle within a code block.
 931
 932        The created observation will be the child of the current span in the context.
 933
 934        Args:
 935            trace_context: Optional context for connecting to an existing trace
 936            name: Name of the observation (e.g., function or operation name)
 937            as_type: Type of observation to create (defaults to "span")
 938            input: Input data for the operation (can be any JSON-serializable object)
 939            output: Output data from the operation (can be any JSON-serializable object)
 940            metadata: Additional metadata to associate with the observation
 941            version: Version identifier for the code or component
 942            level: Importance level of the observation (info, warning, error)
 943            status_message: Optional status message for the observation
 944            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 945
 946            The following parameters are available when as_type is: "generation" or "embedding".
 947            completion_start_time: When the model started generating the response
 948            model: Name/identifier of the AI model used (e.g., "gpt-4")
 949            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 950            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 951            cost_details: Cost information for the model call
 952            prompt: Associated prompt template from Langfuse prompt management
 953
 954        Returns:
 955            A context manager that yields the appropriate observation type based on as_type
 956
 957        Example:
 958            ```python
 959            # Create a span
 960            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
 961                # Do work
 962                result = process_data()
 963                span.update(output=result)
 964
 965                # Create a child span automatically
 966                with span.start_as_current_observation(name="sub-operation") as child_span:
 967                    # Do sub-operation work
 968                    child_span.update(output="sub-result")
 969
 970            # Create a tool observation
 971            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
 972                # Do tool work
 973                results = search_web(query)
 974                tool.update(output=results)
 975
 976            # Create a generation observation
 977            with langfuse.start_as_current_observation(
 978                name="answer-generation",
 979                as_type="generation",
 980                model="gpt-4"
 981            ) as generation:
 982                # Generate answer
 983                response = llm.generate(...)
 984                generation.update(output=response)
 985            ```
 986        """
 987        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 988            if trace_context:
 989                trace_id = trace_context.get("trace_id", None)
 990                parent_span_id = trace_context.get("parent_span_id", None)
 991
 992                if trace_id:
 993                    remote_parent_span = self._create_remote_parent_span(
 994                        trace_id=trace_id, parent_span_id=parent_span_id
 995                    )
 996
 997                    return cast(
 998                        Union[
 999                            _AgnosticContextManager[LangfuseGeneration],
1000                            _AgnosticContextManager[LangfuseEmbedding],
1001                        ],
1002                        self._create_span_with_parent_context(
1003                            as_type=as_type,
1004                            name=name,
1005                            remote_parent_span=remote_parent_span,
1006                            parent=None,
1007                            end_on_exit=end_on_exit,
1008                            input=input,
1009                            output=output,
1010                            metadata=metadata,
1011                            version=version,
1012                            level=level,
1013                            status_message=status_message,
1014                            completion_start_time=completion_start_time,
1015                            model=model,
1016                            model_parameters=model_parameters,
1017                            usage_details=usage_details,
1018                            cost_details=cost_details,
1019                            prompt=prompt,
1020                        ),
1021                    )
1022
1023            return cast(
1024                Union[
1025                    _AgnosticContextManager[LangfuseGeneration],
1026                    _AgnosticContextManager[LangfuseEmbedding],
1027                ],
1028                self._start_as_current_otel_span_with_processed_media(
1029                    as_type=as_type,
1030                    name=name,
1031                    end_on_exit=end_on_exit,
1032                    input=input,
1033                    output=output,
1034                    metadata=metadata,
1035                    version=version,
1036                    level=level,
1037                    status_message=status_message,
1038                    completion_start_time=completion_start_time,
1039                    model=model,
1040                    model_parameters=model_parameters,
1041                    usage_details=usage_details,
1042                    cost_details=cost_details,
1043                    prompt=prompt,
1044                ),
1045            )
1046
1047        if as_type in get_observation_types_list(ObservationTypeSpanLike):
1048            if trace_context:
1049                trace_id = trace_context.get("trace_id", None)
1050                parent_span_id = trace_context.get("parent_span_id", None)
1051
1052                if trace_id:
1053                    remote_parent_span = self._create_remote_parent_span(
1054                        trace_id=trace_id, parent_span_id=parent_span_id
1055                    )
1056
1057                    return cast(
1058                        Union[
1059                            _AgnosticContextManager[LangfuseSpan],
1060                            _AgnosticContextManager[LangfuseAgent],
1061                            _AgnosticContextManager[LangfuseTool],
1062                            _AgnosticContextManager[LangfuseChain],
1063                            _AgnosticContextManager[LangfuseRetriever],
1064                            _AgnosticContextManager[LangfuseEvaluator],
1065                            _AgnosticContextManager[LangfuseGuardrail],
1066                        ],
1067                        self._create_span_with_parent_context(
1068                            as_type=as_type,
1069                            name=name,
1070                            remote_parent_span=remote_parent_span,
1071                            parent=None,
1072                            end_on_exit=end_on_exit,
1073                            input=input,
1074                            output=output,
1075                            metadata=metadata,
1076                            version=version,
1077                            level=level,
1078                            status_message=status_message,
1079                        ),
1080                    )
1081
1082            return cast(
1083                Union[
1084                    _AgnosticContextManager[LangfuseSpan],
1085                    _AgnosticContextManager[LangfuseAgent],
1086                    _AgnosticContextManager[LangfuseTool],
1087                    _AgnosticContextManager[LangfuseChain],
1088                    _AgnosticContextManager[LangfuseRetriever],
1089                    _AgnosticContextManager[LangfuseEvaluator],
1090                    _AgnosticContextManager[LangfuseGuardrail],
1091                ],
1092                self._start_as_current_otel_span_with_processed_media(
1093                    as_type=as_type,
1094                    name=name,
1095                    end_on_exit=end_on_exit,
1096                    input=input,
1097                    output=output,
1098                    metadata=metadata,
1099                    version=version,
1100                    level=level,
1101                    status_message=status_message,
1102                ),
1103            )
1104
1105        # This should never be reached since all valid types are handled above
1106        langfuse_logger.warning(
1107            f"Unknown observation type: {as_type}, falling back to span"
1108        )
1109        return self._start_as_current_otel_span_with_processed_media(
1110            as_type="span",
1111            name=name,
1112            end_on_exit=end_on_exit,
1113            input=input,
1114            output=output,
1115            metadata=metadata,
1116            version=version,
1117            level=level,
1118            status_message=status_message,
1119        )
1120
1121    def _get_span_class(
1122        self,
1123        as_type: str,
1124    ) -> Union[
1125        Type[LangfuseAgent],
1126        Type[LangfuseTool],
1127        Type[LangfuseChain],
1128        Type[LangfuseRetriever],
1129        Type[LangfuseEvaluator],
1130        Type[LangfuseEmbedding],
1131        Type[LangfuseGuardrail],
1132        Type[LangfuseGeneration],
1133        Type[LangfuseEvent],
1134        Type[LangfuseSpan],
1135    ]:
1136        """Get the appropriate span class based on as_type."""
1137        normalized_type = as_type.lower()
1138
1139        if normalized_type == "agent":
1140            return LangfuseAgent
1141        elif normalized_type == "tool":
1142            return LangfuseTool
1143        elif normalized_type == "chain":
1144            return LangfuseChain
1145        elif normalized_type == "retriever":
1146            return LangfuseRetriever
1147        elif normalized_type == "evaluator":
1148            return LangfuseEvaluator
1149        elif normalized_type == "embedding":
1150            return LangfuseEmbedding
1151        elif normalized_type == "guardrail":
1152            return LangfuseGuardrail
1153        elif normalized_type == "generation":
1154            return LangfuseGeneration
1155        elif normalized_type == "event":
1156            return LangfuseEvent
1157        elif normalized_type == "span":
1158            return LangfuseSpan
1159        else:
1160            return LangfuseSpan
1161
1162    @staticmethod
1163    def _get_observation_type_from_otel_span(otel_span: otel_trace_api.Span) -> str:
1164        if not otel_span.is_recording():
1165            return "span"
1166
1167        attributes = getattr(otel_span, "attributes", None)
1168        if attributes is None or not hasattr(attributes, "get"):
1169            return "span"
1170
1171        observation_type = attributes.get(
1172            LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1173        )
1174
1175        return observation_type if isinstance(observation_type, str) else "span"
1176
1177    @_agnosticcontextmanager
1178    def _create_span_with_parent_context(
1179        self,
1180        *,
1181        name: str,
1182        parent: Optional[otel_trace_api.Span] = None,
1183        remote_parent_span: Optional[otel_trace_api.Span] = None,
1184        as_type: ObservationTypeLiteralNoEvent,
1185        end_on_exit: Optional[bool] = None,
1186        input: Optional[Any] = None,
1187        output: Optional[Any] = None,
1188        metadata: Optional[Any] = None,
1189        version: Optional[str] = None,
1190        level: Optional[SpanLevel] = None,
1191        status_message: Optional[str] = None,
1192        completion_start_time: Optional[datetime] = None,
1193        model: Optional[str] = None,
1194        model_parameters: Optional[Dict[str, MapValue]] = None,
1195        usage_details: Optional[Dict[str, int]] = None,
1196        cost_details: Optional[Dict[str, float]] = None,
1197        prompt: Optional[PromptClient] = None,
1198    ) -> Any:
1199        parent_span = parent or cast(otel_trace_api.Span, remote_parent_span)
1200
1201        with otel_trace_api.use_span(parent_span):
1202            with self._start_as_current_otel_span_with_processed_media(
1203                name=name,
1204                as_type=as_type,
1205                end_on_exit=end_on_exit,
1206                input=input,
1207                output=output,
1208                metadata=metadata,
1209                version=version,
1210                level=level,
1211                status_message=status_message,
1212                completion_start_time=completion_start_time,
1213                model=model,
1214                model_parameters=model_parameters,
1215                usage_details=usage_details,
1216                cost_details=cost_details,
1217                prompt=prompt,
1218            ) as langfuse_span:
1219                if remote_parent_span is not None:
1220                    langfuse_span._otel_span.set_attribute(
1221                        LangfuseOtelSpanAttributes.AS_ROOT, True
1222                    )
1223
1224                yield langfuse_span
1225
1226    @_agnosticcontextmanager
1227    def _start_as_current_otel_span_with_processed_media(
1228        self,
1229        *,
1230        name: str,
1231        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
1232        end_on_exit: Optional[bool] = None,
1233        input: Optional[Any] = None,
1234        output: Optional[Any] = None,
1235        metadata: Optional[Any] = None,
1236        version: Optional[str] = None,
1237        level: Optional[SpanLevel] = None,
1238        status_message: Optional[str] = None,
1239        completion_start_time: Optional[datetime] = None,
1240        model: Optional[str] = None,
1241        model_parameters: Optional[Dict[str, MapValue]] = None,
1242        usage_details: Optional[Dict[str, int]] = None,
1243        cost_details: Optional[Dict[str, float]] = None,
1244        prompt: Optional[PromptClient] = None,
1245    ) -> Any:
1246        with self._otel_tracer.start_as_current_span(
1247            name=name,
1248            end_on_exit=end_on_exit if end_on_exit is not None else True,
1249        ) as otel_span:
1250            baggage_token = None
1251
1252            if otel_span.is_recording():
1253                context_with_app_root_claim = _set_langfuse_trace_id_in_baggage(
1254                    trace_id=self._get_otel_trace_id(otel_span),
1255                    context=otel_context_api.get_current(),
1256                )
1257                baggage_token = otel_context_api.attach(context_with_app_root_claim)
1258
1259            span_class = self._get_span_class(
1260                as_type or "generation"
1261            )  # default was "generation"
1262
1263            try:
1264                common_args = {
1265                    "otel_span": otel_span,
1266                    "langfuse_client": self,
1267                    "environment": self._environment,
1268                    "release": self._release,
1269                    "input": input,
1270                    "output": output,
1271                    "metadata": metadata,
1272                    "version": version,
1273                    "level": level,
1274                    "status_message": status_message,
1275                }
1276
1277                if span_class in [
1278                    LangfuseGeneration,
1279                    LangfuseEmbedding,
1280                ]:
1281                    common_args.update(
1282                        {
1283                            "completion_start_time": completion_start_time,
1284                            "model": model,
1285                            "model_parameters": model_parameters,
1286                            "usage_details": usage_details,
1287                            "cost_details": cost_details,
1288                            "prompt": prompt,
1289                        }
1290                    )
1291                # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed
1292
1293                yield span_class(**common_args)  # type: ignore[arg-type]
1294
1295            finally:
1296                if baggage_token is not None:
1297                    _detach_context_token_safely(baggage_token)
1298
1299    def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]:
1300        current_span = otel_trace_api.get_current_span()
1301
1302        if current_span is otel_trace_api.INVALID_SPAN:
1303            langfuse_logger.warning(
1304                "Context error: No active span in current context. Operations that depend on an active span will be skipped. "
1305                "Ensure spans are created with start_as_current_observation() or that you're operating within an active span context."
1306            )
1307            return None
1308
1309        return current_span
1310
1311    def update_current_generation(
1312        self,
1313        *,
1314        name: Optional[str] = None,
1315        input: Optional[Any] = None,
1316        output: Optional[Any] = None,
1317        metadata: Optional[Any] = None,
1318        version: Optional[str] = None,
1319        level: Optional[SpanLevel] = None,
1320        status_message: Optional[str] = None,
1321        completion_start_time: Optional[datetime] = None,
1322        model: Optional[str] = None,
1323        model_parameters: Optional[Dict[str, MapValue]] = None,
1324        usage_details: Optional[Dict[str, int]] = None,
1325        cost_details: Optional[Dict[str, float]] = None,
1326        prompt: Optional[PromptClient] = None,
1327    ) -> None:
1328        """Update the current active generation span with new information.
1329
1330        This method updates the current generation span in the active context with
1331        additional information. It's useful for adding output, usage stats, or other
1332        details that become available during or after model generation.
1333
1334        Args:
1335            name: The generation name
1336            input: Updated input data for the model
1337            output: Output from the model (e.g., completions)
1338            metadata: Additional metadata to associate with the generation
1339            version: Version identifier for the model or component
1340            level: Importance level of the generation (info, warning, error)
1341            status_message: Optional status message for the generation
1342            completion_start_time: When the model started generating the response
1343            model: Name/identifier of the AI model used (e.g., "gpt-4")
1344            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1345            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1346            cost_details: Cost information for the model call
1347            prompt: Associated prompt template from Langfuse prompt management
1348
1349        Example:
1350            ```python
1351            with langfuse.start_as_current_generation(name="answer-query") as generation:
1352                # Initial setup and API call
1353                response = llm.generate(...)
1354
1355                # Update with results that weren't available at creation time
1356                langfuse.update_current_generation(
1357                    output=response.text,
1358                    usage_details={
1359                        "prompt_tokens": response.usage.prompt_tokens,
1360                        "completion_tokens": response.usage.completion_tokens
1361                    }
1362                )
1363            ```
1364        """
1365        if not self._tracing_enabled:
1366            langfuse_logger.debug(
1367                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1368            )
1369            return
1370
1371        current_otel_span = self._get_current_otel_span()
1372
1373        if current_otel_span is not None:
1374            generation = LangfuseGeneration(
1375                otel_span=current_otel_span, langfuse_client=self
1376            )
1377
1378            if name:
1379                current_otel_span.update_name(name)
1380
1381            generation.update(
1382                input=input,
1383                output=output,
1384                metadata=metadata,
1385                version=version,
1386                level=level,
1387                status_message=status_message,
1388                completion_start_time=completion_start_time,
1389                model=model,
1390                model_parameters=model_parameters,
1391                usage_details=usage_details,
1392                cost_details=cost_details,
1393                prompt=prompt,
1394            )
1395
1396    def update_current_span(
1397        self,
1398        *,
1399        name: Optional[str] = None,
1400        input: Optional[Any] = None,
1401        output: Optional[Any] = None,
1402        metadata: Optional[Any] = None,
1403        version: Optional[str] = None,
1404        level: Optional[SpanLevel] = None,
1405        status_message: Optional[str] = None,
1406    ) -> None:
1407        """Update the current active span with new information.
1408
1409        This method updates the current span in the active context with
1410        additional information. It's useful for adding outputs or metadata
1411        that become available during execution.
1412
1413        Args:
1414            name: The span name
1415            input: Updated input data for the operation
1416            output: Output data from the operation
1417            metadata: Additional metadata to associate with the span
1418            version: Version identifier for the code or component
1419            level: Importance level of the span (info, warning, error)
1420            status_message: Optional status message for the span
1421
1422        Example:
1423            ```python
1424            with langfuse.start_as_current_observation(name="process-data") as span:
1425                # Initial processing
1426                result = process_first_part()
1427
1428                # Update with intermediate results
1429                langfuse.update_current_span(metadata={"intermediate_result": result})
1430
1431                # Continue processing
1432                final_result = process_second_part(result)
1433
1434                # Final update
1435                langfuse.update_current_span(output=final_result)
1436            ```
1437        """
1438        if not self._tracing_enabled:
1439            langfuse_logger.debug(
1440                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1441            )
1442            return
1443
1444        current_otel_span = self._get_current_otel_span()
1445
1446        if current_otel_span is not None:
1447            span_class = self._get_span_class(
1448                self._get_observation_type_from_otel_span(current_otel_span)
1449            )
1450            span = span_class(
1451                otel_span=current_otel_span,
1452                langfuse_client=self,
1453                environment=self._environment,
1454                release=self._release,
1455            )
1456
1457            if name:
1458                current_otel_span.update_name(name)
1459
1460            span.update(
1461                input=input,
1462                output=output,
1463                metadata=metadata,
1464                version=version,
1465                level=level,
1466                status_message=status_message,
1467            )
1468
1469    @deprecated(
1470        "Trace-level input/output is deprecated. "
1471        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
1472        "This method will be removed in a future major version."
1473    )
1474    def set_current_trace_io(
1475        self,
1476        *,
1477        input: Optional[Any] = None,
1478        output: Optional[Any] = None,
1479    ) -> None:
1480        """Set trace-level input and output for the current span's trace.
1481
1482        .. deprecated::
1483            This is a legacy method for backward compatibility with Langfuse platform
1484            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
1485            evaluators). It will be removed in a future major version.
1486
1487            For setting other trace attributes (user_id, session_id, metadata, tags, version),
1488            use :meth:`propagate_attributes` instead.
1489
1490        Args:
1491            input: Input data to associate with the trace.
1492            output: Output data to associate with the trace.
1493        """
1494        if not self._tracing_enabled:
1495            langfuse_logger.debug(
1496                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
1497            )
1498            return
1499
1500        current_otel_span = self._get_current_otel_span()
1501
1502        if current_otel_span is not None and current_otel_span.is_recording():
1503            span_class = self._get_span_class(
1504                self._get_observation_type_from_otel_span(current_otel_span)
1505            )
1506            span = span_class(
1507                otel_span=current_otel_span,
1508                langfuse_client=self,
1509                environment=self._environment,
1510                release=self._release,
1511            )
1512
1513            span.set_trace_io(
1514                input=input,
1515                output=output,
1516            )
1517
1518    def set_current_trace_as_public(self) -> None:
1519        """Make the current trace publicly accessible via its URL.
1520
1521        When a trace is published, anyone with the trace link can view the full trace
1522        without needing to be logged in to Langfuse. This action cannot be undone
1523        programmatically - once published, the entire trace becomes public.
1524
1525        This is a convenience method that publishes the trace from the currently
1526        active span context. Use this when you want to make a trace public from
1527        within a traced function without needing direct access to the span object.
1528        """
1529        if not self._tracing_enabled:
1530            langfuse_logger.debug(
1531                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
1532            )
1533            return
1534
1535        current_otel_span = self._get_current_otel_span()
1536
1537        if current_otel_span is not None and current_otel_span.is_recording():
1538            span_class = self._get_span_class(
1539                self._get_observation_type_from_otel_span(current_otel_span)
1540            )
1541            span = span_class(
1542                otel_span=current_otel_span,
1543                langfuse_client=self,
1544                environment=self._environment,
1545            )
1546
1547            span.set_trace_as_public()
1548
1549    def create_event(
1550        self,
1551        *,
1552        trace_context: Optional[TraceContext] = None,
1553        name: str,
1554        input: Optional[Any] = None,
1555        output: Optional[Any] = None,
1556        metadata: Optional[Any] = None,
1557        version: Optional[str] = None,
1558        level: Optional[SpanLevel] = None,
1559        status_message: Optional[str] = None,
1560    ) -> LangfuseEvent:
1561        """Create a new Langfuse observation of type 'EVENT'.
1562
1563        The created Langfuse Event observation will be the child of the current span in the context.
1564
1565        Args:
1566            trace_context: Optional context for connecting to an existing trace
1567            name: Name of the span (e.g., function or operation name)
1568            input: Input data for the operation (can be any JSON-serializable object)
1569            output: Output data from the operation (can be any JSON-serializable object)
1570            metadata: Additional metadata to associate with the span
1571            version: Version identifier for the code or component
1572            level: Importance level of the span (info, warning, error)
1573            status_message: Optional status message for the span
1574
1575        Returns:
1576            The Langfuse Event object
1577
1578        Example:
1579            ```python
1580            event = langfuse.create_event(name="process-event")
1581            ```
1582        """
1583        timestamp = time_ns()
1584
1585        if trace_context:
1586            trace_id = trace_context.get("trace_id", None)
1587            parent_span_id = trace_context.get("parent_span_id", None)
1588
1589            if trace_id:
1590                remote_parent_span = self._create_remote_parent_span(
1591                    trace_id=trace_id, parent_span_id=parent_span_id
1592                )
1593
1594                with otel_trace_api.use_span(
1595                    cast(otel_trace_api.Span, remote_parent_span)
1596                ):
1597                    otel_span = self._otel_tracer.start_span(
1598                        name=name, start_time=timestamp
1599                    )
1600                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1601
1602                    return cast(
1603                        LangfuseEvent,
1604                        LangfuseEvent(
1605                            otel_span=otel_span,
1606                            langfuse_client=self,
1607                            environment=self._environment,
1608                            release=self._release,
1609                            input=input,
1610                            output=output,
1611                            metadata=metadata,
1612                            version=version,
1613                            level=level,
1614                            status_message=status_message,
1615                        ).end(end_time=timestamp),
1616                    )
1617
1618        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1619
1620        return cast(
1621            LangfuseEvent,
1622            LangfuseEvent(
1623                otel_span=otel_span,
1624                langfuse_client=self,
1625                environment=self._environment,
1626                release=self._release,
1627                input=input,
1628                output=output,
1629                metadata=metadata,
1630                version=version,
1631                level=level,
1632                status_message=status_message,
1633            ).end(end_time=timestamp),
1634        )
1635
1636    def _create_remote_parent_span(
1637        self, *, trace_id: str, parent_span_id: Optional[str]
1638    ) -> Any:
1639        if not self._is_valid_trace_id(trace_id):
1640            langfuse_logger.warning(
1641                f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID."
1642            )
1643
1644        if parent_span_id and not self._is_valid_span_id(parent_span_id):
1645            langfuse_logger.warning(
1646                f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID."
1647            )
1648
1649        int_trace_id = int(trace_id, 16)
1650        int_parent_span_id = (
1651            int(parent_span_id, 16)
1652            if parent_span_id
1653            else RandomIdGenerator().generate_span_id()
1654        )
1655
1656        span_context = otel_trace_api.SpanContext(
1657            trace_id=int_trace_id,
1658            span_id=int_parent_span_id,
1659            trace_flags=otel_trace_api.TraceFlags(0x01),  # mark span as sampled
1660            is_remote=False,
1661        )
1662
1663        return otel_trace_api.NonRecordingSpan(span_context)
1664
1665    def _is_valid_trace_id(self, trace_id: str) -> bool:
1666        pattern = r"^[0-9a-f]{32}$"
1667
1668        return bool(re.match(pattern, trace_id))
1669
1670    def _is_valid_span_id(self, span_id: str) -> bool:
1671        pattern = r"^[0-9a-f]{16}$"
1672
1673        return bool(re.match(pattern, span_id))
1674
1675    def _create_observation_id(self, *, seed: Optional[str] = None) -> str:
1676        """Create a unique observation ID for use with Langfuse.
1677
1678        This method generates a unique observation ID (span ID in OpenTelemetry terms)
1679        for use with various Langfuse APIs. It can either generate a random ID or
1680        create a deterministic ID based on a seed string.
1681
1682        Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes.
1683        This method ensures the generated ID meets this requirement. If you need to
1684        correlate an external ID with a Langfuse observation ID, use the external ID as
1685        the seed to get a valid, deterministic observation ID.
1686
1687        Args:
1688            seed: Optional string to use as a seed for deterministic ID generation.
1689                 If provided, the same seed will always produce the same ID.
1690                 If not provided, a random ID will be generated.
1691
1692        Returns:
1693            A 16-character lowercase hexadecimal string representing the observation ID.
1694
1695        Example:
1696            ```python
1697            # Generate a random observation ID
1698            obs_id = langfuse.create_observation_id()
1699
1700            # Generate a deterministic ID based on a seed
1701            user_obs_id = langfuse.create_observation_id(seed="user-123-feedback")
1702
1703            # Correlate an external item ID with a Langfuse observation ID
1704            item_id = "item-789012"
1705            correlated_obs_id = langfuse.create_observation_id(seed=item_id)
1706
1707            # Use the ID with Langfuse APIs
1708            langfuse.create_score(
1709                name="relevance",
1710                value=0.95,
1711                trace_id=trace_id,
1712                observation_id=obs_id
1713            )
1714            ```
1715        """
1716        if not seed:
1717            span_id_int = RandomIdGenerator().generate_span_id()
1718
1719            return self._format_otel_span_id(span_id_int)
1720
1721        return sha256(seed.encode("utf-8")).digest()[:8].hex()
1722
1723    @staticmethod
1724    def create_trace_id(*, seed: Optional[str] = None) -> str:
1725        """Create a unique trace ID for use with Langfuse.
1726
1727        This method generates a unique trace ID for use with various Langfuse APIs.
1728        It can either generate a random ID or create a deterministic ID based on
1729        a seed string.
1730
1731        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1732        This method ensures the generated ID meets this requirement. If you need to
1733        correlate an external ID with a Langfuse trace ID, use the external ID as the
1734        seed to get a valid, deterministic Langfuse trace ID.
1735
1736        Args:
1737            seed: Optional string to use as a seed for deterministic ID generation.
1738                 If provided, the same seed will always produce the same ID.
1739                 If not provided, a random ID will be generated.
1740
1741        Returns:
1742            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1743
1744        Example:
1745            ```python
1746            # Generate a random trace ID
1747            trace_id = langfuse.create_trace_id()
1748
1749            # Generate a deterministic ID based on a seed
1750            session_trace_id = langfuse.create_trace_id(seed="session-456")
1751
1752            # Correlate an external ID with a Langfuse trace ID
1753            external_id = "external-system-123456"
1754            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1755
1756            # Use the ID with trace context
1757            with langfuse.start_as_current_observation(
1758                name="process-request",
1759                trace_context={"trace_id": trace_id}
1760            ) as span:
1761                # Operation will be part of the specific trace
1762                pass
1763            ```
1764        """
1765        if not seed:
1766            trace_id_int = RandomIdGenerator().generate_trace_id()
1767
1768            return Langfuse._format_otel_trace_id(trace_id_int)
1769
1770        return sha256(seed.encode("utf-8")).digest()[:16].hex()
1771
1772    def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str:
1773        span_context = otel_span.get_span_context()
1774
1775        return self._format_otel_trace_id(span_context.trace_id)
1776
1777    def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str:
1778        span_context = otel_span.get_span_context()
1779
1780        return self._format_otel_span_id(span_context.span_id)
1781
1782    @staticmethod
1783    def _format_otel_span_id(span_id_int: int) -> str:
1784        """Format an integer span ID to a 16-character lowercase hex string.
1785
1786        Internal method to convert an OpenTelemetry integer span ID to the standard
1787        W3C Trace Context format (16-character lowercase hex string).
1788
1789        Args:
1790            span_id_int: 64-bit integer representing a span ID
1791
1792        Returns:
1793            A 16-character lowercase hexadecimal string
1794        """
1795        return format(span_id_int, "016x")
1796
1797    @staticmethod
1798    def _format_otel_trace_id(trace_id_int: int) -> str:
1799        """Format an integer trace ID to a 32-character lowercase hex string.
1800
1801        Internal method to convert an OpenTelemetry integer trace ID to the standard
1802        W3C Trace Context format (32-character lowercase hex string).
1803
1804        Args:
1805            trace_id_int: 128-bit integer representing a trace ID
1806
1807        Returns:
1808            A 32-character lowercase hexadecimal string
1809        """
1810        return format(trace_id_int, "032x")
1811
1812    @overload
1813    def create_score(
1814        self,
1815        *,
1816        name: str,
1817        value: float,
1818        session_id: Optional[str] = None,
1819        dataset_run_id: Optional[str] = None,
1820        trace_id: Optional[str] = None,
1821        observation_id: Optional[str] = None,
1822        score_id: Optional[str] = None,
1823        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1824        comment: Optional[str] = None,
1825        config_id: Optional[str] = None,
1826        metadata: Optional[Any] = None,
1827        timestamp: Optional[datetime] = None,
1828        environment: Optional[str] = None,
1829    ) -> None: ...
1830
1831    @overload
1832    def create_score(
1833        self,
1834        *,
1835        name: str,
1836        value: str,
1837        session_id: Optional[str] = None,
1838        dataset_run_id: Optional[str] = None,
1839        trace_id: Optional[str] = None,
1840        score_id: Optional[str] = None,
1841        observation_id: Optional[str] = None,
1842        data_type: Optional[
1843            Literal["CATEGORICAL", "TEXT", "CORRECTION"]
1844        ] = "CATEGORICAL",
1845        comment: Optional[str] = None,
1846        config_id: Optional[str] = None,
1847        metadata: Optional[Any] = None,
1848        timestamp: Optional[datetime] = None,
1849        environment: Optional[str] = None,
1850    ) -> None: ...
1851
1852    def create_score(
1853        self,
1854        *,
1855        name: str,
1856        value: Union[float, str],
1857        session_id: Optional[str] = None,
1858        dataset_run_id: Optional[str] = None,
1859        trace_id: Optional[str] = None,
1860        observation_id: Optional[str] = None,
1861        score_id: Optional[str] = None,
1862        data_type: Optional[ScoreDataType] = None,
1863        comment: Optional[str] = None,
1864        config_id: Optional[str] = None,
1865        metadata: Optional[Any] = None,
1866        timestamp: Optional[datetime] = None,
1867        environment: Optional[str] = None,
1868    ) -> None:
1869        """Create a score for a specific trace or observation.
1870
1871        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
1872        used to track quality metrics, user feedback, or automated evaluations.
1873
1874        Args:
1875            name: Name of the score (e.g., "relevance", "accuracy")
1876            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
1877            session_id: ID of the Langfuse session to associate the score with
1878            dataset_run_id: ID of the Langfuse dataset run to associate the score with
1879            trace_id: ID of the Langfuse trace to associate the score with
1880            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
1881            score_id: Optional custom ID for the score (auto-generated if not provided)
1882            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
1883            comment: Optional comment or explanation for the score
1884            config_id: Optional ID of a score config defined in Langfuse
1885            metadata: Optional metadata to be attached to the score
1886            timestamp: Optional timestamp for the score (defaults to current UTC time)
1887            environment: Optional environment override for this score. If omitted,
1888                the score uses the client-level environment from
1889                `Langfuse(environment=...)` or `LANGFUSE_TRACING_ENVIRONMENT`.
1890                Langfuse observation wrapper methods pass their resolved span
1891                environment here so scores created via `span.score()` or
1892                `span.score_trace()` stay grouped with the scored observation or
1893                trace, including request-scoped environments propagated with
1894                `propagate_attributes(environment=...)`.
1895
1896        Example:
1897            ```python
1898            # Create a numeric score for accuracy
1899            langfuse.create_score(
1900                name="accuracy",
1901                value=0.92,
1902                trace_id="abcdef1234567890abcdef1234567890",
1903                data_type="NUMERIC",
1904                comment="High accuracy with minor irrelevant details"
1905            )
1906
1907            # Create a categorical score for sentiment
1908            langfuse.create_score(
1909                name="sentiment",
1910                value="positive",
1911                trace_id="abcdef1234567890abcdef1234567890",
1912                observation_id="abcdef1234567890",
1913                data_type="CATEGORICAL"
1914            )
1915            ```
1916        """
1917        if not self._tracing_enabled:
1918            return
1919
1920        score_id = score_id or self._create_observation_id()
1921
1922        try:
1923            new_body = ScoreBody(
1924                id=score_id,
1925                sessionId=session_id,
1926                datasetRunId=dataset_run_id,
1927                traceId=trace_id,
1928                observationId=observation_id,
1929                name=name,
1930                value=value,
1931                dataType=data_type,  # type: ignore
1932                comment=comment,
1933                configId=config_id,
1934                environment=environment or self._environment,
1935                metadata=metadata,
1936            )
1937
1938            event = {
1939                "id": self.create_trace_id(),
1940                "type": "score-create",
1941                "timestamp": timestamp or _get_timestamp(),
1942                "body": new_body,
1943            }
1944
1945            if self._resources is not None:
1946                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
1947                force_sample = (
1948                    not self._is_valid_trace_id(trace_id) if trace_id else True
1949                )
1950
1951                self._resources.add_score_task(
1952                    event,
1953                    force_sample=force_sample,
1954                )
1955
1956        except Exception as e:
1957            langfuse_logger.exception(
1958                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
1959            )
1960
1961    def _create_trace_tags_via_ingestion(
1962        self,
1963        *,
1964        trace_id: str,
1965        tags: List[str],
1966    ) -> None:
1967        """Private helper to enqueue trace tag updates via ingestion API events."""
1968        if not self._tracing_enabled:
1969            return
1970
1971        if len(tags) == 0:
1972            return
1973
1974        try:
1975            new_body = TraceBody(
1976                id=trace_id,
1977                tags=tags,
1978            )
1979
1980            event = {
1981                "id": self.create_trace_id(),
1982                "type": "trace-create",
1983                "timestamp": _get_timestamp(),
1984                "body": new_body,
1985            }
1986
1987            if self._resources is not None:
1988                self._resources.add_trace_task(event)
1989        except Exception as e:
1990            langfuse_logger.exception(
1991                f"Error updating trace tags: Failed to process trace update event for trace_id={trace_id}. Error: {e}"
1992            )
1993
1994    @overload
1995    def score_current_span(
1996        self,
1997        *,
1998        name: str,
1999        value: float,
2000        score_id: Optional[str] = None,
2001        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
2002        comment: Optional[str] = None,
2003        config_id: Optional[str] = None,
2004        metadata: Optional[Any] = None,
2005    ) -> None: ...
2006
2007    @overload
2008    def score_current_span(
2009        self,
2010        *,
2011        name: str,
2012        value: str,
2013        score_id: Optional[str] = None,
2014        data_type: Optional[
2015            Literal["CATEGORICAL", "TEXT", "CORRECTION"]
2016        ] = "CATEGORICAL",
2017        comment: Optional[str] = None,
2018        config_id: Optional[str] = None,
2019        metadata: Optional[Any] = None,
2020    ) -> None: ...
2021
2022    def score_current_span(
2023        self,
2024        *,
2025        name: str,
2026        value: Union[float, str],
2027        score_id: Optional[str] = None,
2028        data_type: Optional[ScoreDataType] = None,
2029        comment: Optional[str] = None,
2030        config_id: Optional[str] = None,
2031        metadata: Optional[Any] = None,
2032    ) -> None:
2033        """Create a score for the current active span.
2034
2035        This method scores the currently active span in the context. It's a convenient
2036        way to score the current operation without needing to know its trace and span IDs.
2037        If the active span has a `langfuse.environment` attribute, including one
2038        set by `propagate_attributes(environment=...)`, the score uses that
2039        environment. Otherwise it uses the client-level environment.
2040
2041        Args:
2042            name: Name of the score (e.g., "relevance", "accuracy")
2043            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
2044            score_id: Optional custom ID for the score (auto-generated if not provided)
2045            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
2046            comment: Optional comment or explanation for the score
2047            config_id: Optional ID of a score config defined in Langfuse
2048            metadata: Optional metadata to be attached to the score
2049
2050        Example:
2051            ```python
2052            with langfuse.start_as_current_generation(name="answer-query") as generation:
2053                # Generate answer
2054                response = generate_answer(...)
2055                generation.update(output=response)
2056
2057                # Score the generation
2058                langfuse.score_current_span(
2059                    name="relevance",
2060                    value=0.85,
2061                    data_type="NUMERIC",
2062                    comment="Mostly relevant but contains some tangential information",
2063                    metadata={"model": "gpt-4", "prompt_version": "v2"}
2064                )
2065            ```
2066        """
2067        current_span = self._get_current_otel_span()
2068
2069        if current_span is not None:
2070            trace_id = self._get_otel_trace_id(current_span)
2071            observation_id = self._get_otel_span_id(current_span)
2072
2073            langfuse_logger.info(
2074                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
2075            )
2076
2077            self.create_score(
2078                trace_id=trace_id,
2079                observation_id=observation_id,
2080                name=name,
2081                value=cast(str, value),
2082                score_id=score_id,
2083                data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
2084                comment=comment,
2085                config_id=config_id,
2086                metadata=metadata,
2087                environment=get_string_span_attribute(
2088                    current_span, LangfuseOtelSpanAttributes.ENVIRONMENT
2089                ),
2090            )
2091
2092    @overload
2093    def score_current_trace(
2094        self,
2095        *,
2096        name: str,
2097        value: float,
2098        score_id: Optional[str] = None,
2099        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
2100        comment: Optional[str] = None,
2101        config_id: Optional[str] = None,
2102        metadata: Optional[Any] = None,
2103    ) -> None: ...
2104
2105    @overload
2106    def score_current_trace(
2107        self,
2108        *,
2109        name: str,
2110        value: str,
2111        score_id: Optional[str] = None,
2112        data_type: Optional[
2113            Literal["CATEGORICAL", "TEXT", "CORRECTION"]
2114        ] = "CATEGORICAL",
2115        comment: Optional[str] = None,
2116        config_id: Optional[str] = None,
2117        metadata: Optional[Any] = None,
2118    ) -> None: ...
2119
2120    def score_current_trace(
2121        self,
2122        *,
2123        name: str,
2124        value: Union[float, str],
2125        score_id: Optional[str] = None,
2126        data_type: Optional[ScoreDataType] = None,
2127        comment: Optional[str] = None,
2128        config_id: Optional[str] = None,
2129        metadata: Optional[Any] = None,
2130    ) -> None:
2131        """Create a score for the current trace.
2132
2133        This method scores the trace of the currently active span. Unlike score_current_span,
2134        this method associates the score with the entire trace rather than a specific span.
2135        It's useful for scoring overall performance or quality of the entire operation.
2136        If the active span has a `langfuse.environment` attribute, including one
2137        set by `propagate_attributes(environment=...)`, the score uses that
2138        environment. Otherwise it uses the client-level environment.
2139
2140        Args:
2141            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2142            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
2143            score_id: Optional custom ID for the score (auto-generated if not provided)
2144            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
2145            comment: Optional comment or explanation for the score
2146            config_id: Optional ID of a score config defined in Langfuse
2147            metadata: Optional metadata to be attached to the score
2148
2149        Example:
2150            ```python
2151            with langfuse.start_as_current_observation(name="process-user-request") as span:
2152                # Process request
2153                result = process_complete_request()
2154                span.update(output=result)
2155
2156                # Score the overall trace
2157                langfuse.score_current_trace(
2158                    name="overall_quality",
2159                    value=0.95,
2160                    data_type="NUMERIC",
2161                    comment="High quality end-to-end response",
2162                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2163                )
2164            ```
2165        """
2166        current_span = self._get_current_otel_span()
2167
2168        if current_span is not None:
2169            trace_id = self._get_otel_trace_id(current_span)
2170
2171            langfuse_logger.info(
2172                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2173            )
2174
2175            self.create_score(
2176                trace_id=trace_id,
2177                name=name,
2178                value=cast(str, value),
2179                score_id=score_id,
2180                data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
2181                comment=comment,
2182                config_id=config_id,
2183                metadata=metadata,
2184                environment=get_string_span_attribute(
2185                    current_span, LangfuseOtelSpanAttributes.ENVIRONMENT
2186                ),
2187            )
2188
2189    def flush(self) -> None:
2190        """Force flush all pending spans and events to the Langfuse API.
2191
2192        This method manually flushes any pending spans, scores, and other events to the
2193        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2194        before proceeding, without waiting for the automatic flush interval.
2195
2196        Example:
2197            ```python
2198            # Record some spans and scores
2199            with langfuse.start_as_current_observation(name="operation") as span:
2200                # Do work...
2201                pass
2202
2203            # Ensure all data is sent to Langfuse before proceeding
2204            langfuse.flush()
2205
2206            # Continue with other work
2207            ```
2208        """
2209        if self._resources is not None:
2210            self._resources.flush()
2211
2212    def shutdown(self) -> None:
2213        """Shut down the Langfuse client and flush all pending data.
2214
2215        This method cleanly shuts down the Langfuse client, ensuring all pending data
2216        is flushed to the API and all background threads are properly terminated.
2217
2218        It's important to call this method when your application is shutting down to
2219        prevent data loss and resource leaks. For most applications, using the client
2220        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2221
2222        Example:
2223            ```python
2224            # Initialize Langfuse
2225            langfuse = Langfuse(public_key="...", secret_key="...")
2226
2227            # Use Langfuse throughout your application
2228            # ...
2229
2230            # When application is shutting down
2231            langfuse.shutdown()
2232            ```
2233        """
2234        if self._resources is not None:
2235            self._resources.shutdown()
2236
2237    def get_current_trace_id(self) -> Optional[str]:
2238        """Get the trace ID of the current active span.
2239
2240        This method retrieves the trace ID from the currently active span in the context.
2241        It can be used to get the trace ID for referencing in logs, external systems,
2242        or for creating related operations.
2243
2244        Returns:
2245            The current trace ID as a 32-character lowercase hexadecimal string,
2246            or None if there is no active span.
2247
2248        Example:
2249            ```python
2250            with langfuse.start_as_current_observation(name="process-request") as span:
2251                # Get the current trace ID for reference
2252                trace_id = langfuse.get_current_trace_id()
2253
2254                # Use it for external correlation
2255                log.info(f"Processing request with trace_id: {trace_id}")
2256
2257                # Or pass to another system
2258                external_system.process(data, trace_id=trace_id)
2259            ```
2260        """
2261        if not self._tracing_enabled:
2262            langfuse_logger.debug(
2263                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2264            )
2265            return None
2266
2267        current_otel_span = self._get_current_otel_span()
2268
2269        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None
2270
2271    def get_current_observation_id(self) -> Optional[str]:
2272        """Get the observation ID (span ID) of the current active span.
2273
2274        This method retrieves the observation ID from the currently active span in the context.
2275        It can be used to get the observation ID for referencing in logs, external systems,
2276        or for creating scores or other related operations.
2277
2278        Returns:
2279            The current observation ID as a 16-character lowercase hexadecimal string,
2280            or None if there is no active span.
2281
2282        Example:
2283            ```python
2284            with langfuse.start_as_current_observation(name="process-user-query") as span:
2285                # Get the current observation ID
2286                observation_id = langfuse.get_current_observation_id()
2287
2288                # Store it for later reference
2289                cache.set(f"query_{query_id}_observation", observation_id)
2290
2291                # Process the query...
2292            ```
2293        """
2294        if not self._tracing_enabled:
2295            langfuse_logger.debug(
2296                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2297            )
2298            return None
2299
2300        current_otel_span = self._get_current_otel_span()
2301
2302        return self._get_otel_span_id(current_otel_span) if current_otel_span else None
2303
2304    def _get_project_id(self) -> Optional[str]:
2305        """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for api keys."""
2306        if not self._project_id:
2307            proj = self.api.projects.get()
2308            if not proj.data or not proj.data[0].id:
2309                return None
2310
2311            self._project_id = proj.data[0].id
2312
2313        return self._project_id
2314
2315    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2316        """Get the URL to view a trace in the Langfuse UI.
2317
2318        This method generates a URL that links directly to a trace in the Langfuse UI.
2319        It's useful for providing links in logs, notifications, or debugging tools.
2320
2321        Args:
2322            trace_id: Optional trace ID to generate a URL for. If not provided,
2323                     the trace ID of the current active span will be used.
2324
2325        Returns:
2326            A URL string pointing to the trace in the Langfuse UI,
2327            or None if the project ID couldn't be retrieved or no trace ID is available.
2328
2329        Example:
2330            ```python
2331            # Get URL for the current trace
2332            with langfuse.start_as_current_observation(name="process-request") as span:
2333                trace_url = langfuse.get_trace_url()
2334                log.info(f"Processing trace: {trace_url}")
2335
2336            # Get URL for a specific trace
2337            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2338            send_notification(f"Review needed for trace: {specific_trace_url}")
2339            ```
2340        """
2341        final_trace_id = trace_id or self.get_current_trace_id()
2342        if not final_trace_id:
2343            return None
2344
2345        project_id = self._get_project_id()
2346
2347        return (
2348            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2349            if project_id and final_trace_id
2350            else None
2351        )
2352
2353    def get_dataset(
2354        self,
2355        name: str,
2356        *,
2357        fetch_items_page_size: Optional[int] = 50,
2358        version: Optional[datetime] = None,
2359    ) -> "DatasetClient":
2360        """Fetch a dataset by its name.
2361
2362        Args:
2363            name: The name of the dataset to fetch.
2364            fetch_items_page_size: All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2365            version: Retrieve dataset items as they existed at this specific point in time (UTC).
2366                If provided, returns the state of items at the specified UTC timestamp.
2367                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
2368
2369        Returns:
2370            DatasetClient: The dataset with the given name.
2371        """
2372        try:
2373            langfuse_logger.debug(f"Getting datasets {name}")
2374            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2375
2376            dataset_items: List[DatasetItem] = []
2377            page = 1
2378
2379            while True:
2380                new_items = self.api.dataset_items.list(
2381                    dataset_name=self._url_encode(name, is_url_param=True),
2382                    page=page,
2383                    limit=fetch_items_page_size,
2384                    version=version,
2385                )
2386                dataset_items.extend(
2387                    self._hydrate_dataset_item_media_references(item)
2388                    for item in new_items.data
2389                )
2390
2391                if new_items.meta.total_pages <= page:
2392                    break
2393
2394                page += 1
2395
2396            return DatasetClient(
2397                dataset=dataset,
2398                items=dataset_items,
2399                version=version,
2400                langfuse_client=self,
2401            )
2402
2403        except Error as e:
2404            handle_fern_exception(e)
2405            raise e
2406
2407    def get_dataset_run(
2408        self, *, dataset_name: str, run_name: str
2409    ) -> DatasetRunWithItems:
2410        """Fetch a dataset run by dataset name and run name.
2411
2412        Args:
2413            dataset_name (str): The name of the dataset.
2414            run_name (str): The name of the run.
2415
2416        Returns:
2417            DatasetRunWithItems: The dataset run with its items.
2418        """
2419        try:
2420            return cast(
2421                DatasetRunWithItems,
2422                self.api.datasets.get_run(
2423                    dataset_name=self._url_encode(dataset_name),
2424                    run_name=self._url_encode(run_name),
2425                    request_options=None,
2426                ),
2427            )
2428        except Error as e:
2429            handle_fern_exception(e)
2430            raise e
2431
2432    def get_dataset_runs(
2433        self,
2434        *,
2435        dataset_name: str,
2436        page: Optional[int] = None,
2437        limit: Optional[int] = None,
2438    ) -> PaginatedDatasetRuns:
2439        """Fetch all runs for a dataset.
2440
2441        Args:
2442            dataset_name (str): The name of the dataset.
2443            page (Optional[int]): Page number, starts at 1.
2444            limit (Optional[int]): Limit of items per page.
2445
2446        Returns:
2447            PaginatedDatasetRuns: Paginated list of dataset runs.
2448        """
2449        try:
2450            return cast(
2451                PaginatedDatasetRuns,
2452                self.api.datasets.get_runs(
2453                    dataset_name=self._url_encode(dataset_name),
2454                    page=page,
2455                    limit=limit,
2456                    request_options=None,
2457                ),
2458            )
2459        except Error as e:
2460            handle_fern_exception(e)
2461            raise e
2462
2463    def delete_dataset_run(
2464        self, *, dataset_name: str, run_name: str
2465    ) -> DeleteDatasetRunResponse:
2466        """Delete a dataset run and all its run items. This action is irreversible.
2467
2468        Args:
2469            dataset_name (str): The name of the dataset.
2470            run_name (str): The name of the run.
2471
2472        Returns:
2473            DeleteDatasetRunResponse: Confirmation of deletion.
2474        """
2475        try:
2476            return cast(
2477                DeleteDatasetRunResponse,
2478                self.api.datasets.delete_run(
2479                    dataset_name=self._url_encode(dataset_name),
2480                    run_name=self._url_encode(run_name),
2481                    request_options=None,
2482                ),
2483            )
2484        except Error as e:
2485            handle_fern_exception(e)
2486            raise e
2487
2488    def run_experiment(
2489        self,
2490        *,
2491        name: str,
2492        run_name: Optional[str] = None,
2493        description: Optional[str] = None,
2494        data: ExperimentData,
2495        task: TaskFunction,
2496        evaluators: List[EvaluatorFunction] = [],
2497        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2498        run_evaluators: List[RunEvaluatorFunction] = [],
2499        max_concurrency: int = 50,
2500        metadata: Optional[Dict[str, str]] = None,
2501        _dataset_version: Optional[datetime] = None,
2502    ) -> ExperimentResult:
2503        """Run an experiment on a dataset with automatic tracing and evaluation.
2504
2505        This method executes a task function on each item in the provided dataset,
2506        automatically traces all executions with Langfuse for observability, runs
2507        item-level and run-level evaluators on the outputs, and returns comprehensive
2508        results with evaluation metrics.
2509
2510        The experiment system provides:
2511        - Automatic tracing of all task executions
2512        - Concurrent processing with configurable limits
2513        - Comprehensive error handling that isolates failures
2514        - Integration with Langfuse datasets for experiment tracking
2515        - Flexible evaluation framework supporting both sync and async evaluators
2516
2517        Args:
2518            name: Human-readable name for the experiment. Used for identification
2519                in the Langfuse UI.
2520            run_name: Optional exact name for the experiment run. If provided, this will be
2521                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2522                If not provided, this will default to the experiment name appended with an ISO timestamp.
2523            description: Optional description explaining the experiment's purpose,
2524                methodology, or expected outcomes.
2525            data: Array of data items to process. Can be either:
2526                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2527                - List of Langfuse DatasetItem objects from dataset.items
2528            task: Function that processes each data item and returns output.
2529                Must accept 'item' as keyword argument and can return sync or async results.
2530                The task function signature should be: task(*, item, **kwargs) -> Any
2531            evaluators: List of functions to evaluate each item's output individually.
2532                Each evaluator receives input, output, expected_output, and metadata.
2533                Can return single Evaluation dict or list of Evaluation dicts.
2534            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2535                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2536                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2537                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2538            run_evaluators: List of functions to evaluate the entire experiment run.
2539                Each run evaluator receives all item_results and can compute aggregate metrics.
2540                Useful for calculating averages, distributions, or cross-item comparisons.
2541            max_concurrency: Maximum number of concurrent task executions (default: 50).
2542                Controls the number of items processed simultaneously. Adjust based on
2543                API rate limits and system resources.
2544            metadata: Optional metadata dictionary to attach to all experiment traces.
2545                This metadata will be included in every trace created during the experiment.
2546                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2547
2548        Returns:
2549            ExperimentResult containing:
2550            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2551            - item_results: List of results for each processed item with outputs and evaluations
2552            - run_evaluations: List of aggregate evaluation results for the entire run
2553            - experiment_id: Stable identifier for the experiment run across all items
2554            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2555            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2556
2557        Raises:
2558            ValueError: If required parameters are missing or invalid
2559            Exception: If experiment setup fails (individual item failures are handled gracefully)
2560
2561        Examples:
2562            Basic experiment with local data:
2563            ```python
2564            def summarize_text(*, item, **kwargs):
2565                return f"Summary: {item['input'][:50]}..."
2566
2567            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2568                return {
2569                    "name": "output_length",
2570                    "value": len(output),
2571                    "comment": f"Output contains {len(output)} characters"
2572                }
2573
2574            result = langfuse.run_experiment(
2575                name="Text Summarization Test",
2576                description="Evaluate summarization quality and length",
2577                data=[
2578                    {"input": "Long article text...", "expected_output": "Expected summary"},
2579                    {"input": "Another article...", "expected_output": "Another summary"}
2580                ],
2581                task=summarize_text,
2582                evaluators=[length_evaluator]
2583            )
2584
2585            print(f"Processed {len(result.item_results)} items")
2586            for item_result in result.item_results:
2587                print(f"Input: {item_result.item['input']}")
2588                print(f"Output: {item_result.output}")
2589                print(f"Evaluations: {item_result.evaluations}")
2590            ```
2591
2592            Advanced experiment with async task and multiple evaluators:
2593            ```python
2594            async def llm_task(*, item, **kwargs):
2595                # Simulate async LLM call
2596                response = await openai_client.chat.completions.create(
2597                    model="gpt-4",
2598                    messages=[{"role": "user", "content": item["input"]}]
2599                )
2600                return response.choices[0].message.content
2601
2602            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2603                if expected_output and expected_output.lower() in output.lower():
2604                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2605                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2606
2607            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2608                # Simulate toxicity check
2609                toxicity_score = check_toxicity(output)  # Your toxicity checker
2610                return {
2611                    "name": "toxicity",
2612                    "value": toxicity_score,
2613                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2614                }
2615
2616            def average_accuracy(*, item_results, **kwargs):
2617                accuracies = [
2618                    eval.value for result in item_results
2619                    for eval in result.evaluations
2620                    if eval.name == "accuracy"
2621                ]
2622                return {
2623                    "name": "average_accuracy",
2624                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2625                    "comment": f"Average accuracy across {len(accuracies)} items"
2626                }
2627
2628            result = langfuse.run_experiment(
2629                name="LLM Safety and Accuracy Test",
2630                description="Evaluate model accuracy and safety across diverse prompts",
2631                data=test_dataset,  # Your dataset items
2632                task=llm_task,
2633                evaluators=[accuracy_evaluator, toxicity_evaluator],
2634                run_evaluators=[average_accuracy],
2635                max_concurrency=5,  # Limit concurrent API calls
2636                metadata={"model": "gpt-4", "temperature": 0.7}
2637            )
2638            ```
2639
2640            Using with Langfuse datasets:
2641            ```python
2642            # Get dataset from Langfuse
2643            dataset = langfuse.get_dataset("my-eval-dataset")
2644
2645            result = dataset.run_experiment(
2646                name="Production Model Evaluation",
2647                description="Monthly evaluation of production model performance",
2648                task=my_production_task,
2649                evaluators=[accuracy_evaluator, latency_evaluator]
2650            )
2651
2652            # Results automatically linked to dataset in Langfuse UI
2653            print(f"View results: {result['dataset_run_url']}")
2654            ```
2655
2656        Note:
2657            - Task and evaluator functions can be either synchronous or asynchronous
2658            - Individual item failures are logged but don't stop the experiment
2659            - All executions are automatically traced and visible in Langfuse UI
2660            - When using Langfuse datasets, results are automatically linked for easy comparison
2661            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2662            - Async execution is handled automatically with smart event loop detection
2663        """
2664        return cast(
2665            ExperimentResult,
2666            run_async_safely(
2667                self._run_experiment_async(
2668                    name=name,
2669                    run_name=self._create_experiment_run_name(
2670                        name=name, run_name=run_name
2671                    ),
2672                    description=description,
2673                    data=data,
2674                    task=task,
2675                    evaluators=evaluators or [],
2676                    composite_evaluator=composite_evaluator,
2677                    run_evaluators=run_evaluators or [],
2678                    max_concurrency=max_concurrency,
2679                    metadata=metadata,
2680                    dataset_version=_dataset_version,
2681                ),
2682            ),
2683        )
2684
2685    async def _run_experiment_async(
2686        self,
2687        *,
2688        name: str,
2689        run_name: str,
2690        description: Optional[str],
2691        data: ExperimentData,
2692        task: TaskFunction,
2693        evaluators: List[EvaluatorFunction],
2694        composite_evaluator: Optional[CompositeEvaluatorFunction],
2695        run_evaluators: List[RunEvaluatorFunction],
2696        max_concurrency: int,
2697        metadata: Optional[Dict[str, Any]] = None,
2698        dataset_version: Optional[datetime] = None,
2699    ) -> ExperimentResult:
2700        langfuse_logger.debug(
2701            f"Starting experiment '{name}' run '{run_name}' with {len(data)} items"
2702        )
2703
2704        shared_fallback_experiment_id = self._create_observation_id()
2705
2706        # Set up concurrency control
2707        semaphore = asyncio.Semaphore(max_concurrency)
2708
2709        # Process all items
2710        async def process_item(item: ExperimentItem) -> ExperimentItemResult:
2711            async with semaphore:
2712                return await self._process_experiment_item(
2713                    item,
2714                    task,
2715                    evaluators,
2716                    composite_evaluator,
2717                    shared_fallback_experiment_id,
2718                    name,
2719                    run_name,
2720                    description,
2721                    metadata,
2722                    dataset_version,
2723                )
2724
2725        # Run all items concurrently
2726        tasks = [process_item(item) for item in data]
2727        item_results = await asyncio.gather(*tasks, return_exceptions=True)
2728
2729        # Filter out any exceptions and log errors
2730        valid_results: List[ExperimentItemResult] = []
2731        for i, result in enumerate(item_results):
2732            if isinstance(result, Exception):
2733                langfuse_logger.error(f"Item {i} failed: {result}")
2734            elif isinstance(result, ExperimentItemResult):
2735                valid_results.append(result)  # type: ignore
2736
2737        # Run experiment-level evaluators
2738        run_evaluations: List[Evaluation] = []
2739        for run_evaluator in run_evaluators:
2740            try:
2741                evaluations = await _run_evaluator(
2742                    run_evaluator, item_results=valid_results
2743                )
2744                run_evaluations.extend(evaluations)
2745            except Exception as e:
2746                langfuse_logger.error(f"Run evaluator failed: {e}")
2747
2748        # Generate dataset run URL if applicable
2749        dataset_run_id = next(
2750            (
2751                result.dataset_run_id
2752                for result in valid_results
2753                if result.dataset_run_id
2754            ),
2755            None,
2756        )
2757        dataset_run_url = None
2758        if dataset_run_id and data:
2759            try:
2760                # Check if the first item has dataset_id (for DatasetItem objects)
2761                first_item = data[0]
2762                dataset_id = None
2763
2764                if hasattr(first_item, "dataset_id"):
2765                    dataset_id = getattr(first_item, "dataset_id", None)
2766
2767                if dataset_id:
2768                    project_id = self._get_project_id()
2769
2770                    if project_id:
2771                        dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}"
2772
2773            except Exception:
2774                pass  # URL generation is optional
2775
2776        # Store run-level evaluations as scores
2777        for evaluation in run_evaluations:
2778            try:
2779                if dataset_run_id:
2780                    self.create_score(
2781                        dataset_run_id=dataset_run_id,
2782                        name=evaluation.name or "<unknown>",
2783                        value=evaluation.value,  # type: ignore
2784                        comment=evaluation.comment,
2785                        metadata=evaluation.metadata,
2786                        data_type=evaluation.data_type,  # type: ignore
2787                        config_id=evaluation.config_id,
2788                    )
2789
2790            except Exception as e:
2791                langfuse_logger.error(f"Failed to store run evaluation: {e}")
2792
2793        # Flush scores and traces
2794        self.flush()
2795
2796        return ExperimentResult(
2797            name=name,
2798            run_name=run_name,
2799            description=description,
2800            item_results=valid_results,
2801            run_evaluations=run_evaluations,
2802            experiment_id=dataset_run_id or shared_fallback_experiment_id,
2803            dataset_run_id=dataset_run_id,
2804            dataset_run_url=dataset_run_url,
2805        )
2806
2807    async def _process_experiment_item(
2808        self,
2809        item: ExperimentItem,
2810        task: Callable,
2811        evaluators: List[Callable],
2812        composite_evaluator: Optional[CompositeEvaluatorFunction],
2813        fallback_experiment_id: str,
2814        experiment_name: str,
2815        experiment_run_name: str,
2816        experiment_description: Optional[str],
2817        experiment_metadata: Optional[Dict[str, Any]] = None,
2818        dataset_version: Optional[datetime] = None,
2819    ) -> ExperimentItemResult:
2820        span_name = "experiment-item-run"
2821
2822        with self.start_as_current_observation(name=span_name) as span:
2823            try:
2824                input_data = (
2825                    item.get("input")
2826                    if isinstance(item, dict)
2827                    else getattr(item, "input", None)
2828                )
2829
2830                if input_data is None:
2831                    raise ValueError("Experiment Item is missing input. Skipping item.")
2832
2833                expected_output = (
2834                    item.get("expected_output")
2835                    if isinstance(item, dict)
2836                    else getattr(item, "expected_output", None)
2837                )
2838
2839                item_metadata = (
2840                    item.get("metadata")
2841                    if isinstance(item, dict)
2842                    else getattr(item, "metadata", None)
2843                )
2844
2845                final_observation_metadata = {
2846                    "experiment_name": experiment_name,
2847                    "experiment_run_name": experiment_run_name,
2848                    **(experiment_metadata or {}),
2849                }
2850
2851                trace_id = span.trace_id
2852                dataset_id = None
2853                dataset_item_id = None
2854                dataset_run_id = None
2855
2856                # Link to dataset run if this is a dataset item
2857                if hasattr(item, "id") and hasattr(item, "dataset_id"):
2858                    try:
2859                        # Use sync API to avoid event loop issues when run_async_safely
2860                        # creates multiple event loops across different threads
2861                        dataset_run_item = await asyncio.to_thread(
2862                            self.api.dataset_run_items.create,
2863                            run_name=experiment_run_name,
2864                            run_description=experiment_description,
2865                            metadata=experiment_metadata,
2866                            dataset_item_id=item.id,  # type: ignore
2867                            trace_id=trace_id,
2868                            observation_id=span.id,
2869                            dataset_version=dataset_version,
2870                        )
2871
2872                        dataset_run_id = dataset_run_item.dataset_run_id
2873
2874                    except Exception as e:
2875                        langfuse_logger.error(f"Failed to create dataset run item: {e}")
2876
2877                if (
2878                    not isinstance(item, dict)
2879                    and hasattr(item, "dataset_id")
2880                    and hasattr(item, "id")
2881                ):
2882                    dataset_id = item.dataset_id
2883                    dataset_item_id = item.id
2884
2885                    final_observation_metadata.update(
2886                        {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id}
2887                    )
2888
2889                if isinstance(item_metadata, dict):
2890                    final_observation_metadata.update(item_metadata)
2891
2892                experiment_id = dataset_run_id or fallback_experiment_id
2893                experiment_item_id = (
2894                    dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16]
2895                )
2896                span._otel_span.set_attributes(
2897                    {
2898                        k: v
2899                        for k, v in {
2900                            LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT,
2901                            LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description,
2902                            LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize(
2903                                expected_output
2904                            ),
2905                        }.items()
2906                        if v is not None
2907                    }
2908                )
2909
2910                propagated_experiment_attributes = PropagatedExperimentAttributes(
2911                    experiment_id=experiment_id,
2912                    experiment_name=experiment_run_name,
2913                    experiment_metadata=_flatten_and_serialize_metadata_values(
2914                        experiment_metadata
2915                    ),
2916                    experiment_dataset_id=dataset_id,
2917                    experiment_item_id=experiment_item_id,
2918                    experiment_item_metadata=_flatten_and_serialize_metadata_values(
2919                        item_metadata if isinstance(item_metadata, dict) else None
2920                    ),
2921                    experiment_item_root_observation_id=span.id,
2922                )
2923
2924                with _propagate_attributes(experiment=propagated_experiment_attributes):
2925                    output = await _run_task(task, item)
2926
2927                span.update(
2928                    input=input_data,
2929                    output=output,
2930                    metadata=final_observation_metadata,
2931                )
2932
2933            except Exception as e:
2934                span.update(
2935                    output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
2936                )
2937                raise e
2938
2939            # Run evaluators
2940            evaluations = []
2941
2942            for evaluator in evaluators:
2943                try:
2944                    eval_metadata: Optional[Dict[str, Any]] = None
2945
2946                    if isinstance(item, dict):
2947                        eval_metadata = item.get("metadata")
2948                    elif hasattr(item, "metadata"):
2949                        eval_metadata = item.metadata
2950
2951                    with _propagate_attributes(
2952                        experiment=propagated_experiment_attributes
2953                    ):
2954                        eval_results = await _run_evaluator(
2955                            evaluator,
2956                            input=input_data,
2957                            output=output,
2958                            expected_output=expected_output,
2959                            metadata=eval_metadata,
2960                        )
2961                        evaluations.extend(eval_results)
2962
2963                        # Store evaluations as scores
2964                        for evaluation in eval_results:
2965                            self.create_score(
2966                                trace_id=trace_id,
2967                                observation_id=span.id,
2968                                name=evaluation.name,
2969                                value=evaluation.value,  # type: ignore
2970                                comment=evaluation.comment,
2971                                metadata=evaluation.metadata,
2972                                config_id=evaluation.config_id,
2973                                data_type=evaluation.data_type,  # type: ignore
2974                            )
2975
2976                except Exception as e:
2977                    langfuse_logger.error(f"Evaluator failed: {e}")
2978
2979            # Run composite evaluator if provided and we have evaluations
2980            if composite_evaluator and evaluations:
2981                try:
2982                    composite_eval_metadata: Optional[Dict[str, Any]] = None
2983                    if isinstance(item, dict):
2984                        composite_eval_metadata = item.get("metadata")
2985                    elif hasattr(item, "metadata"):
2986                        composite_eval_metadata = item.metadata
2987
2988                    with _propagate_attributes(
2989                        experiment=propagated_experiment_attributes
2990                    ):
2991                        result = composite_evaluator(
2992                            input=input_data,
2993                            output=output,
2994                            expected_output=expected_output,
2995                            metadata=composite_eval_metadata,
2996                            evaluations=evaluations,
2997                        )
2998
2999                        # Handle async composite evaluators
3000                        if asyncio.iscoroutine(result):
3001                            result = await result
3002
3003                        # Normalize to list
3004                        composite_evals: List[Evaluation] = []
3005                        if isinstance(result, (dict, Evaluation)):
3006                            composite_evals = [result]  # type: ignore
3007                        elif isinstance(result, list):
3008                            composite_evals = result  # type: ignore
3009
3010                        # Store composite evaluations as scores and add to evaluations list
3011                        for composite_evaluation in composite_evals:
3012                            self.create_score(
3013                                trace_id=trace_id,
3014                                observation_id=span.id,
3015                                name=composite_evaluation.name,
3016                                value=composite_evaluation.value,  # type: ignore
3017                                comment=composite_evaluation.comment,
3018                                metadata=composite_evaluation.metadata,
3019                                config_id=composite_evaluation.config_id,
3020                                data_type=composite_evaluation.data_type,  # type: ignore
3021                            )
3022                            evaluations.append(composite_evaluation)
3023
3024                except Exception as e:
3025                    langfuse_logger.error(f"Composite evaluator failed: {e}")
3026
3027            return ExperimentItemResult(
3028                item=item,
3029                output=output,
3030                evaluations=evaluations,
3031                trace_id=trace_id,
3032                dataset_run_id=dataset_run_id,
3033            )
3034
3035    def _create_experiment_run_name(
3036        self, *, name: Optional[str] = None, run_name: Optional[str] = None
3037    ) -> str:
3038        if run_name:
3039            return run_name
3040
3041        iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z")
3042
3043        return f"{name} - {iso_timestamp}"
3044
3045    def run_batched_evaluation(
3046        self,
3047        *,
3048        scope: Literal["traces", "observations"],
3049        mapper: MapperFunction,
3050        filter: Optional[str] = None,
3051        fetch_batch_size: int = 50,
3052        fetch_trace_fields: Optional[str] = None,
3053        max_items: Optional[int] = None,
3054        max_retries: int = 3,
3055        evaluators: List[EvaluatorFunction],
3056        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
3057        max_concurrency: int = 5,
3058        metadata: Optional[Dict[str, Any]] = None,
3059        _add_observation_scores_to_trace: bool = False,
3060        _additional_trace_tags: Optional[List[str]] = None,
3061        resume_from: Optional[BatchEvaluationResumeToken] = None,
3062        verbose: bool = False,
3063    ) -> BatchEvaluationResult:
3064        """Fetch traces or observations and run evaluations on each item.
3065
3066        This method provides a powerful way to evaluate existing data in Langfuse at scale.
3067        It fetches items based on filters, transforms them using a mapper function, runs
3068        evaluators on each item, and creates scores that are linked back to the original
3069        entities. This is ideal for:
3070
3071        - Running evaluations on production traces after deployment
3072        - Backtesting new evaluation metrics on historical data
3073        - Batch scoring of observations for quality monitoring
3074        - Periodic evaluation runs on recent data
3075
3076        The method uses a streaming/pipeline approach to process items in batches, making
3077        it memory-efficient for large datasets. It includes comprehensive error handling,
3078        retry logic, and resume capability for long-running evaluations.
3079
3080        Args:
3081            scope: The type of items to evaluate. Must be one of:
3082                - "traces": Evaluate complete traces with all their observations
3083                - "observations": Evaluate individual observations (spans, generations, events)
3084            mapper: Function that transforms API response objects into evaluator inputs.
3085                Receives a trace/observation object and returns an EvaluatorInputs
3086                instance with input, output, expected_output, and metadata fields.
3087                Can be sync or async.
3088            evaluators: List of evaluation functions to run on each item. Each evaluator
3089                receives the mapped inputs and returns Evaluation object(s). Evaluator
3090                failures are logged but don't stop the batch evaluation.
3091            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
3092                - '{"tags": ["production"]}'
3093                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
3094                Default: None (fetches all items).
3095            fetch_batch_size: Number of items to fetch per API call and hold in memory.
3096                Larger values may be faster but use more memory. Default: 50.
3097            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
3098            max_items: Maximum total number of items to process. If None, processes all
3099                items matching the filter. Useful for testing or limiting evaluation runs.
3100                Default: None (process all).
3101            max_concurrency: Maximum number of items to evaluate concurrently. Controls
3102                parallelism and resource usage. Default: 5.
3103            composite_evaluator: Optional function that creates a composite score from
3104                item-level evaluations. Receives the original item and its evaluations,
3105                returns a single Evaluation. Useful for weighted averages or combined metrics.
3106                Default: None.
3107            metadata: Optional metadata dict to add to all created scores. Useful for
3108                tracking evaluation runs, versions, or other context. Default: None.
3109            max_retries: Maximum number of retry attempts for failed batch fetches.
3110                Uses exponential backoff (1s, 2s, 4s). Default: 3.
3111            verbose: If True, logs progress information to console. Useful for monitoring
3112                long-running evaluations. Default: False.
3113            resume_from: Optional resume token from a previous incomplete run. Allows
3114                continuing evaluation after interruption or failure. Default: None.
3115
3116
3117        Returns:
3118            BatchEvaluationResult containing:
3119                - total_items_fetched: Number of items fetched from API
3120                - total_items_processed: Number of items successfully evaluated
3121                - total_items_failed: Number of items that failed evaluation
3122                - total_scores_created: Scores created by item-level evaluators
3123                - total_composite_scores_created: Scores created by composite evaluator
3124                - total_evaluations_failed: Individual evaluator failures
3125                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3126                - resume_token: Token for resuming if incomplete (None if completed)
3127                - completed: True if all items processed
3128                - duration_seconds: Total execution time
3129                - failed_item_ids: IDs of items that failed
3130                - error_summary: Error types and counts
3131                - has_more_items: True if max_items reached but more exist
3132
3133        Raises:
3134            ValueError: If invalid scope is provided.
3135
3136        Examples:
3137            Basic trace evaluation:
3138            ```python
3139            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3140
3141            client = Langfuse()
3142
3143            # Define mapper to extract fields from traces
3144            def trace_mapper(trace):
3145                return EvaluatorInputs(
3146                    input=trace.input,
3147                    output=trace.output,
3148                    expected_output=None,
3149                    metadata={"trace_id": trace.id}
3150                )
3151
3152            # Define evaluator
3153            def length_evaluator(*, input, output, expected_output, metadata):
3154                return Evaluation(
3155                    name="output_length",
3156                    value=len(output) if output else 0
3157                )
3158
3159            # Run batch evaluation
3160            result = client.run_batched_evaluation(
3161                scope="traces",
3162                mapper=trace_mapper,
3163                evaluators=[length_evaluator],
3164                filter='{"tags": ["production"]}',
3165                max_items=1000,
3166                verbose=True
3167            )
3168
3169            print(f"Processed {result.total_items_processed} traces")
3170            print(f"Created {result.total_scores_created} scores")
3171            ```
3172
3173            Evaluation with composite scorer:
3174            ```python
3175            def accuracy_evaluator(*, input, output, expected_output, metadata):
3176                # ... evaluation logic
3177                return Evaluation(name="accuracy", value=0.85)
3178
3179            def relevance_evaluator(*, input, output, expected_output, metadata):
3180                # ... evaluation logic
3181                return Evaluation(name="relevance", value=0.92)
3182
3183            def composite_evaluator(*, item, evaluations):
3184                # Weighted average of evaluations
3185                weights = {"accuracy": 0.6, "relevance": 0.4}
3186                total = sum(
3187                    e.value * weights.get(e.name, 0)
3188                    for e in evaluations
3189                    if isinstance(e.value, (int, float))
3190                )
3191                return Evaluation(
3192                    name="composite_score",
3193                    value=total,
3194                    comment=f"Weighted average of {len(evaluations)} metrics"
3195                )
3196
3197            result = client.run_batched_evaluation(
3198                scope="traces",
3199                mapper=trace_mapper,
3200                evaluators=[accuracy_evaluator, relevance_evaluator],
3201                composite_evaluator=composite_evaluator,
3202                filter='{"user_id": "important_user"}',
3203                verbose=True
3204            )
3205            ```
3206
3207            Handling incomplete runs with resume:
3208            ```python
3209            # Initial run that may fail or timeout
3210            result = client.run_batched_evaluation(
3211                scope="observations",
3212                mapper=obs_mapper,
3213                evaluators=[my_evaluator],
3214                max_items=10000,
3215                verbose=True
3216            )
3217
3218            # Check if incomplete
3219            if not result.completed and result.resume_token:
3220                print(f"Processed {result.resume_token.items_processed} items before interruption")
3221
3222                # Resume from where it left off
3223                result = client.run_batched_evaluation(
3224                    scope="observations",
3225                    mapper=obs_mapper,
3226                    evaluators=[my_evaluator],
3227                    resume_from=result.resume_token,
3228                    verbose=True
3229                )
3230
3231            print(f"Total items processed: {result.total_items_processed}")
3232            ```
3233
3234            Monitoring evaluator performance:
3235            ```python
3236            result = client.run_batched_evaluation(...)
3237
3238            for stats in result.evaluator_stats:
3239                success_rate = stats.successful_runs / stats.total_runs
3240                print(f"{stats.name}:")
3241                print(f"  Success rate: {success_rate:.1%}")
3242                print(f"  Scores created: {stats.total_scores_created}")
3243
3244                if stats.failed_runs > 0:
3245                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3246            ```
3247
3248        Note:
3249            - Evaluator failures are logged but don't stop the batch evaluation
3250            - Individual item failures are tracked but don't stop processing
3251            - Fetch failures are retried with exponential backoff
3252            - All scores are automatically flushed to Langfuse at the end
3253            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3254        """
3255        runner = BatchEvaluationRunner(self)
3256
3257        return cast(
3258            BatchEvaluationResult,
3259            run_async_safely(
3260                runner.run_async(
3261                    scope=scope,
3262                    mapper=mapper,
3263                    evaluators=evaluators,
3264                    filter=filter,
3265                    fetch_batch_size=fetch_batch_size,
3266                    fetch_trace_fields=fetch_trace_fields,
3267                    max_items=max_items,
3268                    max_concurrency=max_concurrency,
3269                    composite_evaluator=composite_evaluator,
3270                    metadata=metadata,
3271                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
3272                    _additional_trace_tags=_additional_trace_tags,
3273                    max_retries=max_retries,
3274                    verbose=verbose,
3275                    resume_from=resume_from,
3276                )
3277            ),
3278        )
3279
3280    def auth_check(self) -> bool:
3281        """Check if the provided credentials (public and secret key) are valid.
3282
3283        Raises:
3284            Exception: If no projects were found for the provided credentials.
3285
3286        Note:
3287            This method is blocking. It is discouraged to use it in production code.
3288        """
3289        try:
3290            projects = self.api.projects.get()
3291            langfuse_logger.debug(
3292                f"Auth check successful, found {len(projects.data)} projects"
3293            )
3294            if len(projects.data) == 0:
3295                raise Exception(
3296                    "Auth check failed, no project found for the keys provided."
3297                )
3298            return True
3299
3300        except AttributeError as e:
3301            langfuse_logger.warning(
3302                f"Auth check failed: Client not properly initialized. Error: {e}"
3303            )
3304            return False
3305
3306        except Error as e:
3307            handle_fern_exception(e)
3308            raise e
3309
3310    def create_dataset(
3311        self,
3312        *,
3313        name: str,
3314        description: Optional[str] = None,
3315        metadata: Optional[Any] = None,
3316        input_schema: Optional[Any] = None,
3317        expected_output_schema: Optional[Any] = None,
3318    ) -> Dataset:
3319        """Create a dataset with the given name on Langfuse.
3320
3321        Args:
3322            name: Name of the dataset to create.
3323            description: Description of the dataset. Defaults to None.
3324            metadata: Additional metadata. Defaults to None.
3325            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3326            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3327
3328        Returns:
3329            Dataset: The created dataset as returned by the Langfuse API.
3330        """
3331        try:
3332            langfuse_logger.debug(f"Creating datasets {name}")
3333
3334            result = self.api.datasets.create(
3335                name=name,
3336                description=description,
3337                metadata=metadata,
3338                input_schema=input_schema,
3339                expected_output_schema=expected_output_schema,
3340            )
3341
3342            return cast(Dataset, result)
3343
3344        except Error as e:
3345            handle_fern_exception(e)
3346            raise e
3347
3348    def create_dataset_item(
3349        self,
3350        *,
3351        dataset_name: str,
3352        input: Optional[Any] = None,
3353        expected_output: Optional[Any] = None,
3354        metadata: Optional[Any] = None,
3355        source_trace_id: Optional[str] = None,
3356        source_observation_id: Optional[str] = None,
3357        status: Optional[DatasetStatus] = None,
3358        id: Optional[str] = None,
3359    ) -> DatasetItem:
3360        """Create a dataset item.
3361
3362        Upserts if an item with id already exists.
3363
3364        Args:
3365            dataset_name: Name of the dataset in which the dataset item should be created.
3366            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3367            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3368            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3369            source_trace_id: Id of the source trace. Defaults to None.
3370            source_observation_id: Id of the source observation. Defaults to None.
3371            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3372            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3373
3374        Returns:
3375            DatasetItem: The created dataset item as returned by the Langfuse API.
3376
3377        Example:
3378            ```python
3379            from langfuse import Langfuse
3380
3381            langfuse = Langfuse()
3382
3383            # Uploading items to the Langfuse dataset named "capital_cities"
3384            langfuse.create_dataset_item(
3385                dataset_name="capital_cities",
3386                input={"input": {"country": "Italy"}},
3387                expected_output={"expected_output": "Rome"},
3388                metadata={"foo": "bar"}
3389            )
3390            ```
3391        """
3392        try:
3393            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
3394
3395            # Media uploads must reference the (dataset, item) they belong to, and
3396            # the item need not exist yet — so settle on the item id up front and
3397            # reuse it for the create call below.
3398            item_id = id if id is not None else str(uuid.uuid4())
3399
3400            # Single pass per field: swap each LangfuseMedia for its reference
3401            # string (derived from content, not the upload) and collect the media
3402            # still to upload, deduped by media id and tagged with its field.
3403            pending_media: Dict[str, Tuple[LangfuseMedia, str]] = {}
3404            input = self._process_dataset_item_media(
3405                data=input,
3406                pending_media=pending_media,
3407                field=DatasetItemMediaReferenceField.INPUT.value,
3408            )
3409            expected_output = self._process_dataset_item_media(
3410                data=expected_output,
3411                pending_media=pending_media,
3412                field=DatasetItemMediaReferenceField.EXPECTED_OUTPUT.value,
3413            )
3414            metadata = self._process_dataset_item_media(
3415                data=metadata,
3416                pending_media=pending_media,
3417                field=DatasetItemMediaReferenceField.METADATA.value,
3418            )
3419
3420            # The upload needs the dataset id, but the create API only takes the
3421            # name. Resolve it once, and only when there is actually media to
3422            # upload — a plain item pays no extra datasets.get round-trip.
3423            if pending_media:
3424                assert self._resources is not None
3425                dataset_id = self.api.datasets.get(self._url_encode(dataset_name)).id
3426                for media, field in pending_media.values():
3427                    self._resources._media_manager._upload_media_sync(
3428                        media=media,
3429                        dataset_id=dataset_id,
3430                        dataset_item_id=item_id,
3431                        field=field,
3432                    )
3433
3434            result = self.api.dataset_items.create(
3435                dataset_name=dataset_name,
3436                input=input,
3437                expected_output=expected_output,
3438                metadata=metadata,
3439                source_trace_id=source_trace_id,
3440                source_observation_id=source_observation_id,
3441                status=status,
3442                id=item_id,
3443            )
3444
3445            return cast(DatasetItem, result)
3446        except Error as e:
3447            handle_fern_exception(e)
3448            raise e
3449
3450    def _process_dataset_item_media(
3451        self,
3452        *,
3453        data: Any,
3454        pending_media: Dict[str, Tuple[LangfuseMedia, str]],
3455        field: str,
3456    ) -> Any:
3457        """Swap each ``LangfuseMedia`` for its reference string in ``data``.
3458
3459        Each replaced media is recorded in ``pending_media`` (keyed by media id,
3460        so the same media across fields uploads once) for the caller to upload
3461        after the dataset id has been resolved.
3462        """
3463        if self._resources is None:
3464            return data
3465
3466        max_levels = 10
3467
3468        def _process_data_recursively(
3469            data: Any, level: int, ancestor_container_ids: set[int]
3470        ) -> Any:
3471            if isinstance(data, LangfuseMedia):
3472                reference_string = data._reference_string
3473                media_id = data._media_id
3474                if reference_string is None or media_id is None:
3475                    raise ValueError(
3476                        "Cannot create dataset item with invalid LangfuseMedia."
3477                    )
3478                # First field a media appears in wins; later duplicates dedupe.
3479                pending_media.setdefault(media_id, (data, field))
3480                return reference_string
3481
3482            if isinstance(data, LangfuseMediaReference):
3483                return data.reference_string if data.reference_string else data
3484
3485            # Tuples are intentionally excluded: namedtuple subclasses can't be
3486            # rebuilt from an iterable, so media inside them is left untouched.
3487            if not isinstance(data, (list, set, frozenset, dict)):
3488                return data
3489
3490            # Container ids only protect against recursive cycles.
3491            data_id = id(data)
3492            if data_id in ancestor_container_ids or level > max_levels:
3493                return data
3494
3495            next_ancestor_container_ids = ancestor_container_ids | {data_id}
3496
3497            if isinstance(data, (list, set, frozenset)):
3498                processed = (
3499                    _process_data_recursively(
3500                        item, level + 1, next_ancestor_container_ids
3501                    )
3502                    for item in data
3503                )
3504                return type(data)(processed)
3505
3506            return {
3507                key: _process_data_recursively(
3508                    value, level + 1, next_ancestor_container_ids
3509                )
3510                for key, value in data.items()
3511            }
3512
3513        return _process_data_recursively(data, 1, set())
3514
3515    def _hydrate_dataset_item_media_references(self, item: DatasetItem) -> DatasetItem:
3516        media_references = item.media_references or []
3517        if not media_references:
3518            return item
3519
3520        # Map the API enum member to the snake_case model attribute so this keeps
3521        # working regardless of the enum's wire value (e.g. "expectedOutput").
3522        attr_by_field = {
3523            DatasetItemMediaReferenceField.INPUT: "input",
3524            DatasetItemMediaReferenceField.EXPECTED_OUTPUT: "expected_output",
3525            DatasetItemMediaReferenceField.METADATA: "metadata",
3526        }
3527        hydrated_fields = {
3528            "input": item.input,
3529            "expected_output": item.expected_output,
3530            "metadata": item.metadata,
3531        }
3532
3533        for media_reference in media_references:
3534            media = media_reference.media
3535            field = attr_by_field.get(media_reference.field)
3536            if field is None:
3537                continue
3538
3539            replacement = LangfuseMediaReference(
3540                media_id=media.media_id,
3541                content_type=media.content_type,
3542                url=media.url,
3543                url_expiry=media.url_expiry,
3544                content_length=media.content_length,
3545                reference_string=media_reference.reference_string,
3546            )
3547            hydrated_fields[field] = self._replace_json_path_value(
3548                value=hydrated_fields[field],
3549                path=media_reference.json_path,
3550                replacement=replacement,
3551            )
3552
3553        return item.model_copy(
3554            update={
3555                "input": hydrated_fields["input"],
3556                "expected_output": hydrated_fields["expected_output"],
3557                "metadata": hydrated_fields["metadata"],
3558            }
3559        )
3560
3561    def _replace_json_path_value(
3562        self, *, value: Any, path: str, replacement: LangfuseMediaReference
3563    ) -> Any:
3564        try:
3565            return json_path.set_value_at_path(value, path, replacement)
3566        except Exception as e:
3567            langfuse_logger.warning(
3568                f"Failed to hydrate dataset media reference at JSONPath {path}",
3569                exc_info=e,
3570            )
3571
3572            return value
3573
    def resolve_media_references(
        self,
        *,
        obj: Any,
        resolve_with: Literal["base64_data_uri"],
        max_depth: int = 10,
        content_fetch_timeout_seconds: int = 5,
    ) -> Any:
        """Replace media reference strings in an object with base64 data URIs.

        This is a thin convenience wrapper: it forwards all arguments, together with
        this client as ``langfuse_client``, to ``LangfuseMedia.resolve_media_references``.

        The object is traversed recursively (up to max_depth) looking for media reference strings
        in the format "@@@langfuseMedia:...@@@". When found, the actual media content is fetched
        (synchronously) using this Langfuse client and the reference string is replaced with a
        base64 data URI.

        If fetching media content fails for a reference string, a warning is logged and the reference
        string is left unchanged.

        Args:
            obj: The object to process. Can be a primitive value, array, or nested object.
                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
            resolve_with: The representation of the media content to replace the media reference string with.
                Currently only "base64_data_uri" is supported.
            max_depth: The maximum depth to traverse the object. Default is 10.
            content_fetch_timeout_seconds: The timeout in seconds for fetching media content. Default is 5.

        Returns:
            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

        Example:
            obj = {
                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
                "nested": {
                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
                }
            }

            # The method is synchronous and takes keyword-only arguments.
            result = langfuse.resolve_media_references(
                obj=obj,
                resolve_with="base64_data_uri",
            )

            # Result:
            # {
            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
            #     "nested": {
            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
            #     }
            # }
        """
        return LangfuseMedia.resolve_media_references(
            langfuse_client=self,
            obj=obj,
            resolve_with=resolve_with,
            max_depth=max_depth,
            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
        )
3628
    # Typing overload: type="chat" (must be passed explicitly) yields a ChatPromptClient
    # and accepts a list of chat messages as fallback.
    @overload
    def get_prompt(
        self,
        name: str,
        *,
        version: Optional[int] = None,
        label: Optional[str] = None,
        type: Literal["chat"],
        cache_ttl_seconds: Optional[int] = None,
        fallback: Optional[List[ChatMessageDict]] = None,
        max_retries: Optional[int] = None,
        fetch_timeout_seconds: Optional[int] = None,
    ) -> ChatPromptClient: ...

    # Typing overload: type="text" (the default) yields a TextPromptClient
    # and accepts a plain string as fallback.
    @overload
    def get_prompt(
        self,
        name: str,
        *,
        version: Optional[int] = None,
        label: Optional[str] = None,
        type: Literal["text"] = "text",
        cache_ttl_seconds: Optional[int] = None,
        fallback: Optional[str] = None,
        max_retries: Optional[int] = None,
        fetch_timeout_seconds: Optional[int] = None,
    ) -> TextPromptClient: ...

    def get_prompt(
        self,
        name: str,
        *,
        version: Optional[int] = None,
        label: Optional[str] = None,
        type: Literal["chat", "text"] = "text",
        cache_ttl_seconds: Optional[int] = None,
        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
        max_retries: Optional[int] = None,
        fetch_timeout_seconds: Optional[int] = None,
    ) -> PromptClient:
        """Get a prompt.

        The prompt is looked up in the local prompt cache first:

        - No cached entry, or ``cache_ttl_seconds == 0``: the prompt is fetched from the
          server (and the cache is updated). If that fetch fails and a non-empty ``fallback``
          was given, a fallback prompt client (``is_fallback=True``) is returned; otherwise
          the fetch error is raised.
        - Cached entry that is expired: the stale prompt is returned immediately and a
          refresh task is handed to the prompt cache. If scheduling the refresh fails, a
          warning is logged and the stale prompt is still returned.
        - Cached entry that is still valid: it is returned as is.

        Args:
            name (str): The name of the prompt to retrieve.

        Keyword Args:
            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. An empty fallback ("" or []) is treated like no fallback. Defaults to None.
            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. Values are clamped to the range 0..4. The fetch helper retries with `backoff.constant`, i.e. a constant wait between attempts.
            fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.

        Returns:
            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
            - TextPromptClient, if type argument is 'text'.
            - ChatPromptClient, if type argument is 'chat'.

        Raises:
            Error: If the SDK resources were not initialized.
            ValueError: If both ``version`` and ``label`` are given, or if ``name`` is empty.
            Exception: Propagates any exception raised while fetching the prompt when there is no
            usable cached prompt and no fallback was provided.
        """
        if self._resources is None:
            raise Error(
                "SDK is not correctly initialized. Check the init logs for more details."
            )
        if version is not None and label is not None:
            raise ValueError("Cannot specify both version and label at the same time.")

        if not name:
            raise ValueError("Prompt name cannot be empty.")

        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
        # Clamp the caller-supplied retry count to 0..4 (default 2).
        bounded_max_retries = self._get_bounded_max_retries(
            max_retries, default_max_retries=2, max_retries_upper_bound=4
        )

        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
        cached_prompt = self._resources.prompt_cache.get(cache_key)

        # Cache miss, or the caller disabled caching: fetch synchronously.
        if cached_prompt is None or cache_ttl_seconds == 0:
            langfuse_logger.debug(
                f"Prompt '{cache_key}' not found in cache or caching disabled."
            )
            try:
                return self._fetch_prompt_and_update_cache(
                    name,
                    version=version,
                    label=label,
                    ttl_seconds=cache_ttl_seconds,
                    max_retries=bounded_max_retries,
                    fetch_timeout_seconds=fetch_timeout_seconds,
                )
            except Exception as e:
                # Truthiness check: an empty string / empty list fallback is ignored.
                if fallback:
                    langfuse_logger.warning(
                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
                    )

                    # Synthesize a prompt payload around the fallback content;
                    # version 0 is used when no explicit version was requested.
                    fallback_client_args: Dict[str, Any] = {
                        "name": name,
                        "prompt": fallback,
                        "type": type,
                        "version": version or 0,
                        "config": {},
                        "labels": [label] if label else [],
                        "tags": [],
                    }

                    if type == "text":
                        return TextPromptClient(
                            prompt=Prompt_Text(**fallback_client_args),
                            is_fallback=True,
                        )

                    if type == "chat":
                        return ChatPromptClient(
                            prompt=Prompt_Chat(**fallback_client_args),
                            is_fallback=True,
                        )

                # No fallback (or an unsupported type): surface the fetch error.
                raise e

        if cached_prompt.is_expired():
            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
            try:
                # refresh prompt in background thread, refresh_prompt deduplicates tasks
                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")

                # Closure executed by the prompt cache; it re-fetches with the same
                # arguments as this call and updates the cache entry.
                def refresh_task() -> None:
                    self._fetch_prompt_and_update_cache(
                        name,
                        version=version,
                        label=label,
                        ttl_seconds=cache_ttl_seconds,
                        max_retries=bounded_max_retries,
                        fetch_timeout_seconds=fetch_timeout_seconds,
                    )

                self._resources.prompt_cache.add_refresh_prompt_task_if_current(
                    cache_key,
                    cached_prompt,
                    refresh_task,
                )
                langfuse_logger.debug(
                    f"Returning stale prompt '{cache_key}' from cache."
                )
                # return stale prompt
                return cached_prompt.value

            except Exception as e:
                langfuse_logger.warning(
                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
                )
                # creation of refresh prompt task failed, return stale prompt
                return cached_prompt.value

        # Cached entry is still fresh.
        return cached_prompt.value
3794
3795    def _fetch_prompt_and_update_cache(
3796        self,
3797        name: str,
3798        *,
3799        version: Optional[int] = None,
3800        label: Optional[str] = None,
3801        ttl_seconds: Optional[int] = None,
3802        max_retries: int,
3803        fetch_timeout_seconds: Optional[int],
3804    ) -> PromptClient:
3805        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3806        langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...")
3807
3808        try:
3809
3810            @backoff.on_exception(
3811                backoff.constant, Exception, max_tries=max_retries + 1, logger=None
3812            )
3813            def fetch_prompts() -> Any:
3814                return self.api.prompts.get(
3815                    self._url_encode(name),
3816                    version=version,
3817                    label=label,
3818                    request_options={
3819                        "timeout_in_seconds": fetch_timeout_seconds,
3820                    }
3821                    if fetch_timeout_seconds is not None
3822                    else None,
3823                )
3824
3825            prompt_response = fetch_prompts()
3826
3827            prompt: PromptClient
3828            if prompt_response.type == "chat":
3829                prompt = ChatPromptClient(prompt_response)
3830            else:
3831                prompt = TextPromptClient(prompt_response)
3832
3833            if self._resources is not None:
3834                self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds)
3835
3836            return prompt
3837
3838        except NotFoundError as not_found_error:
3839            langfuse_logger.warning(
3840                f"Prompt '{cache_key}' not found during refresh, evicting from cache."
3841            )
3842            if self._resources is not None:
3843                self._resources.prompt_cache.delete(cache_key)
3844            raise not_found_error
3845
3846        except Exception as e:
3847            langfuse_logger.error(
3848                f"Error while fetching prompt '{cache_key}': {str(e)}"
3849            )
3850            raise e
3851
3852    def _get_bounded_max_retries(
3853        self,
3854        max_retries: Optional[int],
3855        *,
3856        default_max_retries: int = 2,
3857        max_retries_upper_bound: int = 4,
3858    ) -> int:
3859        if max_retries is None:
3860            return default_max_retries
3861
3862        bounded_max_retries = min(
3863            max(max_retries, 0),
3864            max_retries_upper_bound,
3865        )
3866
3867        return bounded_max_retries
3868
3869    @overload
3870    def create_prompt(
3871        self,
3872        *,
3873        name: str,
3874        prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]],
3875        labels: List[str] = [],
3876        tags: Optional[List[str]] = None,
3877        type: Optional[Literal["chat"]],
3878        config: Optional[Any] = None,
3879        commit_message: Optional[str] = None,
3880    ) -> ChatPromptClient: ...
3881
3882    @overload
3883    def create_prompt(
3884        self,
3885        *,
3886        name: str,
3887        prompt: str,
3888        labels: List[str] = [],
3889        tags: Optional[List[str]] = None,
3890        type: Optional[Literal["text"]] = "text",
3891        config: Optional[Any] = None,
3892        commit_message: Optional[str] = None,
3893    ) -> TextPromptClient: ...
3894
3895    def create_prompt(
3896        self,
3897        *,
3898        name: str,
3899        prompt: Union[
3900            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3901        ],
3902        labels: List[str] = [],
3903        tags: Optional[List[str]] = None,
3904        type: Optional[Literal["chat", "text"]] = "text",
3905        config: Optional[Any] = None,
3906        commit_message: Optional[str] = None,
3907    ) -> PromptClient:
3908        """Create a new prompt in Langfuse.
3909
3910        Keyword Args:
3911            name : The name of the prompt to be created.
3912            prompt : The content of the prompt to be created.
3913            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3914            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3915            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3916            config: Additional structured data to be saved with the prompt. Defaults to None.
3917            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3918            commit_message: Optional string describing the change.
3919
3920        Returns:
3921            TextPromptClient: The prompt if type argument is 'text'.
3922            ChatPromptClient: The prompt if type argument is 'chat'.
3923        """
3924        try:
3925            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3926
3927            if type == "chat":
3928                if not isinstance(prompt, list):
3929                    raise ValueError(
3930                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3931                    )
3932                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
3933                    CreateChatPromptRequest(
3934                        name=name,
3935                        prompt=cast(Any, prompt),
3936                        labels=labels,
3937                        tags=tags,
3938                        config=config or {},
3939                        commit_message=commit_message,
3940                        type=CreateChatPromptType.CHAT,
3941                    )
3942                )
3943                server_prompt = self.api.prompts.create(request=request)
3944
3945                if self._resources is not None:
3946                    self._resources.prompt_cache.invalidate(name)
3947
3948                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3949
3950            if not isinstance(prompt, str):
3951                raise ValueError("For 'text' type, 'prompt' must be a string.")
3952
3953            request = CreateTextPromptRequest(
3954                name=name,
3955                prompt=prompt,
3956                labels=labels,
3957                tags=tags,
3958                config=config or {},
3959                commit_message=commit_message,
3960            )
3961
3962            server_prompt = self.api.prompts.create(request=request)
3963
3964            if self._resources is not None:
3965                self._resources.prompt_cache.invalidate(name)
3966
3967            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3968
3969        except Error as e:
3970            handle_fern_exception(e)
3971            raise e
3972
3973    def update_prompt(
3974        self,
3975        *,
3976        name: str,
3977        version: int,
3978        new_labels: List[str] = [],
3979    ) -> Any:
3980        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name.
3981
3982        Args:
3983            name (str): The name of the prompt to update.
3984            version (int): The version number of the prompt to update.
3985            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3986
3987        Returns:
3988            Prompt: The updated prompt from the Langfuse API.
3989
3990        """
3991        updated_prompt = self.api.prompt_version.update(
3992            name=self._url_encode(name),
3993            version=version,
3994            new_labels=new_labels,
3995        )
3996
3997        if self._resources is not None:
3998            self._resources.prompt_cache.invalidate(name)
3999
4000        return updated_prompt
4001
4002    def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str:
4003        # httpx ≥ 0.28 does its own WHATWG-compliant quoting (eg. encodes bare
4004        # “%”, “?”, “#”, “|”, … in query/path parts).  Re-quoting here would
4005        # double-encode, so we skip when the value is about to be sent straight
4006        # to httpx (`is_url_param=True`) and the installed version is ≥ 0.28.
4007        if is_url_param and Version(httpx.__version__) >= Version("0.28.0"):
4008            return url
4009
4010        # urllib.parse.quote does not escape slashes "/" by default; we need to add safe="" to force escaping
4011        # we need add safe="" to force escaping of slashes
4012        # This is necessary for prompts in prompt folders
4013        return urllib.parse.quote(url, safe="")
4014
4015    def clear_prompt_cache(self) -> None:
4016        """Clear the entire prompt cache, removing all cached prompts.
4017
4018        This method is useful when you want to force a complete refresh of all
4019        cached prompts, for example after major updates or when you need to
4020        ensure the latest versions are fetched from the server.
4021        """
4022        if self._resources is not None:
4023            self._resources.prompt_cache.clear()

Main client for Langfuse tracing and platform features.

This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.

The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.

Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.

Attributes:
  • api: Synchronous API client for Langfuse backend communication
  • async_api: Asynchronous API client for Langfuse backend communication
  • _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
  • public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
  • secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
  • base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
  • host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
  • timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
  • httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
  • debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
  • tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
  • flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
  • flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
  • environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
  • release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
  • media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
  • sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
  • mask (Optional[MaskFunction]): Function to mask sensitive data synchronously when Langfuse SDK attributes are created. This applies only to data set through Langfuse SDK APIs such as start_observation(), update(), and set_trace_io().
  • mask_otel_spans (Optional[MaskOtelSpansFunction]): Synchronous export-stage hook for masking raw OpenTelemetry span attributes before this Langfuse client sends them to Langfuse. Use this for spans created by third-party OpenTelemetry instrumentations, or when you need to inspect final span attributes after export filtering and Langfuse media handling. It does not modify spans already exported through other OpenTelemetry exporters.

    The hook receives one OpenTelemetry export batch. A batch is not guaranteed to contain a complete trace, request, or Langfuse observation tree. The hook usually runs on the OpenTelemetry batch span processor worker thread; during flush() and shutdown it may run on the caller thread. Keep it synchronous, deterministic, and fast.

    Return None to leave the batch unchanged. Return MaskOtelSpansResult with OtelSpanPatch values to delete or replace attributes on selected spans. If the hook raises or returns an invalid batch result, Langfuse drops the whole export batch. If one returned span patch is invalid, Langfuse drops only that span from the Langfuse export.

    Example:

    from typing import Optional
    
    from langfuse import Langfuse
    from langfuse.types import (
        MaskOtelSpansParams,
        MaskOtelSpansResult,
        OtelSpanPatch,
    )
    
    def mask_otel_spans(
        *, params: MaskOtelSpansParams
    ) -> Optional[MaskOtelSpansResult]:
        patches = {}
    
        for identifier, span in params.spans.items():
            if "gen_ai.prompt.0.content" in span.attributes:
                patches[identifier] = OtelSpanPatch(
                    delete_attributes=("gen_ai.prompt.0.content",),
                    set_attributes={"masking.applied": True},
                )
    
        return MaskOtelSpansResult(span_patches=patches)
    
    langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
    
  • blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use should_export_span instead. Equivalent behavior:

    from langfuse.span_filter import is_default_export_span
    blocked = {"sqlite", "requests"}
    
    should_export_span = lambda span: (
        is_default_export_span(span)
        and (
            span.instrumentation_scope is None
            or span.instrumentation_scope.name not in blocked
        )
    )
    
  • should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with gen_ai.* attributes, and known LLM instrumentation scopes).

  • additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If span_exporter is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
  • tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. Setting this can be useful to keep Langfuse tracing disconnected from other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
  • id_generator (Optional[IdGenerator]): OpenTelemetry ID generator to use when Langfuse creates its own TracerProvider. If omitted, the OpenTelemetry SDK default is used. If tracer_provider is provided, or an OpenTelemetry TracerProvider is already registered globally, configure the ID generator on that provider instead.
  • span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire base_url, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include x-langfuse-ingestion-version=4 on the exporter to enable real time processing of exported spans.
Example:
from langfuse import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    base_url="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_observation(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
Langfuse( *, public_key: Optional[str] = None, secret_key: Optional[str] = None, base_url: Optional[str] = None, host: Optional[str] = None, timeout: Optional[int] = None, httpx_client: Optional[httpx.Client] = None, debug: bool = False, tracing_enabled: Optional[bool] = True, flush_at: Optional[int] = None, flush_interval: Optional[float] = None, environment: Optional[str] = None, release: Optional[str] = None, media_upload_thread_count: Optional[int] = None, sample_rate: Optional[float] = None, mask: Optional[langfuse.types.MaskFunction] = None, mask_otel_spans: Optional[MaskOtelSpansFunction] = None, blocked_instrumentation_scopes: Optional[List[str]] = None, should_export_span: Optional[Callable[[opentelemetry.sdk.trace.ReadableSpan], bool]] = None, additional_headers: Optional[Dict[str, str]] = None, tracer_provider: Optional[opentelemetry.sdk.trace.TracerProvider] = None, id_generator: Optional[opentelemetry.sdk.trace.id_generator.IdGenerator] = None, span_exporter: Optional[opentelemetry.sdk.trace.export.SpanExporter] = None)
281    def __init__(
282        self,
283        *,
284        public_key: Optional[str] = None,
285        secret_key: Optional[str] = None,
286        base_url: Optional[str] = None,
287        host: Optional[str] = None,
288        timeout: Optional[int] = None,
289        httpx_client: Optional[httpx.Client] = None,
290        debug: bool = False,
291        tracing_enabled: Optional[bool] = True,
292        flush_at: Optional[int] = None,
293        flush_interval: Optional[float] = None,
294        environment: Optional[str] = None,
295        release: Optional[str] = None,
296        media_upload_thread_count: Optional[int] = None,
297        sample_rate: Optional[float] = None,
298        mask: Optional[MaskFunction] = None,
299        mask_otel_spans: Optional[MaskOtelSpansFunction] = None,
300        blocked_instrumentation_scopes: Optional[List[str]] = None,
301        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
302        additional_headers: Optional[Dict[str, str]] = None,
303        tracer_provider: Optional[TracerProvider] = None,
304        id_generator: Optional[IdGenerator] = None,
305        span_exporter: Optional[SpanExporter] = None,
306    ):
307        self._base_url = (
308            base_url
309            or os.environ.get(LANGFUSE_BASE_URL)
310            or host
311            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
312        )
313        self._environment = environment or cast(
314            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
315        )
316        self._release = (
317            release
318            or os.environ.get(LANGFUSE_RELEASE, None)
319            or get_common_release_envs()
320        )
321        self._project_id: Optional[str] = None
322        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
323        if not 0.0 <= sample_rate <= 1.0:
324            raise ValueError(
325                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
326            )
327
328        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
329
330        self._tracing_enabled = (
331            tracing_enabled
332            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
333        )
334        if not self._tracing_enabled:
335            langfuse_logger.info(
336                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
337            )
338
339        debug = (
340            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
341        )
342        if debug:
343            logging.basicConfig(
344                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
345            )
346            langfuse_logger.setLevel(logging.DEBUG)
347
348        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
349        if public_key is None:
350            langfuse_logger.warning(
351                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
352                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
353            )
354            self._otel_tracer = otel_trace_api.NoOpTracer()
355            return
356
357        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
358        if secret_key is None:
359            langfuse_logger.warning(
360                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
361                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
362            )
363            self._otel_tracer = otel_trace_api.NoOpTracer()
364            return
365
366        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
367            langfuse_logger.warning(
368                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
369            )
370
371        if blocked_instrumentation_scopes is not None:
372            warnings.warn(
373                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
374                "Use `should_export_span` instead. Example: "
375                "from langfuse.span_filter import is_default_export_span; "
376                'blocked={"scope"}; should_export_span=lambda span: '
377                "is_default_export_span(span) and (span.instrumentation_scope is None or "
378                "span.instrumentation_scope.name not in blocked).",
379                DeprecationWarning,
380                stacklevel=2,
381            )
382
383        # Initialize api and tracer if requirements are met
384        self._resources = LangfuseResourceManager(
385            public_key=public_key,
386            secret_key=secret_key,
387            base_url=self._base_url,
388            timeout=timeout,
389            environment=self._environment,
390            release=release,
391            flush_at=flush_at,
392            flush_interval=flush_interval,
393            httpx_client=httpx_client,
394            media_upload_thread_count=media_upload_thread_count,
395            sample_rate=sample_rate,
396            mask=mask,
397            mask_otel_spans=mask_otel_spans,
398            tracing_enabled=self._tracing_enabled,
399            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
400            should_export_span=should_export_span,
401            additional_headers=additional_headers,
402            tracer_provider=tracer_provider,
403            id_generator=id_generator,
404            span_exporter=span_exporter,
405        )
406        self._mask = self._resources.mask
407
408        self._otel_tracer = (
409            self._resources.tracer
410            if self._tracing_enabled and self._resources.tracer is not None
411            else otel_trace_api.NoOpTracer()
412        )
413        self.api = self._resources.api
414        self.async_api = self._resources.async_api
api
async_api
def start_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> Union[LangfuseSpan, LangfuseGeneration, LangfuseAgent, LangfuseTool, LangfuseChain, LangfuseRetriever, LangfuseEvaluator, LangfuseEmbedding, LangfuseGuardrail]:
563    def start_observation(
564        self,
565        *,
566        trace_context: Optional[TraceContext] = None,
567        name: str,
568        as_type: ObservationTypeLiteralNoEvent = "span",
569        input: Optional[Any] = None,
570        output: Optional[Any] = None,
571        metadata: Optional[Any] = None,
572        version: Optional[str] = None,
573        level: Optional[SpanLevel] = None,
574        status_message: Optional[str] = None,
575        completion_start_time: Optional[datetime] = None,
576        model: Optional[str] = None,
577        model_parameters: Optional[Dict[str, MapValue]] = None,
578        usage_details: Optional[Dict[str, int]] = None,
579        cost_details: Optional[Dict[str, float]] = None,
580        prompt: Optional[PromptClient] = None,
581    ) -> Union[
582        LangfuseSpan,
583        LangfuseGeneration,
584        LangfuseAgent,
585        LangfuseTool,
586        LangfuseChain,
587        LangfuseRetriever,
588        LangfuseEvaluator,
589        LangfuseEmbedding,
590        LangfuseGuardrail,
591    ]:
592        """Create a new observation of the specified type.
593
594        This method creates a new observation but does not set it as the current span in the
595        context. To create and use an observation within a context, use start_as_current_observation().
596
597        Args:
598            trace_context: Optional context for connecting to an existing trace
599            name: Name of the observation
600            as_type: Type of observation to create (defaults to "span")
601            input: Input data for the operation
602            output: Output data from the operation
603            metadata: Additional metadata to associate with the observation
604            version: Version identifier for the code or component
605            level: Importance level of the observation
606            status_message: Optional status message for the observation
607            completion_start_time: When the model started generating (for generation types)
608            model: Name/identifier of the AI model used (for generation types)
609            model_parameters: Parameters used for the model (for generation types)
610            usage_details: Token usage information (for generation types)
611            cost_details: Cost information (for generation types)
612            prompt: Associated prompt template (for generation types)
613
614        Returns:
615            An observation object of the appropriate type that must be ended with .end()
616        """
617        if trace_context:
618            trace_id = trace_context.get("trace_id", None)
619            parent_span_id = trace_context.get("parent_span_id", None)
620
621            if trace_id:
622                remote_parent_span = self._create_remote_parent_span(
623                    trace_id=trace_id, parent_span_id=parent_span_id
624                )
625
626                with otel_trace_api.use_span(
627                    cast(otel_trace_api.Span, remote_parent_span)
628                ):
629                    otel_span = self._otel_tracer.start_span(name=name)
630                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
631
632                    return self._create_observation_from_otel_span(
633                        otel_span=otel_span,
634                        as_type=as_type,
635                        input=input,
636                        output=output,
637                        metadata=metadata,
638                        version=version,
639                        level=level,
640                        status_message=status_message,
641                        completion_start_time=completion_start_time,
642                        model=model,
643                        model_parameters=model_parameters,
644                        usage_details=usage_details,
645                        cost_details=cost_details,
646                        prompt=prompt,
647                    )
648
649        otel_span = self._otel_tracer.start_span(name=name)
650
651        return self._create_observation_from_otel_span(
652            otel_span=otel_span,
653            as_type=as_type,
654            input=input,
655            output=output,
656            metadata=metadata,
657            version=version,
658            level=level,
659            status_message=status_message,
660            completion_start_time=completion_start_time,
661            model=model,
662            model_parameters=model_parameters,
663            usage_details=usage_details,
664            cost_details=cost_details,
665            prompt=prompt,
666        )

Create a new observation of the specified type.

This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation
  • status_message: Optional status message for the observation
  • completion_start_time: When the model started generating (for generation types)
  • model: Name/identifier of the AI model used (for generation types)
  • model_parameters: Parameters used for the model (for generation types)
  • usage_details: Token usage information (for generation types)
  • cost_details: Cost information (for generation types)
  • prompt: Associated prompt template (for generation types)
Returns:

An observation object of the appropriate type that must be ended with .end()

def start_as_current_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, end_on_exit: Optional[bool] = None) -> Union[opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration], opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan], opentelemetry.util._decorator._AgnosticContextManager[LangfuseAgent], opentelemetry.util._decorator._AgnosticContextManager[LangfuseTool], opentelemetry.util._decorator._AgnosticContextManager[LangfuseChain], opentelemetry.util._decorator._AgnosticContextManager[LangfuseRetriever], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEvaluator], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEmbedding], opentelemetry.util._decorator._AgnosticContextManager[LangfuseGuardrail]]:
 896    def start_as_current_observation(
 897        self,
 898        *,
 899        trace_context: Optional[TraceContext] = None,
 900        name: str,
 901        as_type: ObservationTypeLiteralNoEvent = "span",
 902        input: Optional[Any] = None,
 903        output: Optional[Any] = None,
 904        metadata: Optional[Any] = None,
 905        version: Optional[str] = None,
 906        level: Optional[SpanLevel] = None,
 907        status_message: Optional[str] = None,
 908        completion_start_time: Optional[datetime] = None,
 909        model: Optional[str] = None,
 910        model_parameters: Optional[Dict[str, MapValue]] = None,
 911        usage_details: Optional[Dict[str, int]] = None,
 912        cost_details: Optional[Dict[str, float]] = None,
 913        prompt: Optional[PromptClient] = None,
 914        end_on_exit: Optional[bool] = None,
 915    ) -> Union[
 916        _AgnosticContextManager[LangfuseGeneration],
 917        _AgnosticContextManager[LangfuseSpan],
 918        _AgnosticContextManager[LangfuseAgent],
 919        _AgnosticContextManager[LangfuseTool],
 920        _AgnosticContextManager[LangfuseChain],
 921        _AgnosticContextManager[LangfuseRetriever],
 922        _AgnosticContextManager[LangfuseEvaluator],
 923        _AgnosticContextManager[LangfuseEmbedding],
 924        _AgnosticContextManager[LangfuseGuardrail],
 925    ]:
 926        """Create a new observation and set it as the current span in a context manager.
 927
 928        This method creates a new observation of the specified type and sets it as the
 929        current span within a context manager. Use this method with a 'with' statement to
 930        automatically handle the observation lifecycle within a code block.
 931
 932        The created observation will be the child of the current span in the context.
 933
 934        Args:
 935            trace_context: Optional context for connecting to an existing trace
 936            name: Name of the observation (e.g., function or operation name)
 937            as_type: Type of observation to create (defaults to "span")
 938            input: Input data for the operation (can be any JSON-serializable object)
 939            output: Output data from the operation (can be any JSON-serializable object)
 940            metadata: Additional metadata to associate with the observation
 941            version: Version identifier for the code or component
 942            level: Importance level of the observation (info, warning, error)
 943            status_message: Optional status message for the observation
 944            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 945
 946            The following parameters are available when as_type is: "generation" or "embedding".
 947            completion_start_time: When the model started generating the response
 948            model: Name/identifier of the AI model used (e.g., "gpt-4")
 949            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 950            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 951            cost_details: Cost information for the model call
 952            prompt: Associated prompt template from Langfuse prompt management
 953
 954        Returns:
 955            A context manager that yields the appropriate observation type based on as_type
 956
 957        Example:
 958            ```python
 959            # Create a span
 960            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
 961                # Do work
 962                result = process_data()
 963                span.update(output=result)
 964
 965                # Create a child span automatically
 966                with span.start_as_current_observation(name="sub-operation") as child_span:
 967                    # Do sub-operation work
 968                    child_span.update(output="sub-result")
 969
 970            # Create a tool observation
 971            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
 972                # Do tool work
 973                results = search_web(query)
 974                tool.update(output=results)
 975
 976            # Create a generation observation
 977            with langfuse.start_as_current_observation(
 978                name="answer-generation",
 979                as_type="generation",
 980                model="gpt-4"
 981            ) as generation:
 982                # Generate answer
 983                response = llm.generate(...)
 984                generation.update(output=response)
 985            ```
 986        """
 987        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 988            if trace_context:
 989                trace_id = trace_context.get("trace_id", None)
 990                parent_span_id = trace_context.get("parent_span_id", None)
 991
 992                if trace_id:
 993                    remote_parent_span = self._create_remote_parent_span(
 994                        trace_id=trace_id, parent_span_id=parent_span_id
 995                    )
 996
 997                    return cast(
 998                        Union[
 999                            _AgnosticContextManager[LangfuseGeneration],
1000                            _AgnosticContextManager[LangfuseEmbedding],
1001                        ],
1002                        self._create_span_with_parent_context(
1003                            as_type=as_type,
1004                            name=name,
1005                            remote_parent_span=remote_parent_span,
1006                            parent=None,
1007                            end_on_exit=end_on_exit,
1008                            input=input,
1009                            output=output,
1010                            metadata=metadata,
1011                            version=version,
1012                            level=level,
1013                            status_message=status_message,
1014                            completion_start_time=completion_start_time,
1015                            model=model,
1016                            model_parameters=model_parameters,
1017                            usage_details=usage_details,
1018                            cost_details=cost_details,
1019                            prompt=prompt,
1020                        ),
1021                    )
1022
1023            return cast(
1024                Union[
1025                    _AgnosticContextManager[LangfuseGeneration],
1026                    _AgnosticContextManager[LangfuseEmbedding],
1027                ],
1028                self._start_as_current_otel_span_with_processed_media(
1029                    as_type=as_type,
1030                    name=name,
1031                    end_on_exit=end_on_exit,
1032                    input=input,
1033                    output=output,
1034                    metadata=metadata,
1035                    version=version,
1036                    level=level,
1037                    status_message=status_message,
1038                    completion_start_time=completion_start_time,
1039                    model=model,
1040                    model_parameters=model_parameters,
1041                    usage_details=usage_details,
1042                    cost_details=cost_details,
1043                    prompt=prompt,
1044                ),
1045            )
1046
1047        if as_type in get_observation_types_list(ObservationTypeSpanLike):
1048            if trace_context:
1049                trace_id = trace_context.get("trace_id", None)
1050                parent_span_id = trace_context.get("parent_span_id", None)
1051
1052                if trace_id:
1053                    remote_parent_span = self._create_remote_parent_span(
1054                        trace_id=trace_id, parent_span_id=parent_span_id
1055                    )
1056
1057                    return cast(
1058                        Union[
1059                            _AgnosticContextManager[LangfuseSpan],
1060                            _AgnosticContextManager[LangfuseAgent],
1061                            _AgnosticContextManager[LangfuseTool],
1062                            _AgnosticContextManager[LangfuseChain],
1063                            _AgnosticContextManager[LangfuseRetriever],
1064                            _AgnosticContextManager[LangfuseEvaluator],
1065                            _AgnosticContextManager[LangfuseGuardrail],
1066                        ],
1067                        self._create_span_with_parent_context(
1068                            as_type=as_type,
1069                            name=name,
1070                            remote_parent_span=remote_parent_span,
1071                            parent=None,
1072                            end_on_exit=end_on_exit,
1073                            input=input,
1074                            output=output,
1075                            metadata=metadata,
1076                            version=version,
1077                            level=level,
1078                            status_message=status_message,
1079                        ),
1080                    )
1081
1082            return cast(
1083                Union[
1084                    _AgnosticContextManager[LangfuseSpan],
1085                    _AgnosticContextManager[LangfuseAgent],
1086                    _AgnosticContextManager[LangfuseTool],
1087                    _AgnosticContextManager[LangfuseChain],
1088                    _AgnosticContextManager[LangfuseRetriever],
1089                    _AgnosticContextManager[LangfuseEvaluator],
1090                    _AgnosticContextManager[LangfuseGuardrail],
1091                ],
1092                self._start_as_current_otel_span_with_processed_media(
1093                    as_type=as_type,
1094                    name=name,
1095                    end_on_exit=end_on_exit,
1096                    input=input,
1097                    output=output,
1098                    metadata=metadata,
1099                    version=version,
1100                    level=level,
1101                    status_message=status_message,
1102                ),
1103            )
1104
1105        # This should never be reached since all valid types are handled above
1106        langfuse_logger.warning(
1107            f"Unknown observation type: {as_type}, falling back to span"
1108        )
1109        return self._start_as_current_otel_span_with_processed_media(
1110            as_type="span",
1111            name=name,
1112            end_on_exit=end_on_exit,
1113            input=input,
1114            output=output,
1115            metadata=metadata,
1116            version=version,
1117            level=level,
1118            status_message=status_message,
1119        )

Create a new observation and set it as the current span in a context manager.

This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.

The created observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation (e.g., function or operation name)
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Severity level of the observation (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the observation
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
  • Note: the parameters listed below apply only when as_type is "generation" or "embedding".
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A context manager that yields the appropriate observation type based on as_type

Example:
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_observation(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
def update_current_generation( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> None:
1311    def update_current_generation(
1312        self,
1313        *,
1314        name: Optional[str] = None,
1315        input: Optional[Any] = None,
1316        output: Optional[Any] = None,
1317        metadata: Optional[Any] = None,
1318        version: Optional[str] = None,
1319        level: Optional[SpanLevel] = None,
1320        status_message: Optional[str] = None,
1321        completion_start_time: Optional[datetime] = None,
1322        model: Optional[str] = None,
1323        model_parameters: Optional[Dict[str, MapValue]] = None,
1324        usage_details: Optional[Dict[str, int]] = None,
1325        cost_details: Optional[Dict[str, float]] = None,
1326        prompt: Optional[PromptClient] = None,
1327    ) -> None:
1328        """Update the current active generation span with new information.
1329
1330        This method updates the current generation span in the active context with
1331        additional information. It's useful for adding output, usage stats, or other
1332        details that become available during or after model generation.
1333
1334        Args:
1335            name: The generation name
1336            input: Updated input data for the model
1337            output: Output from the model (e.g., completions)
1338            metadata: Additional metadata to associate with the generation
1339            version: Version identifier for the model or component
1340            level: Importance level of the generation (info, warning, error)
1341            status_message: Optional status message for the generation
1342            completion_start_time: When the model started generating the response
1343            model: Name/identifier of the AI model used (e.g., "gpt-4")
1344            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1345            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1346            cost_details: Cost information for the model call
1347            prompt: Associated prompt template from Langfuse prompt management
1348
1349        Example:
1350            ```python
1351            with langfuse.start_as_current_generation(name="answer-query") as generation:
1352                # Initial setup and API call
1353                response = llm.generate(...)
1354
1355                # Update with results that weren't available at creation time
1356                langfuse.update_current_generation(
1357                    output=response.text,
1358                    usage_details={
1359                        "prompt_tokens": response.usage.prompt_tokens,
1360                        "completion_tokens": response.usage.completion_tokens
1361                    }
1362                )
1363            ```
1364        """
1365        if not self._tracing_enabled:
1366            langfuse_logger.debug(
1367                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1368            )
1369            return
1370
1371        current_otel_span = self._get_current_otel_span()
1372
1373        if current_otel_span is not None:
1374            generation = LangfuseGeneration(
1375                otel_span=current_otel_span, langfuse_client=self
1376            )
1377
1378            if name:
1379                current_otel_span.update_name(name)
1380
1381            generation.update(
1382                input=input,
1383                output=output,
1384                metadata=metadata,
1385                version=version,
1386                level=level,
1387                status_message=status_message,
1388                completion_start_time=completion_start_time,
1389                model=model,
1390                model_parameters=model_parameters,
1391                usage_details=usage_details,
1392                cost_details=cost_details,
1393                prompt=prompt,
1394            )

Update the current active generation span with new information.

This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.

Arguments:
  • name: The generation name
  • input: Updated input data for the model
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Severity level of the generation (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
def update_current_span( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> None:
1396    def update_current_span(
1397        self,
1398        *,
1399        name: Optional[str] = None,
1400        input: Optional[Any] = None,
1401        output: Optional[Any] = None,
1402        metadata: Optional[Any] = None,
1403        version: Optional[str] = None,
1404        level: Optional[SpanLevel] = None,
1405        status_message: Optional[str] = None,
1406    ) -> None:
1407        """Update the current active span with new information.
1408
1409        This method updates the current span in the active context with
1410        additional information. It's useful for adding outputs or metadata
1411        that become available during execution.
1412
1413        Args:
1414            name: The span name
1415            input: Updated input data for the operation
1416            output: Output data from the operation
1417            metadata: Additional metadata to associate with the span
1418            version: Version identifier for the code or component
1419            level: Importance level of the span (info, warning, error)
1420            status_message: Optional status message for the span
1421
1422        Example:
1423            ```python
1424            with langfuse.start_as_current_observation(name="process-data") as span:
1425                # Initial processing
1426                result = process_first_part()
1427
1428                # Update with intermediate results
1429                langfuse.update_current_span(metadata={"intermediate_result": result})
1430
1431                # Continue processing
1432                final_result = process_second_part(result)
1433
1434                # Final update
1435                langfuse.update_current_span(output=final_result)
1436            ```
1437        """
1438        if not self._tracing_enabled:
1439            langfuse_logger.debug(
1440                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1441            )
1442            return
1443
1444        current_otel_span = self._get_current_otel_span()
1445
1446        if current_otel_span is not None:
1447            span_class = self._get_span_class(
1448                self._get_observation_type_from_otel_span(current_otel_span)
1449            )
1450            span = span_class(
1451                otel_span=current_otel_span,
1452                langfuse_client=self,
1453                environment=self._environment,
1454                release=self._release,
1455            )
1456
1457            if name:
1458                current_otel_span.update_name(name)
1459
1460            span.update(
1461                input=input,
1462                output=output,
1463                metadata=metadata,
1464                version=version,
1465                level=level,
1466                status_message=status_message,
1467            )

Update the current active span with new information.

This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.

Arguments:
  • name: The span name
  • input: Updated input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Severity level of the span (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the span
Example:
with langfuse.start_as_current_observation(name="process-data") as span:
    # Initial processing
    result = process_first_part()

    # Update with intermediate results
    langfuse.update_current_span(metadata={"intermediate_result": result})

    # Continue processing
    final_result = process_second_part(result)

    # Final update
    langfuse.update_current_span(output=final_result)
@deprecated('Trace-level input/output is deprecated. For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. This method will be removed in a future major version.')
def set_current_trace_io( self, *, input: Optional[Any] = None, output: Optional[Any] = None) -> None:
1469    @deprecated(
1470        "Trace-level input/output is deprecated. "
1471        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
1472        "This method will be removed in a future major version."
1473    )
1474    def set_current_trace_io(
1475        self,
1476        *,
1477        input: Optional[Any] = None,
1478        output: Optional[Any] = None,
1479    ) -> None:
1480        """Set trace-level input and output for the current span's trace.
1481
1482        .. deprecated::
1483            This is a legacy method for backward compatibility with Langfuse platform
1484            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
1485            evaluators). It will be removed in a future major version.
1486
1487            For setting other trace attributes (user_id, session_id, metadata, tags, version),
1488            use :meth:`propagate_attributes` instead.
1489
1490        Args:
1491            input: Input data to associate with the trace.
1492            output: Output data to associate with the trace.
1493        """
1494        if not self._tracing_enabled:
1495            langfuse_logger.debug(
1496                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
1497            )
1498            return
1499
1500        current_otel_span = self._get_current_otel_span()
1501
1502        if current_otel_span is not None and current_otel_span.is_recording():
1503            span_class = self._get_span_class(
1504                self._get_observation_type_from_otel_span(current_otel_span)
1505            )
1506            span = span_class(
1507                otel_span=current_otel_span,
1508                langfuse_client=self,
1509                environment=self._environment,
1510                release=self._release,
1511            )
1512
1513            span.set_trace_io(
1514                input=input,
1515                output=output,
1516            )

Set trace-level input and output for the current span's trace.

Deprecated: This is a legacy method for backward compatibility with Langfuse platform features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future major version.

For setting other trace attributes (user_id, session_id, metadata, tags, version), use propagate_attributes() instead.

Arguments:
  • input: Input data to associate with the trace.
  • output: Output data to associate with the trace.
def set_current_trace_as_public(self) -> None:
1518    def set_current_trace_as_public(self) -> None:
1519        """Make the current trace publicly accessible via its URL.
1520
1521        When a trace is published, anyone with the trace link can view the full trace
1522        without needing to be logged in to Langfuse. This action cannot be undone
1523        programmatically - once published, the entire trace becomes public.
1524
1525        This is a convenience method that publishes the trace from the currently
1526        active span context. Use this when you want to make a trace public from
1527        within a traced function without needing direct access to the span object.
1528        """
1529        if not self._tracing_enabled:
1530            langfuse_logger.debug(
1531                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
1532            )
1533            return
1534
1535        current_otel_span = self._get_current_otel_span()
1536
1537        if current_otel_span is not None and current_otel_span.is_recording():
1538            span_class = self._get_span_class(
1539                self._get_observation_type_from_otel_span(current_otel_span)
1540            )
1541            span = span_class(
1542                otel_span=current_otel_span,
1543                langfuse_client=self,
1544                environment=self._environment,
1545            )
1546
1547            span.set_trace_as_public()

Make the current trace publicly accessible via its URL.

When a trace is published, anyone with the trace link can view the full trace without needing to be logged in to Langfuse. This action cannot be undone programmatically - once published, the entire trace becomes public.

This is a convenience method that publishes the trace from the currently active span context. Use this when you want to make a trace public from within a traced function without needing direct access to the span object.

def create_event( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseEvent:
1549    def create_event(
1550        self,
1551        *,
1552        trace_context: Optional[TraceContext] = None,
1553        name: str,
1554        input: Optional[Any] = None,
1555        output: Optional[Any] = None,
1556        metadata: Optional[Any] = None,
1557        version: Optional[str] = None,
1558        level: Optional[SpanLevel] = None,
1559        status_message: Optional[str] = None,
1560    ) -> LangfuseEvent:
1561        """Create a new Langfuse observation of type 'EVENT'.
1562
1563        The created Langfuse Event observation will be the child of the current span in the context.
1564
1565        Args:
1566            trace_context: Optional context for connecting to an existing trace
1567            name: Name of the span (e.g., function or operation name)
1568            input: Input data for the operation (can be any JSON-serializable object)
1569            output: Output data from the operation (can be any JSON-serializable object)
1570            metadata: Additional metadata to associate with the span
1571            version: Version identifier for the code or component
1572            level: Importance level of the span (info, warning, error)
1573            status_message: Optional status message for the span
1574
1575        Returns:
1576            The Langfuse Event object
1577
1578        Example:
1579            ```python
1580            event = langfuse.create_event(name="process-event")
1581            ```
1582        """
1583        timestamp = time_ns()
1584
1585        if trace_context:
1586            trace_id = trace_context.get("trace_id", None)
1587            parent_span_id = trace_context.get("parent_span_id", None)
1588
1589            if trace_id:
1590                remote_parent_span = self._create_remote_parent_span(
1591                    trace_id=trace_id, parent_span_id=parent_span_id
1592                )
1593
1594                with otel_trace_api.use_span(
1595                    cast(otel_trace_api.Span, remote_parent_span)
1596                ):
1597                    otel_span = self._otel_tracer.start_span(
1598                        name=name, start_time=timestamp
1599                    )
1600                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1601
1602                    return cast(
1603                        LangfuseEvent,
1604                        LangfuseEvent(
1605                            otel_span=otel_span,
1606                            langfuse_client=self,
1607                            environment=self._environment,
1608                            release=self._release,
1609                            input=input,
1610                            output=output,
1611                            metadata=metadata,
1612                            version=version,
1613                            level=level,
1614                            status_message=status_message,
1615                        ).end(end_time=timestamp),
1616                    )
1617
1618        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1619
1620        return cast(
1621            LangfuseEvent,
1622            LangfuseEvent(
1623                otel_span=otel_span,
1624                langfuse_client=self,
1625                environment=self._environment,
1626                release=self._release,
1627                input=input,
1628                output=output,
1629                metadata=metadata,
1630                version=version,
1631                level=level,
1632                status_message=status_message,
1633            ).end(end_time=timestamp),
1634        )

Create a new Langfuse observation of type 'EVENT'.

The created Langfuse Event observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the span
Returns:

The Langfuse Event object

Example:
event = langfuse.create_event(name="process-event")
@staticmethod
def create_trace_id(*, seed: Optional[str] = None) -> str:
1723    @staticmethod
1724    def create_trace_id(*, seed: Optional[str] = None) -> str:
1725        """Create a unique trace ID for use with Langfuse.
1726
1727        This method generates a unique trace ID for use with various Langfuse APIs.
1728        It can either generate a random ID or create a deterministic ID based on
1729        a seed string.
1730
1731        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1732        This method ensures the generated ID meets this requirement. If you need to
1733        correlate an external ID with a Langfuse trace ID, use the external ID as the
1734        seed to get a valid, deterministic Langfuse trace ID.
1735
1736        Args:
1737            seed: Optional string to use as a seed for deterministic ID generation.
1738                 If provided, the same seed will always produce the same ID.
1739                 If not provided, a random ID will be generated.
1740
1741        Returns:
1742            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1743
1744        Example:
1745            ```python
1746            # Generate a random trace ID
1747            trace_id = langfuse.create_trace_id()
1748
1749            # Generate a deterministic ID based on a seed
1750            session_trace_id = langfuse.create_trace_id(seed="session-456")
1751
1752            # Correlate an external ID with a Langfuse trace ID
1753            external_id = "external-system-123456"
1754            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1755
1756            # Use the ID with trace context
1757            with langfuse.start_as_current_observation(
1758                name="process-request",
1759                trace_context={"trace_id": trace_id}
1760            ) as span:
1761                # Operation will be part of the specific trace
1762                pass
1763            ```
1764        """
1765        if not seed:
1766            trace_id_int = RandomIdGenerator().generate_trace_id()
1767
1768            return Langfuse._format_otel_trace_id(trace_id_int)
1769
1770        return sha256(seed.encode("utf-8")).digest()[:16].hex()

Create a unique trace ID for use with Langfuse.

This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.

Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.

Arguments:
  • seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:

A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

Example:
# Generate a random trace ID
trace_id = langfuse.create_trace_id()

# Generate a deterministic ID based on a seed
session_trace_id = langfuse.create_trace_id(seed="session-456")

# Correlate an external ID with a Langfuse trace ID
external_id = "external-system-123456"
correlated_trace_id = langfuse.create_trace_id(seed=external_id)

# Use the ID with trace context
with langfuse.start_as_current_observation(
    name="process-request",
    trace_context={"trace_id": trace_id}
) as span:
    # Operation will be part of the specific trace
    pass
def create_score( self, *, name: str, value: Union[float, str], session_id: Optional[str] = None, dataset_run_id: Optional[str] = None, trace_id: Optional[str] = None, observation_id: Optional[str] = None, score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT', 'CORRECTION']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None, timestamp: Optional[datetime.datetime] = None, environment: Optional[str] = None) -> None:
1852    def create_score(
1853        self,
1854        *,
1855        name: str,
1856        value: Union[float, str],
1857        session_id: Optional[str] = None,
1858        dataset_run_id: Optional[str] = None,
1859        trace_id: Optional[str] = None,
1860        observation_id: Optional[str] = None,
1861        score_id: Optional[str] = None,
1862        data_type: Optional[ScoreDataType] = None,
1863        comment: Optional[str] = None,
1864        config_id: Optional[str] = None,
1865        metadata: Optional[Any] = None,
1866        timestamp: Optional[datetime] = None,
1867        environment: Optional[str] = None,
1868    ) -> None:
1869        """Create a score for a specific trace or observation.
1870
1871        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
1872        used to track quality metrics, user feedback, or automated evaluations.
1873
1874        Args:
1875            name: Name of the score (e.g., "relevance", "accuracy")
1876            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
1877            session_id: ID of the Langfuse session to associate the score with
1878            dataset_run_id: ID of the Langfuse dataset run to associate the score with
1879            trace_id: ID of the Langfuse trace to associate the score with
1880            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
1881            score_id: Optional custom ID for the score (auto-generated if not provided)
1882            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
1883            comment: Optional comment or explanation for the score
1884            config_id: Optional ID of a score config defined in Langfuse
1885            metadata: Optional metadata to be attached to the score
1886            timestamp: Optional timestamp for the score (defaults to current UTC time)
1887            environment: Optional environment override for this score. If omitted,
1888                the score uses the client-level environment from
1889                `Langfuse(environment=...)` or `LANGFUSE_TRACING_ENVIRONMENT`.
1890                Langfuse observation wrapper methods pass their resolved span
1891                environment here so scores created via `span.score()` or
1892                `span.score_trace()` stay grouped with the scored observation or
1893                trace, including request-scoped environments propagated with
1894                `propagate_attributes(environment=...)`.
1895
1896        Example:
1897            ```python
1898            # Create a numeric score for accuracy
1899            langfuse.create_score(
1900                name="accuracy",
1901                value=0.92,
1902                trace_id="abcdef1234567890abcdef1234567890",
1903                data_type="NUMERIC",
1904                comment="High accuracy with minor irrelevant details"
1905            )
1906
1907            # Create a categorical score for sentiment
1908            langfuse.create_score(
1909                name="sentiment",
1910                value="positive",
1911                trace_id="abcdef1234567890abcdef1234567890",
1912                observation_id="abcdef1234567890",
1913                data_type="CATEGORICAL"
1914            )
1915            ```
1916        """
1917        if not self._tracing_enabled:
1918            return
1919
1920        score_id = score_id or self._create_observation_id()
1921
1922        try:
1923            new_body = ScoreBody(
1924                id=score_id,
1925                sessionId=session_id,
1926                datasetRunId=dataset_run_id,
1927                traceId=trace_id,
1928                observationId=observation_id,
1929                name=name,
1930                value=value,
1931                dataType=data_type,  # type: ignore
1932                comment=comment,
1933                configId=config_id,
1934                environment=environment or self._environment,
1935                metadata=metadata,
1936            )
1937
1938            event = {
1939                "id": self.create_trace_id(),
1940                "type": "score-create",
1941                "timestamp": timestamp or _get_timestamp(),
1942                "body": new_body,
1943            }
1944
1945            if self._resources is not None:
1946                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
1947                force_sample = (
1948                    not self._is_valid_trace_id(trace_id) if trace_id else True
1949                )
1950
1951                self._resources.add_score_task(
1952                    event,
1953                    force_sample=force_sample,
1954                )
1955
1956        except Exception as e:
1957            langfuse_logger.exception(
1958                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
1959            )

Create a score for a specific trace or observation.

This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
  • session_id: ID of the Langfuse session to associate the score with
  • dataset_run_id: ID of the Langfuse dataset run to associate the score with
  • trace_id: ID of the Langfuse trace to associate the score with
  • observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
  • timestamp: Optional timestamp for the score (defaults to current UTC time)
  • environment: Optional environment override for this score. If omitted, the score uses the client-level environment from Langfuse(environment=...) or LANGFUSE_TRACING_ENVIRONMENT. Langfuse observation wrapper methods pass their resolved span environment here so scores created via span.score() or span.score_trace() stay grouped with the scored observation or trace, including request-scoped environments propagated with propagate_attributes(environment=...).
Example:
# Create a numeric score for accuracy
langfuse.create_score(
    name="accuracy",
    value=0.92,
    trace_id="abcdef1234567890abcdef1234567890",
    data_type="NUMERIC",
    comment="High accuracy with minor irrelevant details"
)

# Create a categorical score for sentiment
langfuse.create_score(
    name="sentiment",
    value="positive",
    trace_id="abcdef1234567890abcdef1234567890",
    observation_id="abcdef1234567890",
    data_type="CATEGORICAL"
)
def score_current_span( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT', 'CORRECTION']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
2022    def score_current_span(
2023        self,
2024        *,
2025        name: str,
2026        value: Union[float, str],
2027        score_id: Optional[str] = None,
2028        data_type: Optional[ScoreDataType] = None,
2029        comment: Optional[str] = None,
2030        config_id: Optional[str] = None,
2031        metadata: Optional[Any] = None,
2032    ) -> None:
2033        """Create a score for the current active span.
2034
2035        This method scores the currently active span in the context. It's a convenient
2036        way to score the current operation without needing to know its trace and span IDs.
2037        If the active span has a `langfuse.environment` attribute, including one
2038        set by `propagate_attributes(environment=...)`, the score uses that
2039        environment. Otherwise it uses the client-level environment.
2040
2041        Args:
2042            name: Name of the score (e.g., "relevance", "accuracy")
2043            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
2044            score_id: Optional custom ID for the score (auto-generated if not provided)
2045            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
2046            comment: Optional comment or explanation for the score
2047            config_id: Optional ID of a score config defined in Langfuse
2048            metadata: Optional metadata to be attached to the score
2049
2050        Example:
2051            ```python
2052            with langfuse.start_as_current_generation(name="answer-query") as generation:
2053                # Generate answer
2054                response = generate_answer(...)
2055                generation.update(output=response)
2056
2057                # Score the generation
2058                langfuse.score_current_span(
2059                    name="relevance",
2060                    value=0.85,
2061                    data_type="NUMERIC",
2062                    comment="Mostly relevant but contains some tangential information",
2063                    metadata={"model": "gpt-4", "prompt_version": "v2"}
2064                )
2065            ```
2066        """
2067        current_span = self._get_current_otel_span()
2068
2069        if current_span is not None:
2070            trace_id = self._get_otel_trace_id(current_span)
2071            observation_id = self._get_otel_span_id(current_span)
2072
2073            langfuse_logger.info(
2074                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
2075            )
2076
2077            self.create_score(
2078                trace_id=trace_id,
2079                observation_id=observation_id,
2080                name=name,
2081                value=cast(str, value),
2082                score_id=score_id,
2083                data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
2084                comment=comment,
2085                config_id=config_id,
2086                metadata=metadata,
2087                environment=get_string_span_attribute(
2088                    current_span, LangfuseOtelSpanAttributes.ENVIRONMENT
2089                ),
2090            )

Create a score for the current active span.

This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs. If the active span has a langfuse.environment attribute, including one set by propagate_attributes(environment=...), the score uses that environment. Otherwise it uses the client-level environment.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Generate answer
    response = generate_answer(...)
    generation.update(output=response)

    # Score the generation
    langfuse.score_current_span(
        name="relevance",
        value=0.85,
        data_type="NUMERIC",
        comment="Mostly relevant but contains some tangential information",
        metadata={"model": "gpt-4", "prompt_version": "v2"}
    )
def score_current_trace( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT', 'CORRECTION']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
2120    def score_current_trace(
2121        self,
2122        *,
2123        name: str,
2124        value: Union[float, str],
2125        score_id: Optional[str] = None,
2126        data_type: Optional[ScoreDataType] = None,
2127        comment: Optional[str] = None,
2128        config_id: Optional[str] = None,
2129        metadata: Optional[Any] = None,
2130    ) -> None:
2131        """Create a score for the current trace.
2132
2133        This method scores the trace of the currently active span. Unlike score_current_span,
2134        this method associates the score with the entire trace rather than a specific span.
2135        It's useful for scoring overall performance or quality of the entire operation.
2136        If the active span has a `langfuse.environment` attribute, including one
2137        set by `propagate_attributes(environment=...)`, the score uses that
2138        environment. Otherwise it uses the client-level environment.
2139
2140        Args:
2141            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2142            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
2143            score_id: Optional custom ID for the score (auto-generated if not provided)
2144            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
2145            comment: Optional comment or explanation for the score
2146            config_id: Optional ID of a score config defined in Langfuse
2147            metadata: Optional metadata to be attached to the score
2148
2149        Example:
2150            ```python
2151            with langfuse.start_as_current_observation(name="process-user-request") as span:
2152                # Process request
2153                result = process_complete_request()
2154                span.update(output=result)
2155
2156                # Score the overall trace
2157                langfuse.score_current_trace(
2158                    name="overall_quality",
2159                    value=0.95,
2160                    data_type="NUMERIC",
2161                    comment="High quality end-to-end response",
2162                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2163                )
2164            ```
2165        """
2166        current_span = self._get_current_otel_span()
2167
2168        if current_span is not None:
2169            trace_id = self._get_otel_trace_id(current_span)
2170
2171            langfuse_logger.info(
2172                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2173            )
2174
2175            self.create_score(
2176                trace_id=trace_id,
2177                name=name,
2178                value=cast(str, value),
2179                score_id=score_id,
2180                data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
2181                comment=comment,
2182                config_id=config_id,
2183                metadata=metadata,
2184                environment=get_string_span_attribute(
2185                    current_span, LangfuseOtelSpanAttributes.ENVIRONMENT
2186                ),
2187            )

Create a score for the current trace.

This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation. If the active span has a langfuse.environment attribute, including one set by propagate_attributes(environment=...), the score uses that environment. Otherwise it uses the client-level environment.

Arguments:
  • name: Name of the score (e.g., "user_satisfaction", "overall_quality")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_observation(name="process-user-request") as span:
    # Process request
    result = process_complete_request()
    span.update(output=result)

    # Score the overall trace
    langfuse.score_current_trace(
        name="overall_quality",
        value=0.95,
        data_type="NUMERIC",
        comment="High quality end-to-end response",
        metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
    )
def flush(self) -> None:
2189    def flush(self) -> None:
2190        """Force flush all pending spans and events to the Langfuse API.
2191
2192        This method manually flushes any pending spans, scores, and other events to the
2193        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2194        before proceeding, without waiting for the automatic flush interval.
2195
2196        Example:
2197            ```python
2198            # Record some spans and scores
2199            with langfuse.start_as_current_observation(name="operation") as span:
2200                # Do work...
2201                pass
2202
2203            # Ensure all data is sent to Langfuse before proceeding
2204            langfuse.flush()
2205
2206            # Continue with other work
2207            ```
2208        """
2209        if self._resources is not None:
2210            self._resources.flush()

Force flush all pending spans and events to the Langfuse API.

This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.

Example:
# Record some spans and scores
with langfuse.start_as_current_observation(name="operation") as span:
    # Do work...
    pass

# Ensure all data is sent to Langfuse before proceeding
langfuse.flush()

# Continue with other work
def shutdown(self) -> None:
2212    def shutdown(self) -> None:
2213        """Shut down the Langfuse client and flush all pending data.
2214
2215        This method cleanly shuts down the Langfuse client, ensuring all pending data
2216        is flushed to the API and all background threads are properly terminated.
2217
2218        It's important to call this method when your application is shutting down to
2219        prevent data loss and resource leaks. For most applications, using the client
2220        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2221
2222        Example:
2223            ```python
2224            # Initialize Langfuse
2225            langfuse = Langfuse(public_key="...", secret_key="...")
2226
2227            # Use Langfuse throughout your application
2228            # ...
2229
2230            # When application is shutting down
2231            langfuse.shutdown()
2232            ```
2233        """
2234        if self._resources is not None:
2235            self._resources.shutdown()

Shut down the Langfuse client and flush all pending data.

This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.

It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.

Example:
# Initialize Langfuse
langfuse = Langfuse(public_key="...", secret_key="...")

# Use Langfuse throughout your application
# ...

# When application is shutting down
langfuse.shutdown()
def get_current_trace_id(self) -> Optional[str]:
2237    def get_current_trace_id(self) -> Optional[str]:
2238        """Get the trace ID of the current active span.
2239
2240        This method retrieves the trace ID from the currently active span in the context.
2241        It can be used to get the trace ID for referencing in logs, external systems,
2242        or for creating related operations.
2243
2244        Returns:
2245            The current trace ID as a 32-character lowercase hexadecimal string,
2246            or None if there is no active span.
2247
2248        Example:
2249            ```python
2250            with langfuse.start_as_current_observation(name="process-request") as span:
2251                # Get the current trace ID for reference
2252                trace_id = langfuse.get_current_trace_id()
2253
2254                # Use it for external correlation
2255                log.info(f"Processing request with trace_id: {trace_id}")
2256
2257                # Or pass to another system
2258                external_system.process(data, trace_id=trace_id)
2259            ```
2260        """
2261        if not self._tracing_enabled:
2262            langfuse_logger.debug(
2263                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2264            )
2265            return None
2266
2267        current_otel_span = self._get_current_otel_span()
2268
2269        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None

Get the trace ID of the current active span.

This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.

Returns:

The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_observation(name="process-request") as span:
    # Get the current trace ID for reference
    trace_id = langfuse.get_current_trace_id()

    # Use it for external correlation
    log.info(f"Processing request with trace_id: {trace_id}")

    # Or pass to another system
    external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
2271    def get_current_observation_id(self) -> Optional[str]:
2272        """Get the observation ID (span ID) of the current active span.
2273
2274        This method retrieves the observation ID from the currently active span in the context.
2275        It can be used to get the observation ID for referencing in logs, external systems,
2276        or for creating scores or other related operations.
2277
2278        Returns:
2279            The current observation ID as a 16-character lowercase hexadecimal string,
2280            or None if there is no active span.
2281
2282        Example:
2283            ```python
2284            with langfuse.start_as_current_observation(name="process-user-query") as span:
2285                # Get the current observation ID
2286                observation_id = langfuse.get_current_observation_id()
2287
2288                # Store it for later reference
2289                cache.set(f"query_{query_id}_observation", observation_id)
2290
2291                # Process the query...
2292            ```
2293        """
2294        if not self._tracing_enabled:
2295            langfuse_logger.debug(
2296                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2297            )
2298            return None
2299
2300        current_otel_span = self._get_current_otel_span()
2301
2302        return self._get_otel_span_id(current_otel_span) if current_otel_span else None

Get the observation ID (span ID) of the current active span.

This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.

Returns:

The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_observation(name="process-user-query") as span:
    # Get the current observation ID
    observation_id = langfuse.get_current_observation_id()

    # Store it for later reference
    cache.set(f"query_{query_id}_observation", observation_id)

    # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2315    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2316        """Get the URL to view a trace in the Langfuse UI.
2317
2318        This method generates a URL that links directly to a trace in the Langfuse UI.
2319        It's useful for providing links in logs, notifications, or debugging tools.
2320
2321        Args:
2322            trace_id: Optional trace ID to generate a URL for. If not provided,
2323                     the trace ID of the current active span will be used.
2324
2325        Returns:
2326            A URL string pointing to the trace in the Langfuse UI,
2327            or None if the project ID couldn't be retrieved or no trace ID is available.
2328
2329        Example:
2330            ```python
2331            # Get URL for the current trace
2332            with langfuse.start_as_current_observation(name="process-request") as span:
2333                trace_url = langfuse.get_trace_url()
2334                log.info(f"Processing trace: {trace_url}")
2335
2336            # Get URL for a specific trace
2337            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2338            send_notification(f"Review needed for trace: {specific_trace_url}")
2339            ```
2340        """
2341        final_trace_id = trace_id or self.get_current_trace_id()
2342        if not final_trace_id:
2343            return None
2344
2345        project_id = self._get_project_id()
2346
2347        return (
2348            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2349            if project_id and final_trace_id
2350            else None
2351        )

Get the URL to view a trace in the Langfuse UI.

This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.

Arguments:
  • trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:

A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.

Example:
# Get URL for the current trace
with langfuse.start_as_current_observation(name="process-request") as span:
    trace_url = langfuse.get_trace_url()
    log.info(f"Processing trace: {trace_url}")

# Get URL for a specific trace
specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
send_notification(f"Review needed for trace: {specific_trace_url}")
def get_dataset( self, name: str, *, fetch_items_page_size: Optional[int] = 50, version: Optional[datetime.datetime] = None) -> langfuse._client.datasets.DatasetClient:
2353    def get_dataset(
2354        self,
2355        name: str,
2356        *,
2357        fetch_items_page_size: Optional[int] = 50,
2358        version: Optional[datetime] = None,
2359    ) -> "DatasetClient":
2360        """Fetch a dataset by its name.
2361
2362        Args:
2363            name: The name of the dataset to fetch.
2364            fetch_items_page_size: All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2365            version: Retrieve dataset items as they existed at this specific point in time (UTC).
2366                If provided, returns the state of items at the specified UTC timestamp.
2367                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
2368
2369        Returns:
2370            DatasetClient: The dataset with the given name.
2371        """
2372        try:
2373            langfuse_logger.debug(f"Getting datasets {name}")
2374            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2375
2376            dataset_items: List[DatasetItem] = []
2377            page = 1
2378
2379            while True:
2380                new_items = self.api.dataset_items.list(
2381                    dataset_name=self._url_encode(name, is_url_param=True),
2382                    page=page,
2383                    limit=fetch_items_page_size,
2384                    version=version,
2385                )
2386                dataset_items.extend(
2387                    self._hydrate_dataset_item_media_references(item)
2388                    for item in new_items.data
2389                )
2390
2391                if new_items.meta.total_pages <= page:
2392                    break
2393
2394                page += 1
2395
2396            return DatasetClient(
2397                dataset=dataset,
2398                items=dataset_items,
2399                version=version,
2400                langfuse_client=self,
2401            )
2402
2403        except Error as e:
2404            handle_fern_exception(e)
2405            raise e

Fetch a dataset by its name.

Arguments:
  • name: The name of the dataset to fetch.
  • fetch_items_page_size: All items of the dataset will be fetched in chunks of this size. Defaults to 50.
  • version: Retrieve dataset items as they existed at this specific point in time (UTC). If provided, returns the state of items at the specified UTC timestamp. If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
Returns:

DatasetClient: The dataset with the given name.

def get_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DatasetRunWithItems:
2407    def get_dataset_run(
2408        self, *, dataset_name: str, run_name: str
2409    ) -> DatasetRunWithItems:
2410        """Fetch a dataset run by dataset name and run name.
2411
2412        Args:
2413            dataset_name (str): The name of the dataset.
2414            run_name (str): The name of the run.
2415
2416        Returns:
2417            DatasetRunWithItems: The dataset run with its items.
2418        """
2419        try:
2420            return cast(
2421                DatasetRunWithItems,
2422                self.api.datasets.get_run(
2423                    dataset_name=self._url_encode(dataset_name),
2424                    run_name=self._url_encode(run_name),
2425                    request_options=None,
2426                ),
2427            )
2428        except Error as e:
2429            handle_fern_exception(e)
2430            raise e

Fetch a dataset run by dataset name and run name.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DatasetRunWithItems: The dataset run with its items.

def get_dataset_runs( self, *, dataset_name: str, page: Optional[int] = None, limit: Optional[int] = None) -> langfuse.api.PaginatedDatasetRuns:
2432    def get_dataset_runs(
2433        self,
2434        *,
2435        dataset_name: str,
2436        page: Optional[int] = None,
2437        limit: Optional[int] = None,
2438    ) -> PaginatedDatasetRuns:
2439        """Fetch all runs for a dataset.
2440
2441        Args:
2442            dataset_name (str): The name of the dataset.
2443            page (Optional[int]): Page number, starts at 1.
2444            limit (Optional[int]): Limit of items per page.
2445
2446        Returns:
2447            PaginatedDatasetRuns: Paginated list of dataset runs.
2448        """
2449        try:
2450            return cast(
2451                PaginatedDatasetRuns,
2452                self.api.datasets.get_runs(
2453                    dataset_name=self._url_encode(dataset_name),
2454                    page=page,
2455                    limit=limit,
2456                    request_options=None,
2457                ),
2458            )
2459        except Error as e:
2460            handle_fern_exception(e)
2461            raise e

Fetch all runs for a dataset.

Arguments:
  • dataset_name (str): The name of the dataset.
  • page (Optional[int]): Page number, starts at 1.
  • limit (Optional[int]): Limit of items per page.
Returns:

PaginatedDatasetRuns: Paginated list of dataset runs.

def delete_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DeleteDatasetRunResponse:
2463    def delete_dataset_run(
2464        self, *, dataset_name: str, run_name: str
2465    ) -> DeleteDatasetRunResponse:
2466        """Delete a dataset run and all its run items. This action is irreversible.
2467
2468        Args:
2469            dataset_name (str): The name of the dataset.
2470            run_name (str): The name of the run.
2471
2472        Returns:
2473            DeleteDatasetRunResponse: Confirmation of deletion.
2474        """
2475        try:
2476            return cast(
2477                DeleteDatasetRunResponse,
2478                self.api.datasets.delete_run(
2479                    dataset_name=self._url_encode(dataset_name),
2480                    run_name=self._url_encode(run_name),
2481                    request_options=None,
2482                ),
2483            )
2484        except Error as e:
2485            handle_fern_exception(e)
2486            raise e

Delete a dataset run and all its run items. This action is irreversible.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DeleteDatasetRunResponse: Confirmation of deletion.

def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem]], task: langfuse.experiment.TaskFunction, evaluators: List[langfuse.experiment.EvaluatorFunction] = [], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, run_evaluators: List[langfuse.experiment.RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None, _dataset_version: Optional[datetime.datetime] = None) -> langfuse.experiment.ExperimentResult:
2488    def run_experiment(
2489        self,
2490        *,
2491        name: str,
2492        run_name: Optional[str] = None,
2493        description: Optional[str] = None,
2494        data: ExperimentData,
2495        task: TaskFunction,
2496        evaluators: List[EvaluatorFunction] = [],
2497        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2498        run_evaluators: List[RunEvaluatorFunction] = [],
2499        max_concurrency: int = 50,
2500        metadata: Optional[Dict[str, str]] = None,
2501        _dataset_version: Optional[datetime] = None,
2502    ) -> ExperimentResult:
2503        """Run an experiment on a dataset with automatic tracing and evaluation.
2504
2505        This method executes a task function on each item in the provided dataset,
2506        automatically traces all executions with Langfuse for observability, runs
2507        item-level and run-level evaluators on the outputs, and returns comprehensive
2508        results with evaluation metrics.
2509
2510        The experiment system provides:
2511        - Automatic tracing of all task executions
2512        - Concurrent processing with configurable limits
2513        - Comprehensive error handling that isolates failures
2514        - Integration with Langfuse datasets for experiment tracking
2515        - Flexible evaluation framework supporting both sync and async evaluators
2516
2517        Args:
2518            name: Human-readable name for the experiment. Used for identification
2519                in the Langfuse UI.
2520            run_name: Optional exact name for the experiment run. If provided, this will be
2521                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2522                If not provided, this will default to the experiment name appended with an ISO timestamp.
2523            description: Optional description explaining the experiment's purpose,
2524                methodology, or expected outcomes.
2525            data: Array of data items to process. Can be either:
2526                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2527                - List of Langfuse DatasetItem objects from dataset.items
2528            task: Function that processes each data item and returns output.
2529                Must accept 'item' as keyword argument and can return sync or async results.
2530                The task function signature should be: task(*, item, **kwargs) -> Any
2531            evaluators: List of functions to evaluate each item's output individually.
2532                Each evaluator receives input, output, expected_output, and metadata.
2533                Can return single Evaluation dict or list of Evaluation dicts.
2534            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2535                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2536                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2537                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2538            run_evaluators: List of functions to evaluate the entire experiment run.
2539                Each run evaluator receives all item_results and can compute aggregate metrics.
2540                Useful for calculating averages, distributions, or cross-item comparisons.
2541            max_concurrency: Maximum number of concurrent task executions (default: 50).
2542                Controls the number of items processed simultaneously. Adjust based on
2543                API rate limits and system resources.
2544            metadata: Optional metadata dictionary to attach to all experiment traces.
2545                This metadata will be included in every trace created during the experiment.
2546                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2547
2548        Returns:
2549            ExperimentResult containing:
2550            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2551            - item_results: List of results for each processed item with outputs and evaluations
2552            - run_evaluations: List of aggregate evaluation results for the entire run
2553            - experiment_id: Stable identifier for the experiment run across all items
2554            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2555            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2556
2557        Raises:
2558            ValueError: If required parameters are missing or invalid
2559            Exception: If experiment setup fails (individual item failures are handled gracefully)
2560
2561        Examples:
2562            Basic experiment with local data:
2563            ```python
2564            def summarize_text(*, item, **kwargs):
2565                return f"Summary: {item['input'][:50]}..."
2566
2567            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2568                return {
2569                    "name": "output_length",
2570                    "value": len(output),
2571                    "comment": f"Output contains {len(output)} characters"
2572                }
2573
2574            result = langfuse.run_experiment(
2575                name="Text Summarization Test",
2576                description="Evaluate summarization quality and length",
2577                data=[
2578                    {"input": "Long article text...", "expected_output": "Expected summary"},
2579                    {"input": "Another article...", "expected_output": "Another summary"}
2580                ],
2581                task=summarize_text,
2582                evaluators=[length_evaluator]
2583            )
2584
2585            print(f"Processed {len(result.item_results)} items")
2586            for item_result in result.item_results:
2587                print(f"Input: {item_result.item['input']}")
2588                print(f"Output: {item_result.output}")
2589                print(f"Evaluations: {item_result.evaluations}")
2590            ```
2591
2592            Advanced experiment with async task and multiple evaluators:
2593            ```python
2594            async def llm_task(*, item, **kwargs):
2595                # Simulate async LLM call
2596                response = await openai_client.chat.completions.create(
2597                    model="gpt-4",
2598                    messages=[{"role": "user", "content": item["input"]}]
2599                )
2600                return response.choices[0].message.content
2601
2602            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2603                if expected_output and expected_output.lower() in output.lower():
2604                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2605                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2606
2607            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2608                # Simulate toxicity check
2609                toxicity_score = check_toxicity(output)  # Your toxicity checker
2610                return {
2611                    "name": "toxicity",
2612                    "value": toxicity_score,
2613                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2614                }
2615
2616            def average_accuracy(*, item_results, **kwargs):
2617                accuracies = [
2618                    eval.value for result in item_results
2619                    for eval in result.evaluations
2620                    if eval.name == "accuracy"
2621                ]
2622                return {
2623                    "name": "average_accuracy",
2624                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2625                    "comment": f"Average accuracy across {len(accuracies)} items"
2626                }
2627
2628            result = langfuse.run_experiment(
2629                name="LLM Safety and Accuracy Test",
2630                description="Evaluate model accuracy and safety across diverse prompts",
2631                data=test_dataset,  # Your dataset items
2632                task=llm_task,
2633                evaluators=[accuracy_evaluator, toxicity_evaluator],
2634                run_evaluators=[average_accuracy],
2635                max_concurrency=5,  # Limit concurrent API calls
2636                metadata={"model": "gpt-4", "temperature": 0.7}
2637            )
2638            ```
2639
2640            Using with Langfuse datasets:
2641            ```python
2642            # Get dataset from Langfuse
2643            dataset = langfuse.get_dataset("my-eval-dataset")
2644
2645            result = dataset.run_experiment(
2646                name="Production Model Evaluation",
2647                description="Monthly evaluation of production model performance",
2648                task=my_production_task,
2649                evaluators=[accuracy_evaluator, latency_evaluator]
2650            )
2651
2652            # Results automatically linked to dataset in Langfuse UI
2653            print(f"View results: {result['dataset_run_url']}")
2654            ```
2655
2656        Note:
2657            - Task and evaluator functions can be either synchronous or asynchronous
2658            - Individual item failures are logged but don't stop the experiment
2659            - All executions are automatically traced and visible in Langfuse UI
2660            - When using Langfuse datasets, results are automatically linked for easy comparison
2661            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2662            - Async execution is handled automatically with smart event loop detection
2663        """
2664        return cast(
2665            ExperimentResult,
2666            run_async_safely(
2667                self._run_experiment_async(
2668                    name=name,
2669                    run_name=self._create_experiment_run_name(
2670                        name=name, run_name=run_name
2671                    ),
2672                    description=description,
2673                    data=data,
2674                    task=task,
2675                    evaluators=evaluators or [],
2676                    composite_evaluator=composite_evaluator,
2677                    run_evaluators=run_evaluators or [],
2678                    max_concurrency=max_concurrency,
2679                    metadata=metadata,
2680                    dataset_version=_dataset_version,
2681                ),
2682            ),
2683        )

Run an experiment on a dataset with automatic tracing and evaluation.

This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.

The experiment system provides:

  • Automatic tracing of all task executions
  • Concurrent processing with configurable limits
  • Comprehensive error handling that isolates failures
  • Integration with Langfuse datasets for experiment tracking
  • Flexible evaluation framework supporting both sync and async evaluators
Arguments:
  • name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
  • run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the data contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
  • description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
  • data: Array of data items to process. Can be either:
    • List of dict-like items with 'input', 'expected_output', 'metadata' keys
    • List of Langfuse DatasetItem objects from dataset.items
  • task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
  • evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
  • composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
  • run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
  • max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
  • metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If data are Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:

ExperimentResult containing:

  • run_name: The experiment run name. This equals the dataset run name if the experiment was run on a Langfuse dataset.
  • item_results: List of results for each processed item with outputs and evaluations
  • run_evaluations: List of aggregate evaluation results for the entire run
  • experiment_id: Stable identifier for the experiment run across all items
  • dataset_run_id: ID of the dataset run (if using Langfuse datasets)
  • dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
  • ValueError: If required parameters are missing or invalid
  • Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:

Basic experiment with local data:

def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")

Advanced experiment with async task and multiple evaluators:

async def llm_task(*, item, **kwargs):
    # Simulate async LLM call
    response = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": item["input"]}]
    )
    return response.choices[0].message.content

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output and expected_output.lower() in output.lower():
        return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
    return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
    # Simulate toxicity check
    toxicity_score = check_toxicity(output)  # Your toxicity checker
    return {
        "name": "toxicity",
        "value": toxicity_score,
        "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
    }

def average_accuracy(*, item_results, **kwargs):
    accuracies = [
        eval.value for result in item_results
        for eval in result.evaluations
        if eval.name == "accuracy"
    ]
    return {
        "name": "average_accuracy",
        "value": sum(accuracies) / len(accuracies) if accuracies else 0,
        "comment": f"Average accuracy across {len(accuracies)} items"
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    description="Evaluate model accuracy and safety across diverse prompts",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    run_evaluators=[average_accuracy],
    max_concurrency=5,  # Limit concurrent API calls
    metadata={"model": "gpt-4", "temperature": 0.7}
)

Using with Langfuse datasets:

# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result.dataset_run_url}")
Note:
  • Task and evaluator functions can be either synchronous or asynchronous
  • Individual item failures are logged but don't stop the experiment
  • All executions are automatically traced and visible in Langfuse UI
  • When using Langfuse datasets, results are automatically linked for easy comparison
  • This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
  • Async execution is handled automatically with smart event loop detection
def run_batched_evaluation( self, *, scope: Literal['traces', 'observations'], mapper: MapperFunction, filter: Optional[str] = None, fetch_batch_size: int = 50, fetch_trace_fields: Optional[str] = None, max_items: Optional[int] = None, max_retries: int = 3, evaluators: List[langfuse.experiment.EvaluatorFunction], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, max_concurrency: int = 5, metadata: Optional[Dict[str, Any]] = None, _add_observation_scores_to_trace: bool = False, _additional_trace_tags: Optional[List[str]] = None, resume_from: Optional[BatchEvaluationResumeToken] = None, verbose: bool = False) -> BatchEvaluationResult:
3045    def run_batched_evaluation(
3046        self,
3047        *,
3048        scope: Literal["traces", "observations"],
3049        mapper: MapperFunction,
3050        filter: Optional[str] = None,
3051        fetch_batch_size: int = 50,
3052        fetch_trace_fields: Optional[str] = None,
3053        max_items: Optional[int] = None,
3054        max_retries: int = 3,
3055        evaluators: List[EvaluatorFunction],
3056        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
3057        max_concurrency: int = 5,
3058        metadata: Optional[Dict[str, Any]] = None,
3059        _add_observation_scores_to_trace: bool = False,
3060        _additional_trace_tags: Optional[List[str]] = None,
3061        resume_from: Optional[BatchEvaluationResumeToken] = None,
3062        verbose: bool = False,
3063    ) -> BatchEvaluationResult:
3064        """Fetch traces or observations and run evaluations on each item.
3065
3066        This method provides a powerful way to evaluate existing data in Langfuse at scale.
3067        It fetches items based on filters, transforms them using a mapper function, runs
3068        evaluators on each item, and creates scores that are linked back to the original
3069        entities. This is ideal for:
3070
3071        - Running evaluations on production traces after deployment
3072        - Backtesting new evaluation metrics on historical data
3073        - Batch scoring of observations for quality monitoring
3074        - Periodic evaluation runs on recent data
3075
3076        The method uses a streaming/pipeline approach to process items in batches, making
3077        it memory-efficient for large datasets. It includes comprehensive error handling,
3078        retry logic, and resume capability for long-running evaluations.
3079
3080        Args:
3081            scope: The type of items to evaluate. Must be one of:
3082                - "traces": Evaluate complete traces with all their observations
3083                - "observations": Evaluate individual observations (spans, generations, events)
3084            mapper: Function that transforms API response objects into evaluator inputs.
3085                Receives a trace/observation object and returns an EvaluatorInputs
3086                instance with input, output, expected_output, and metadata fields.
3087                Can be sync or async.
3088            evaluators: List of evaluation functions to run on each item. Each evaluator
3089                receives the mapped inputs and returns Evaluation object(s). Evaluator
3090                failures are logged but don't stop the batch evaluation.
3091            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
3092                - '{"tags": ["production"]}'
3093                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
3094                Default: None (fetches all items).
3095            fetch_batch_size: Number of items to fetch per API call and hold in memory.
3096                Larger values may be faster but use more memory. Default: 50.
3097            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
3098            max_items: Maximum total number of items to process. If None, processes all
3099                items matching the filter. Useful for testing or limiting evaluation runs.
3100                Default: None (process all).
3101            max_concurrency: Maximum number of items to evaluate concurrently. Controls
3102                parallelism and resource usage. Default: 5.
3103            composite_evaluator: Optional function that creates a composite score from
3104                item-level evaluations. Receives the original item and its evaluations,
3105                returns a single Evaluation. Useful for weighted averages or combined metrics.
3106                Default: None.
3107            metadata: Optional metadata dict to add to all created scores. Useful for
3108                tracking evaluation runs, versions, or other context. Default: None.
3109            max_retries: Maximum number of retry attempts for failed batch fetches.
3110                Uses exponential backoff (1s, 2s, 4s). Default: 3.
3111            verbose: If True, logs progress information to console. Useful for monitoring
3112                long-running evaluations. Default: False.
3113            resume_from: Optional resume token from a previous incomplete run. Allows
3114                continuing evaluation after interruption or failure. Default: None.
3115
3116
3117        Returns:
3118            BatchEvaluationResult containing:
3119                - total_items_fetched: Number of items fetched from API
3120                - total_items_processed: Number of items successfully evaluated
3121                - total_items_failed: Number of items that failed evaluation
3122                - total_scores_created: Scores created by item-level evaluators
3123                - total_composite_scores_created: Scores created by composite evaluator
3124                - total_evaluations_failed: Individual evaluator failures
3125                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3126                - resume_token: Token for resuming if incomplete (None if completed)
3127                - completed: True if all items processed
3128                - duration_seconds: Total execution time
3129                - failed_item_ids: IDs of items that failed
3130                - error_summary: Error types and counts
3131                - has_more_items: True if max_items reached but more exist
3132
3133        Raises:
3134            ValueError: If invalid scope is provided.
3135
3136        Examples:
3137            Basic trace evaluation:
3138            ```python
3139            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3140
3141            client = Langfuse()
3142
3143            # Define mapper to extract fields from traces
3144            def trace_mapper(trace):
3145                return EvaluatorInputs(
3146                    input=trace.input,
3147                    output=trace.output,
3148                    expected_output=None,
3149                    metadata={"trace_id": trace.id}
3150                )
3151
3152            # Define evaluator
3153            def length_evaluator(*, input, output, expected_output, metadata):
3154                return Evaluation(
3155                    name="output_length",
3156                    value=len(output) if output else 0
3157                )
3158
3159            # Run batch evaluation
3160            result = client.run_batched_evaluation(
3161                scope="traces",
3162                mapper=trace_mapper,
3163                evaluators=[length_evaluator],
3164                filter='{"tags": ["production"]}',
3165                max_items=1000,
3166                verbose=True
3167            )
3168
3169            print(f"Processed {result.total_items_processed} traces")
3170            print(f"Created {result.total_scores_created} scores")
3171            ```
3172
3173            Evaluation with composite scorer:
3174            ```python
3175            def accuracy_evaluator(*, input, output, expected_output, metadata):
3176                # ... evaluation logic
3177                return Evaluation(name="accuracy", value=0.85)
3178
3179            def relevance_evaluator(*, input, output, expected_output, metadata):
3180                # ... evaluation logic
3181                return Evaluation(name="relevance", value=0.92)
3182
3183            def composite_evaluator(*, item, evaluations):
3184                # Weighted average of evaluations
3185                weights = {"accuracy": 0.6, "relevance": 0.4}
3186                total = sum(
3187                    e.value * weights.get(e.name, 0)
3188                    for e in evaluations
3189                    if isinstance(e.value, (int, float))
3190                )
3191                return Evaluation(
3192                    name="composite_score",
3193                    value=total,
3194                    comment=f"Weighted average of {len(evaluations)} metrics"
3195                )
3196
3197            result = client.run_batched_evaluation(
3198                scope="traces",
3199                mapper=trace_mapper,
3200                evaluators=[accuracy_evaluator, relevance_evaluator],
3201                composite_evaluator=composite_evaluator,
3202                filter='{"user_id": "important_user"}',
3203                verbose=True
3204            )
3205            ```
3206
3207            Handling incomplete runs with resume:
3208            ```python
3209            # Initial run that may fail or timeout
3210            result = client.run_batched_evaluation(
3211                scope="observations",
3212                mapper=obs_mapper,
3213                evaluators=[my_evaluator],
3214                max_items=10000,
3215                verbose=True
3216            )
3217
3218            # Check if incomplete
3219            if not result.completed and result.resume_token:
3220                print(f"Processed {result.resume_token.items_processed} items before interruption")
3221
3222                # Resume from where it left off
3223                result = client.run_batched_evaluation(
3224                    scope="observations",
3225                    mapper=obs_mapper,
3226                    evaluators=[my_evaluator],
3227                    resume_from=result.resume_token,
3228                    verbose=True
3229                )
3230
3231            print(f"Total items processed: {result.total_items_processed}")
3232            ```
3233
3234            Monitoring evaluator performance:
3235            ```python
3236            result = client.run_batched_evaluation(...)
3237
3238            for stats in result.evaluator_stats:
3239                success_rate = stats.successful_runs / stats.total_runs
3240                print(f"{stats.name}:")
3241                print(f"  Success rate: {success_rate:.1%}")
3242                print(f"  Scores created: {stats.total_scores_created}")
3243
3244                if stats.failed_runs > 0:
3245                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3246            ```
3247
3248        Note:
3249            - Evaluator failures are logged but don't stop the batch evaluation
3250            - Individual item failures are tracked but don't stop processing
3251            - Fetch failures are retried with exponential backoff
3252            - All scores are automatically flushed to Langfuse at the end
3253            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3254        """
3255        runner = BatchEvaluationRunner(self)
3256
3257        return cast(
3258            BatchEvaluationResult,
3259            run_async_safely(
3260                runner.run_async(
3261                    scope=scope,
3262                    mapper=mapper,
3263                    evaluators=evaluators,
3264                    filter=filter,
3265                    fetch_batch_size=fetch_batch_size,
3266                    fetch_trace_fields=fetch_trace_fields,
3267                    max_items=max_items,
3268                    max_concurrency=max_concurrency,
3269                    composite_evaluator=composite_evaluator,
3270                    metadata=metadata,
3271                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
3272                    _additional_trace_tags=_additional_trace_tags,
3273                    max_retries=max_retries,
3274                    verbose=verbose,
3275                    resume_from=resume_from,
3276                )
3277            ),
3278        )

Fetch traces or observations and run evaluations on each item.

This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:

  • Running evaluations on production traces after deployment
  • Backtesting new evaluation metrics on historical data
  • Batch scoring of observations for quality monitoring
  • Periodic evaluation runs on recent data

The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.

Arguments:
  • scope: The type of items to evaluate. Must be one of:
    • "traces": Evaluate complete traces with all their observations
    • "observations": Evaluate individual observations (spans, generations, events)
  • mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
  • evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
  • filter: Optional JSON filter string for querying items (same format as Langfuse API). Default: None (fetches all items). Examples:
    • '{"tags": ["production"]}'
    • '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
  • fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
  • fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
  • max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
  • max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 5.
  • composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
  • metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
  • max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
  • verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
  • resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:

BatchEvaluationResult containing:

  • total_items_fetched: Number of items fetched from API
  • total_items_processed: Number of items successfully evaluated
  • total_items_failed: Number of items that failed evaluation
  • total_scores_created: Scores created by item-level evaluators
  • total_composite_scores_created: Scores created by composite evaluator
  • total_evaluations_failed: Individual evaluator failures
  • evaluator_stats: Per-evaluator statistics (success rate, scores created)
  • resume_token: Token for resuming if incomplete (None if completed)
  • completed: True if all items processed
  • duration_seconds: Total execution time
  • failed_item_ids: IDs of items that failed
  • error_summary: Error types and counts
  • has_more_items: True if max_items reached but more exist

Raises:
  • ValueError: If invalid scope is provided.
Examples:

Basic trace evaluation:

from langfuse import Langfuse, EvaluatorInputs, Evaluation

client = Langfuse()

# Define mapper to extract fields from traces
def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,
        metadata={"trace_id": trace.id}
    )

# Define evaluator
def length_evaluator(*, input, output, expected_output, metadata):
    return Evaluation(
        name="output_length",
        value=len(output) if output else 0
    )

# Run batch evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    filter='{"tags": ["production"]}',
    max_items=1000,
    verbose=True
)

print(f"Processed {result.total_items_processed} traces")
print(f"Created {result.total_scores_created} scores")

Evaluation with composite scorer:

def accuracy_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="accuracy", value=0.85)

def relevance_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="relevance", value=0.92)

def composite_evaluator(*, item, evaluations):
    # Weighted average of evaluations
    weights = {"accuracy": 0.6, "relevance": 0.4}
    total = sum(
        e.value * weights.get(e.name, 0)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(
        name="composite_score",
        value=total,
        comment=f"Weighted average of {len(evaluations)} metrics"
    )

result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[accuracy_evaluator, relevance_evaluator],
    composite_evaluator=composite_evaluator,
    filter='{"user_id": "important_user"}',
    verbose=True
)

Handling incomplete runs with resume:

# Initial run that may fail or timeout
result = client.run_batched_evaluation(
    scope="observations",
    mapper=obs_mapper,
    evaluators=[my_evaluator],
    max_items=10000,
    verbose=True
)

# Check if incomplete
if not result.completed and result.resume_token:
    print(f"Processed {result.resume_token.items_processed} items before interruption")

    # Resume from where it left off
    result = client.run_batched_evaluation(
        scope="observations",
        mapper=obs_mapper,
        evaluators=[my_evaluator],
        resume_from=result.resume_token,
        verbose=True
    )

print(f"Total items processed: {result.total_items_processed}")

Monitoring evaluator performance:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs
    print(f"{stats.name}:")
    print(f"  Success rate: {success_rate:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️  Failed {stats.failed_runs} times")
Note:
  • Evaluator failures are logged but don't stop the batch evaluation
  • Individual item failures are tracked but don't stop processing
  • Fetch failures are retried with exponential backoff
  • All scores are automatically flushed to Langfuse at the end
  • The resume mechanism uses timestamp-based filtering to avoid duplicates
def auth_check(self) -> bool:
3280    def auth_check(self) -> bool:
3281        """Check if the provided credentials (public and secret key) are valid.
3282
3283        Raises:
3284            Exception: If no projects were found for the provided credentials.
3285
3286        Note:
3287            This method is blocking. It is discouraged to use it in production code.
3288        """
3289        try:
3290            projects = self.api.projects.get()
3291            langfuse_logger.debug(
3292                f"Auth check successful, found {len(projects.data)} projects"
3293            )
3294            if len(projects.data) == 0:
3295                raise Exception(
3296                    "Auth check failed, no project found for the keys provided."
3297                )
3298            return True
3299
3300        except AttributeError as e:
3301            langfuse_logger.warning(
3302                f"Auth check failed: Client not properly initialized. Error: {e}"
3303            )
3304            return False
3305
3306        except Error as e:
3307            handle_fern_exception(e)
3308            raise e

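Check if the provided credentials (public and secret key) are valid. Returns True if at least one project is found for the credentials, and False (after logging a warning) if the client is not properly initialized; errors raised by the API client are re-raised.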

Raises:
  • Exception: If no projects were found for the provided credentials.
Note:

This method is blocking; using it in production code is discouraged.

def create_dataset( self, *, name: str, description: Optional[str] = None, metadata: Optional[Any] = None, input_schema: Optional[Any] = None, expected_output_schema: Optional[Any] = None) -> langfuse.api.Dataset:
3310    def create_dataset(
3311        self,
3312        *,
3313        name: str,
3314        description: Optional[str] = None,
3315        metadata: Optional[Any] = None,
3316        input_schema: Optional[Any] = None,
3317        expected_output_schema: Optional[Any] = None,
3318    ) -> Dataset:
3319        """Create a dataset with the given name on Langfuse.
3320
3321        Args:
3322            name: Name of the dataset to create.
3323            description: Description of the dataset. Defaults to None.
3324            metadata: Additional metadata. Defaults to None.
3325            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3326            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3327
3328        Returns:
3329            Dataset: The created dataset as returned by the Langfuse API.
3330        """
3331        try:
3332            langfuse_logger.debug(f"Creating datasets {name}")
3333
3334            result = self.api.datasets.create(
3335                name=name,
3336                description=description,
3337                metadata=metadata,
3338                input_schema=input_schema,
3339                expected_output_schema=expected_output_schema,
3340            )
3341
3342            return cast(Dataset, result)
3343
3344        except Error as e:
3345            handle_fern_exception(e)
3346            raise e

Create a dataset with the given name on Langfuse.

Arguments:
  • name: Name of the dataset to create.
  • description: Description of the dataset. Defaults to None.
  • metadata: Additional metadata. Defaults to None.
  • input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
  • expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:

Dataset: The created dataset as returned by the Langfuse API.

def create_dataset_item( self, *, dataset_name: str, input: Optional[Any] = None, expected_output: Optional[Any] = None, metadata: Optional[Any] = None, source_trace_id: Optional[str] = None, source_observation_id: Optional[str] = None, status: Optional[langfuse.api.DatasetStatus] = None, id: Optional[str] = None) -> langfuse.api.DatasetItem:
3348    def create_dataset_item(
3349        self,
3350        *,
3351        dataset_name: str,
3352        input: Optional[Any] = None,
3353        expected_output: Optional[Any] = None,
3354        metadata: Optional[Any] = None,
3355        source_trace_id: Optional[str] = None,
3356        source_observation_id: Optional[str] = None,
3357        status: Optional[DatasetStatus] = None,
3358        id: Optional[str] = None,
3359    ) -> DatasetItem:
3360        """Create a dataset item.
3361
3362        Upserts if an item with id already exists.
3363
3364        Args:
3365            dataset_name: Name of the dataset in which the dataset item should be created.
3366            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3367            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3368            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3369            source_trace_id: Id of the source trace. Defaults to None.
3370            source_observation_id: Id of the source observation. Defaults to None.
3371            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3372            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3373
3374        Returns:
3375            DatasetItem: The created dataset item as returned by the Langfuse API.
3376
3377        Example:
3378            ```python
3379            from langfuse import Langfuse
3380
3381            langfuse = Langfuse()
3382
3383            # Uploading items to the Langfuse dataset named "capital_cities"
3384            langfuse.create_dataset_item(
3385                dataset_name="capital_cities",
3386                input={"input": {"country": "Italy"}},
3387                expected_output={"expected_output": "Rome"},
3388                metadata={"foo": "bar"}
3389            )
3390            ```
3391        """
3392        try:
3393            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
3394
3395            # Media uploads must reference the (dataset, item) they belong to, and
3396            # the item need not exist yet — so settle on the item id up front and
3397            # reuse it for the create call below.
3398            item_id = id if id is not None else str(uuid.uuid4())
3399
3400            # Single pass per field: swap each LangfuseMedia for its reference
3401            # string (derived from content, not the upload) and collect the media
3402            # still to upload, deduped by media id and tagged with its field.
3403            pending_media: Dict[str, Tuple[LangfuseMedia, str]] = {}
3404            input = self._process_dataset_item_media(
3405                data=input,
3406                pending_media=pending_media,
3407                field=DatasetItemMediaReferenceField.INPUT.value,
3408            )
3409            expected_output = self._process_dataset_item_media(
3410                data=expected_output,
3411                pending_media=pending_media,
3412                field=DatasetItemMediaReferenceField.EXPECTED_OUTPUT.value,
3413            )
3414            metadata = self._process_dataset_item_media(
3415                data=metadata,
3416                pending_media=pending_media,
3417                field=DatasetItemMediaReferenceField.METADATA.value,
3418            )
3419
3420            # The upload needs the dataset id, but the create API only takes the
3421            # name. Resolve it once, and only when there is actually media to
3422            # upload — a plain item pays no extra datasets.get round-trip.
3423            if pending_media:
3424                assert self._resources is not None
3425                dataset_id = self.api.datasets.get(self._url_encode(dataset_name)).id
3426                for media, field in pending_media.values():
3427                    self._resources._media_manager._upload_media_sync(
3428                        media=media,
3429                        dataset_id=dataset_id,
3430                        dataset_item_id=item_id,
3431                        field=field,
3432                    )
3433
3434            result = self.api.dataset_items.create(
3435                dataset_name=dataset_name,
3436                input=input,
3437                expected_output=expected_output,
3438                metadata=metadata,
3439                source_trace_id=source_trace_id,
3440                source_observation_id=source_observation_id,
3441                status=status,
3442                id=item_id,
3443            )
3444
3445            return cast(DatasetItem, result)
3446        except Error as e:
3447            handle_fern_exception(e)
3448            raise e

Create a dataset item.

Upserts the item if one with the given id already exists.

Arguments:
  • dataset_name: Name of the dataset in which the dataset item should be created.
  • input: Input data. Defaults to None. Can contain any dict, list or scalar.
  • expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
  • metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
  • source_trace_id: Id of the source trace. Defaults to None.
  • source_observation_id: Id of the source observation. Defaults to None.
  • status: Status of the dataset item. Defaults to ACTIVE for newly created items.
  • id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:

DatasetItem: The created dataset item as returned by the Langfuse API.

Example:
from langfuse import Langfuse

langfuse = Langfuse()

# Uploading items to the Langfuse dataset named "capital_cities"
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"input": {"country": "Italy"}},
    expected_output={"expected_output": "Rome"},
    metadata={"foo": "bar"}
)
def resolve_media_references( self, *, obj: Any, resolve_with: Literal['base64_data_uri'], max_depth: int = 10, content_fetch_timeout_seconds: int = 5) -> Any:
3574    def resolve_media_references(
3575        self,
3576        *,
3577        obj: Any,
3578        resolve_with: Literal["base64_data_uri"],
3579        max_depth: int = 10,
3580        content_fetch_timeout_seconds: int = 5,
3581    ) -> Any:
3582        """Replace media reference strings in an object with base64 data URIs.
3583
3584        This method recursively traverses an object (up to max_depth) looking for media reference strings
3585        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3586        the provided Langfuse client and replaces the reference string with a base64 data URI.
3587
3588        If fetching media content fails for a reference string, a warning is logged and the reference
3589        string is left unchanged.
3590
3591        Args:
3592            obj: The object to process. Can be a primitive value, array, or nested object.
3593                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3594            resolve_with: The representation of the media content to replace the media reference string with.
3595                Currently only "base64_data_uri" is supported.
3596            max_depth: int: The maximum depth to traverse the object. Default is 10.
3597            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3598
3599        Returns:
3600            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3601            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3602
3603        Example:
3604            obj = {
3605                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3606                "nested": {
3607                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3608                }
3609            }
3610
3611            result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
3612
3613            # Result:
3614            # {
3615            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3616            #     "nested": {
3617            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3618            #     }
3619            # }
3620        """
3621        return LangfuseMedia.resolve_media_references(
3622            langfuse_client=self,
3623            obj=obj,
3624            resolve_with=resolve_with,
3625            max_depth=max_depth,
3626            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3627        )

Replace media reference strings in an object with base64 data URIs.

This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.

If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.

Arguments:
  • obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
  • resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
  • max_depth: int: The maximum depth to traverse the object. Default is 10.
  • content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:

A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

Example:

obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }

result = langfuse.resolve_media_references(obj=obj, resolve_with="base64_data_uri")

Result:

{ "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...", "nested": { "pdf": "data:application/pdf;base64,JVBERi0xLjcK..." } }

def get_prompt( self, name: str, *, version: Optional[int] = None, label: Optional[str] = None, type: Literal['chat', 'text'] = 'text', cache_ttl_seconds: Optional[int] = None, fallback: Union[List[langfuse.model.ChatMessageDict], NoneType, str] = None, max_retries: Optional[int] = None, fetch_timeout_seconds: Optional[int] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3657    def get_prompt(
3658        self,
3659        name: str,
3660        *,
3661        version: Optional[int] = None,
3662        label: Optional[str] = None,
3663        type: Literal["chat", "text"] = "text",
3664        cache_ttl_seconds: Optional[int] = None,
3665        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3666        max_retries: Optional[int] = None,
3667        fetch_timeout_seconds: Optional[int] = None,
3668    ) -> PromptClient:
3669        """Get a prompt.
3670
3671        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3672        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3673        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3674        return the expired prompt as a fallback.
3675
3676        Args:
3677            name (str): The name of the prompt to retrieve.
3678
3679        Keyword Args:
3680            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3681            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3682            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3683            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3684            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3685            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3686            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3687            fetch_timeout_seconds: Optional[int]: The timeout in milliseconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
3688
3689        Returns:
3690            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
3691            - TextPromptClient, if type argument is 'text'.
3692            - ChatPromptClient, if type argument is 'chat'.
3693
3694        Raises:
3695            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3696            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3697        """
3698        if self._resources is None:
3699            raise Error(
3700                "SDK is not correctly initialized. Check the init logs for more details."
3701            )
3702        if version is not None and label is not None:
3703            raise ValueError("Cannot specify both version and label at the same time.")
3704
3705        if not name:
3706            raise ValueError("Prompt name cannot be empty.")
3707
3708        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3709        bounded_max_retries = self._get_bounded_max_retries(
3710            max_retries, default_max_retries=2, max_retries_upper_bound=4
3711        )
3712
3713        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3714        cached_prompt = self._resources.prompt_cache.get(cache_key)
3715
3716        if cached_prompt is None or cache_ttl_seconds == 0:
3717            langfuse_logger.debug(
3718                f"Prompt '{cache_key}' not found in cache or caching disabled."
3719            )
3720            try:
3721                return self._fetch_prompt_and_update_cache(
3722                    name,
3723                    version=version,
3724                    label=label,
3725                    ttl_seconds=cache_ttl_seconds,
3726                    max_retries=bounded_max_retries,
3727                    fetch_timeout_seconds=fetch_timeout_seconds,
3728                )
3729            except Exception as e:
3730                if fallback:
3731                    langfuse_logger.warning(
3732                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3733                    )
3734
3735                    fallback_client_args: Dict[str, Any] = {
3736                        "name": name,
3737                        "prompt": fallback,
3738                        "type": type,
3739                        "version": version or 0,
3740                        "config": {},
3741                        "labels": [label] if label else [],
3742                        "tags": [],
3743                    }
3744
3745                    if type == "text":
3746                        return TextPromptClient(
3747                            prompt=Prompt_Text(**fallback_client_args),
3748                            is_fallback=True,
3749                        )
3750
3751                    if type == "chat":
3752                        return ChatPromptClient(
3753                            prompt=Prompt_Chat(**fallback_client_args),
3754                            is_fallback=True,
3755                        )
3756
3757                raise e
3758
3759        if cached_prompt.is_expired():
3760            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3761            try:
3762                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3763                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3764
3765                def refresh_task() -> None:
3766                    self._fetch_prompt_and_update_cache(
3767                        name,
3768                        version=version,
3769                        label=label,
3770                        ttl_seconds=cache_ttl_seconds,
3771                        max_retries=bounded_max_retries,
3772                        fetch_timeout_seconds=fetch_timeout_seconds,
3773                    )
3774
3775                self._resources.prompt_cache.add_refresh_prompt_task_if_current(
3776                    cache_key,
3777                    cached_prompt,
3778                    refresh_task,
3779                )
3780                langfuse_logger.debug(
3781                    f"Returning stale prompt '{cache_key}' from cache."
3782                )
3783                # return stale prompt
3784                return cached_prompt.value
3785
3786            except Exception as e:
3787                langfuse_logger.warning(
3788                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3789                )
3790                # creation of refresh prompt task failed, return stale prompt
3791                return cached_prompt.value
3792
3793        return cached_prompt.value

Get a prompt.

This method first looks for the requested prompt in the local cache. If the prompt is not cached, or caching is disabled with cache_ttl_seconds=0, it is fetched from the server and the cache is updated; if that fetch fails and a fallback was provided, the fallback prompt is returned instead. If the cached prompt has expired, the expired prompt is returned immediately while a refresh is scheduled in the background.

Arguments:
  • name (str): The name of the prompt to retrieve.
Keyword Args:
  • version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
  • type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
  • fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
  • max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
  • fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
Returns:

The prompt object retrieved from the cache or directly fetched if not cached or expired of type

  • TextPromptClient, if type argument is 'text'.
  • ChatPromptClient, if type argument is 'chat'.
Raises:
  • Exception: Propagates any exception raised while fetching a prompt that is not in the cache, unless a fallback is provided, in which case a warning is logged and the fallback prompt is returned. When an expired prompt is in the cache, it is returned and no exception is raised.
def create_prompt( self, *, name: str, prompt: Union[str, List[Union[langfuse.model.ChatMessageDict, langfuse.model.ChatMessageWithPlaceholdersDict_Message, langfuse.model.ChatMessageWithPlaceholdersDict_Placeholder]]], labels: List[str] = [], tags: Optional[List[str]] = None, type: Optional[Literal['chat', 'text']] = 'text', config: Optional[Any] = None, commit_message: Optional[str] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3895    def create_prompt(
3896        self,
3897        *,
3898        name: str,
3899        prompt: Union[
3900            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3901        ],
3902        labels: List[str] = [],
3903        tags: Optional[List[str]] = None,
3904        type: Optional[Literal["chat", "text"]] = "text",
3905        config: Optional[Any] = None,
3906        commit_message: Optional[str] = None,
3907    ) -> PromptClient:
3908        """Create a new prompt in Langfuse.
3909
3910        Keyword Args:
3911            name : The name of the prompt to be created.
3912            prompt : The content of the prompt to be created.
3913            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3914            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3915            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3916            config: Additional structured data to be saved with the prompt. Defaults to None.
3917            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3918            commit_message: Optional string describing the change.
3919
3920        Returns:
3921            TextPromptClient: The prompt if type argument is 'text'.
3922            ChatPromptClient: The prompt if type argument is 'chat'.
3923        """
3924        try:
3925            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3926
3927            if type == "chat":
3928                if not isinstance(prompt, list):
3929                    raise ValueError(
3930                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3931                    )
3932                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
3933                    CreateChatPromptRequest(
3934                        name=name,
3935                        prompt=cast(Any, prompt),
3936                        labels=labels,
3937                        tags=tags,
3938                        config=config or {},
3939                        commit_message=commit_message,
3940                        type=CreateChatPromptType.CHAT,
3941                    )
3942                )
3943                server_prompt = self.api.prompts.create(request=request)
3944
3945                if self._resources is not None:
3946                    self._resources.prompt_cache.invalidate(name)
3947
3948                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3949
3950            if not isinstance(prompt, str):
3951                raise ValueError("For 'text' type, 'prompt' must be a string.")
3952
3953            request = CreateTextPromptRequest(
3954                name=name,
3955                prompt=prompt,
3956                labels=labels,
3957                tags=tags,
3958                config=config or {},
3959                commit_message=commit_message,
3960            )
3961
3962            server_prompt = self.api.prompts.create(request=request)
3963
3964            if self._resources is not None:
3965                self._resources.prompt_cache.invalidate(name)
3966
3967            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3968
3969        except Error as e:
3970            handle_fern_exception(e)
3971            raise e

Create a new prompt in Langfuse.

Keyword Args:
  • name : The name of the prompt to be created.
  • prompt : The content of the prompt to be created.
  • is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
  • labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
  • tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
  • config: Additional structured data to be saved with the prompt. Defaults to None.
  • type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
  • commit_message: Optional string describing the change.
Returns:

TextPromptClient: The prompt if type argument is 'text'. ChatPromptClient: The prompt if type argument is 'chat'.

def update_prompt(self, *, name: str, version: int, new_labels: List[str] = []) -> Any:
3973    def update_prompt(
3974        self,
3975        *,
3976        name: str,
3977        version: int,
3978        new_labels: List[str] = [],
3979    ) -> Any:
3980        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name.
3981
3982        Args:
3983            name (str): The name of the prompt to update.
3984            version (int): The version number of the prompt to update.
3985            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3986
3987        Returns:
3988            Prompt: The updated prompt from the Langfuse API.
3989
3990        """
3991        updated_prompt = self.api.prompt_version.update(
3992            name=self._url_encode(name),
3993            version=version,
3994            new_labels=new_labels,
3995        )
3996
3997        if self._resources is not None:
3998            self._resources.prompt_cache.invalidate(name)
3999
4000        return updated_prompt

Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.

Arguments:
  • name (str): The name of the prompt to update.
  • version (int): The version number of the prompt to update.
  • new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:

Prompt: The updated prompt from the Langfuse API.

def clear_prompt_cache(self) -> None:
4015    def clear_prompt_cache(self) -> None:
4016        """Clear the entire prompt cache, removing all cached prompts.
4017
4018        This method is useful when you want to force a complete refresh of all
4019        cached prompts, for example after major updates or when you need to
4020        ensure the latest versions are fetched from the server.
4021        """
4022        if self._resources is not None:
4023            self._resources.prompt_cache.clear()

Clear the entire prompt cache, removing all cached prompts.

This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.

class LangfuseMedia:
 99class LangfuseMedia:
100    """A class for wrapping media objects for upload to Langfuse.
101
102    This class handles the preparation and formatting of media content for Langfuse,
103    supporting both base64 data URIs and raw content bytes.
104
105    Args:
106        obj (Optional[object]): The source object to be wrapped. Can be accessed via the `obj` attribute.
107        base64_data_uri (Optional[str]): A base64-encoded data URI containing the media content
108            and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
109        content_type (Optional[str]): The MIME type of the media content when providing raw bytes.
110        content_bytes (Optional[bytes]): Raw bytes of the media content.
111        file_path (Optional[str]): The path to the file containing the media content. For relative paths,
112            the current working directory is used.
113
114    Raises:
115        ValueError: If neither base64_data_uri or the combination of content_bytes
116            and content_type is provided.
117    """
118
119    obj: object
120
121    _content_bytes: Optional[bytes]
122    _content_type: Optional[MediaContentType]
123    _source: Optional[str]
124    _media_id: Optional[str]
125
126    def __init__(
127        self,
128        *,
129        obj: Optional[object] = None,
130        base64_data_uri: Optional[str] = None,
131        content_type: Optional[MediaContentType] = None,
132        content_bytes: Optional[bytes] = None,
133        file_path: Optional[str] = None,
134    ):
135        """Initialize a LangfuseMedia object.
136
137        Args:
138            obj: The object to wrap.
139
140            base64_data_uri: A base64-encoded data URI containing the media content
141                and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
142            content_type: The MIME type of the media content when providing raw bytes or reading from a file.
143            content_bytes: Raw bytes of the media content.
144            file_path: The path to the file containing the media content. For relative paths,
145                the current working directory is used.
146        """
147        self.obj = obj
148
149        if base64_data_uri is not None:
150            parsed_data = self._parse_base64_data_uri(base64_data_uri)
151            self._content_bytes, self._content_type = parsed_data
152            self._source = "base64_data_uri"
153
154        elif content_bytes is not None and content_type is not None:
155            self._content_type = content_type
156            self._content_bytes = content_bytes
157            self._source = "bytes"
158        elif (
159            file_path is not None
160            and content_type is not None
161            and os.path.exists(file_path)
162        ):
163            self._content_bytes = self._read_file(file_path)
164            self._content_type = content_type if self._content_bytes else None
165            self._source = "file" if self._content_bytes else None
166        else:
167            logger.error(
168                "base64_data_uri, or content_bytes and content_type, or file_path must be provided to LangfuseMedia"
169            )
170
171            self._content_bytes = None
172            self._content_type = None
173            self._source = None
174
175        self._media_id = self._get_media_id()
176
177    def _read_file(self, file_path: str) -> Optional[bytes]:
178        try:
179            with open(file_path, "rb") as file:
180                return file.read()
181        except Exception as e:
182            logger.error(f"Error reading file at path {file_path}", exc_info=e)
183
184            return None
185
186    def _get_media_id(self) -> Optional[str]:
187        content_hash = self._content_sha256_hash
188
189        if content_hash is None:
190            return None
191
192        # Convert hash to base64Url
193        url_safe_content_hash = content_hash.replace("+", "-").replace("/", "_")
194
195        return url_safe_content_hash[:22]
196
197    @property
198    def _content_length(self) -> Optional[int]:
199        return len(self._content_bytes) if self._content_bytes else None
200
201    @property
202    def _content_sha256_hash(self) -> Optional[str]:
203        if self._content_bytes is None:
204            return None
205
206        sha256_hash_bytes = hashlib.sha256(self._content_bytes).digest()
207
208        return base64.b64encode(sha256_hash_bytes).decode("utf-8")
209
210    @property
211    def _reference_string(self) -> Optional[str]:
212        if self._content_type is None or self._source is None or self._media_id is None:
213            return None
214
215        return f"@@@langfuseMedia:type={self._content_type}|id={self._media_id}|source={self._source}@@@"
216
217    @staticmethod
218    def parse_reference_string(reference_string: str) -> ParsedMediaReference:
219        """Parse a media reference string into a ParsedMediaReference.
220
221        Example reference string:
222            "@@@langfuseMedia:type=image/jpeg|id=some-uuid|source=base64_data_uri@@@"
223
224        Args:
225            reference_string: The reference string to parse.
226
227        Returns:
228            A TypedDict with the media_id, source, and content_type.
229
230        Raises:
231            ValueError: If the reference string is empty or not a string.
232            ValueError: If the reference string does not start with "@@@langfuseMedia:type=".
233            ValueError: If the reference string does not end with "@@@".
234            ValueError: If the reference string is missing required fields.
235        """
236        if not reference_string:
237            raise ValueError("Reference string is empty")
238
239        if not isinstance(reference_string, str):
240            raise ValueError("Reference string is not a string")
241
242        if not reference_string.startswith("@@@langfuseMedia:type="):
243            raise ValueError(
244                "Reference string does not start with '@@@langfuseMedia:type='"
245            )
246
247        if not reference_string.endswith("@@@"):
248            raise ValueError("Reference string does not end with '@@@'")
249
250        content = reference_string[len("@@@langfuseMedia:") :].rstrip("@@@")
251
252        # Split into key-value pairs
253        pairs = content.split("|")
254        parsed_data = {}
255
256        for pair in pairs:
257            key, value = pair.split("=", 1)
258            parsed_data[key] = value
259
260        # Verify all required fields are present
261        if not all(key in parsed_data for key in ["type", "id", "source"]):
262            raise ValueError("Missing required fields in reference string")
263
264        return ParsedMediaReference(
265            media_id=parsed_data["id"],
266            source=parsed_data["source"],
267            content_type=cast(MediaContentType, parsed_data["type"]),
268        )
269
270    def _parse_base64_data_uri(
271        self, data: str
272    ) -> Tuple[Optional[bytes], Optional[MediaContentType]]:
273        # Example data URI: data:image/jpeg;base64,/9j/4AAQ...
274        try:
275            if not data or not isinstance(data, str):
276                raise ValueError("Data URI is not a string")
277
278            if not data.startswith("data:"):
279                raise ValueError("Data URI does not start with 'data:'")
280
281            header, actual_data = data[5:].split(",", 1)
282            if not header or not actual_data:
283                raise ValueError("Invalid URI")
284
285            # Split header into parts and check for base64
286            header_parts = header.split(";")
287            if "base64" not in header_parts:
288                raise ValueError("Data is not base64 encoded")
289
290            # Content type is the first part
291            content_type = header_parts[0]
292            if not content_type:
293                raise ValueError("Content type is empty")
294
295            return base64.b64decode(actual_data), cast(MediaContentType, content_type)
296
297        except Exception as e:
298            logger.error("Error parsing base64 data URI", exc_info=e)
299
300            return None, None
301
302    @staticmethod
303    def resolve_media_references(
304        *,
305        obj: T,
306        langfuse_client: "Langfuse",
307        resolve_with: Literal["base64_data_uri"],
308        max_depth: int = 10,
309        content_fetch_timeout_seconds: int = 10,
310    ) -> T:
311        """Replace media reference strings in an object with base64 data URIs.
312
313        This method recursively traverses an object (up to max_depth) looking for media reference strings
314        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
315        the provided Langfuse client and replaces the reference string with a base64 data URI.
316
317        If fetching media content fails for a reference string, a warning is logged and the reference
318        string is left unchanged.
319
320        Args:
321            obj: The object to process. Can be a primitive value, array, or nested object.
322                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
323            langfuse_client: Langfuse client instance used to fetch media content.
324            resolve_with: The representation of the media content to replace the media reference string with.
325                Currently only "base64_data_uri" is supported.
326            max_depth: Optional. Default is 10. The maximum depth to traverse the object.
327
328        Returns:
329            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
330            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
331
332        Example:
333            obj = {
334                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
335                "nested": {
336                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
337                }
338            }
339
340            result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
341
342            # Result:
343            # {
344            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
345            #     "nested": {
346            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
347            #     }
348            # }
349        """
350
351        def traverse(obj: Any, depth: int) -> Any:
352            if depth > max_depth:
353                return obj
354
355            # Handle string
356            if isinstance(obj, str):
357                regex = r"@@@langfuseMedia:.+?@@@"
358                reference_string_matches = re.findall(regex, obj)
359                if len(reference_string_matches) == 0:
360                    return obj
361
362                result = obj
363                reference_string_to_media_content = {}
364                httpx_client = (
365                    langfuse_client._resources.httpx_client
366                    if langfuse_client._resources is not None
367                    else None
368                )
369
370                for reference_string in reference_string_matches:
371                    try:
372                        parsed_media_reference = LangfuseMedia.parse_reference_string(
373                            reference_string
374                        )
375                        media_data = langfuse_client.api.media.get(
376                            parsed_media_reference["media_id"]
377                        )
378                        media_content = (
379                            httpx_client.get(
380                                media_data.url,
381                                timeout=content_fetch_timeout_seconds,
382                            )
383                            if httpx_client is not None
384                            else httpx.get(
385                                media_data.url, timeout=content_fetch_timeout_seconds
386                            )
387                        )
388                        media_content.raise_for_status()
389
390                        base64_media_content = base64.b64encode(
391                            media_content.content
392                        ).decode()
393                        base64_data_uri = f"data:{media_data.content_type};base64,{base64_media_content}"
394
395                        reference_string_to_media_content[reference_string] = (
396                            base64_data_uri
397                        )
398                    except Exception as e:
399                        logger.warning(
400                            f"Error fetching media content for reference string {reference_string}: {e}"
401                        )
402                        # Do not replace the reference string if there's an error
403                        continue
404
405                for (
406                    ref_str,
407                    media_content_str,
408                ) in reference_string_to_media_content.items():
409                    result = result.replace(ref_str, media_content_str)
410
411                return result
412
413            # Handle arrays
414            if isinstance(obj, list):
415                return [traverse(item, depth + 1) for item in obj]
416
417            # Handle dictionaries
418            if isinstance(obj, dict):
419                return {key: traverse(value, depth + 1) for key, value in obj.items()}
420
421            # Handle objects:
422            if hasattr(obj, "__dict__"):
423                return {
424                    key: traverse(value, depth + 1)
425                    for key, value in obj.__dict__.items()
426                }
427
428            return obj
429
430        return cast(T, traverse(obj, 0))

A class for wrapping media objects for upload to Langfuse.

This class handles the preparation and formatting of media content for Langfuse, supporting both base64 data URIs and raw content bytes.

Arguments:
  • obj (Optional[object]): The source object to be wrapped. Can be accessed via the obj attribute.
  • base64_data_uri (Optional[str]): A base64-encoded data URI containing the media content and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
  • content_type (Optional[str]): The MIME type of the media content when providing raw bytes.
  • content_bytes (Optional[bytes]): Raw bytes of the media content.
  • file_path (Optional[str]): The path to the file containing the media content. For relative paths, the current working directory is used.
Raises:
  • ValueError: If neither base64_data_uri nor the combination of content_bytes and content_type is provided.
LangfuseMedia( *, obj: Optional[object] = None, base64_data_uri: Optional[str] = None, content_type: Optional[langfuse.api.MediaContentType] = None, content_bytes: Optional[bytes] = None, file_path: Optional[str] = None)
126    def __init__(
127        self,
128        *,
129        obj: Optional[object] = None,
130        base64_data_uri: Optional[str] = None,
131        content_type: Optional[MediaContentType] = None,
132        content_bytes: Optional[bytes] = None,
133        file_path: Optional[str] = None,
134    ):
135        """Initialize a LangfuseMedia object.
136
137        Args:
138            obj: The object to wrap.
139
140            base64_data_uri: A base64-encoded data URI containing the media content
141                and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
142            content_type: The MIME type of the media content when providing raw bytes or reading from a file.
143            content_bytes: Raw bytes of the media content.
144            file_path: The path to the file containing the media content. For relative paths,
145                the current working directory is used.
146        """
147        self.obj = obj
148
149        if base64_data_uri is not None:
150            parsed_data = self._parse_base64_data_uri(base64_data_uri)
151            self._content_bytes, self._content_type = parsed_data
152            self._source = "base64_data_uri"
153
154        elif content_bytes is not None and content_type is not None:
155            self._content_type = content_type
156            self._content_bytes = content_bytes
157            self._source = "bytes"
158        elif (
159            file_path is not None
160            and content_type is not None
161            and os.path.exists(file_path)
162        ):
163            self._content_bytes = self._read_file(file_path)
164            self._content_type = content_type if self._content_bytes else None
165            self._source = "file" if self._content_bytes else None
166        else:
167            logger.error(
168                "base64_data_uri, or content_bytes and content_type, or file_path must be provided to LangfuseMedia"
169            )
170
171            self._content_bytes = None
172            self._content_type = None
173            self._source = None
174
175        self._media_id = self._get_media_id()

Initialize a LangfuseMedia object.

Arguments:
  • obj: The object to wrap.
  • base64_data_uri: A base64-encoded data URI containing the media content and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
  • content_type: The MIME type of the media content when providing raw bytes or reading from a file.
  • content_bytes: Raw bytes of the media content.
  • file_path: The path to the file containing the media content. For relative paths, the current working directory is used.
obj: object
@staticmethod
def parse_reference_string(reference_string: str) -> langfuse.types.ParsedMediaReference:
217    @staticmethod
218    def parse_reference_string(reference_string: str) -> ParsedMediaReference:
219        """Parse a media reference string into a ParsedMediaReference.
220
221        Example reference string:
222            "@@@langfuseMedia:type=image/jpeg|id=some-uuid|source=base64_data_uri@@@"
223
224        Args:
225            reference_string: The reference string to parse.
226
227        Returns:
228            A TypedDict with the media_id, source, and content_type.
229
230        Raises:
231            ValueError: If the reference string is empty or not a string.
232            ValueError: If the reference string does not start with "@@@langfuseMedia:type=".
233            ValueError: If the reference string does not end with "@@@".
234            ValueError: If the reference string is missing required fields.
235        """
236        if not reference_string:
237            raise ValueError("Reference string is empty")
238
239        if not isinstance(reference_string, str):
240            raise ValueError("Reference string is not a string")
241
242        if not reference_string.startswith("@@@langfuseMedia:type="):
243            raise ValueError(
244                "Reference string does not start with '@@@langfuseMedia:type='"
245            )
246
247        if not reference_string.endswith("@@@"):
248            raise ValueError("Reference string does not end with '@@@'")
249
250        content = reference_string[len("@@@langfuseMedia:") :].rstrip("@@@")
251
252        # Split into key-value pairs
253        pairs = content.split("|")
254        parsed_data = {}
255
256        for pair in pairs:
257            key, value = pair.split("=", 1)
258            parsed_data[key] = value
259
260        # Verify all required fields are present
261        if not all(key in parsed_data for key in ["type", "id", "source"]):
262            raise ValueError("Missing required fields in reference string")
263
264        return ParsedMediaReference(
265            media_id=parsed_data["id"],
266            source=parsed_data["source"],
267            content_type=cast(MediaContentType, parsed_data["type"]),
268        )

Parse a media reference string into a ParsedMediaReference.

Example reference string:

"@@@langfuseMedia:type=image/jpeg|id=some-uuid|source=base64_data_uri@@@"

Arguments:
  • reference_string: The reference string to parse.
Returns:

A TypedDict with the media_id, source, and content_type.

Raises:
  • ValueError: If the reference string is empty or not a string.
  • ValueError: If the reference string does not start with "@@@langfuseMedia:type=".
  • ValueError: If the reference string does not end with "@@@".
  • ValueError: If the reference string is missing required fields.
@staticmethod
def resolve_media_references( *, obj: ~T, langfuse_client: Langfuse, resolve_with: Literal['base64_data_uri'], max_depth: int = 10, content_fetch_timeout_seconds: int = 10) -> ~T:
302    @staticmethod
303    def resolve_media_references(
304        *,
305        obj: T,
306        langfuse_client: "Langfuse",
307        resolve_with: Literal["base64_data_uri"],
308        max_depth: int = 10,
309        content_fetch_timeout_seconds: int = 10,
310    ) -> T:
311        """Replace media reference strings in an object with base64 data URIs.
312
313        This method recursively traverses an object (up to max_depth) looking for media reference strings
314        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
315        the provided Langfuse client and replaces the reference string with a base64 data URI.
316
317        If fetching media content fails for a reference string, a warning is logged and the reference
318        string is left unchanged.
319
320        Args:
321            obj: The object to process. Can be a primitive value, array, or nested object.
322                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
323            langfuse_client: Langfuse client instance used to fetch media content.
324            resolve_with: The representation of the media content to replace the media reference string with.
325                Currently only "base64_data_uri" is supported.
326            max_depth: Optional. Default is 10. The maximum depth to traverse the object.
327
328        Returns:
329            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
330            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
331
332        Example:
333            obj = {
334                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
335                "nested": {
336                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
337                }
338            }
339
340            result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
341
342            # Result:
343            # {
344            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
345            #     "nested": {
346            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
347            #     }
348            # }
349        """
350
351        def traverse(obj: Any, depth: int) -> Any:
352            if depth > max_depth:
353                return obj
354
355            # Handle string
356            if isinstance(obj, str):
357                regex = r"@@@langfuseMedia:.+?@@@"
358                reference_string_matches = re.findall(regex, obj)
359                if len(reference_string_matches) == 0:
360                    return obj
361
362                result = obj
363                reference_string_to_media_content = {}
364                httpx_client = (
365                    langfuse_client._resources.httpx_client
366                    if langfuse_client._resources is not None
367                    else None
368                )
369
370                for reference_string in reference_string_matches:
371                    try:
372                        parsed_media_reference = LangfuseMedia.parse_reference_string(
373                            reference_string
374                        )
375                        media_data = langfuse_client.api.media.get(
376                            parsed_media_reference["media_id"]
377                        )
378                        media_content = (
379                            httpx_client.get(
380                                media_data.url,
381                                timeout=content_fetch_timeout_seconds,
382                            )
383                            if httpx_client is not None
384                            else httpx.get(
385                                media_data.url, timeout=content_fetch_timeout_seconds
386                            )
387                        )
388                        media_content.raise_for_status()
389
390                        base64_media_content = base64.b64encode(
391                            media_content.content
392                        ).decode()
393                        base64_data_uri = f"data:{media_data.content_type};base64,{base64_media_content}"
394
395                        reference_string_to_media_content[reference_string] = (
396                            base64_data_uri
397                        )
398                    except Exception as e:
399                        logger.warning(
400                            f"Error fetching media content for reference string {reference_string}: {e}"
401                        )
402                        # Do not replace the reference string if there's an error
403                        continue
404
405                for (
406                    ref_str,
407                    media_content_str,
408                ) in reference_string_to_media_content.items():
409                    result = result.replace(ref_str, media_content_str)
410
411                return result
412
413            # Handle arrays
414            if isinstance(obj, list):
415                return [traverse(item, depth + 1) for item in obj]
416
417            # Handle dictionaries
418            if isinstance(obj, dict):
419                return {key: traverse(value, depth + 1) for key, value in obj.items()}
420
421            # Handle objects:
422            if hasattr(obj, "__dict__"):
423                return {
424                    key: traverse(value, depth + 1)
425                    for key, value in obj.__dict__.items()
426                }
427
428            return obj
429
430        return cast(T, traverse(obj, 0))

Replace media reference strings in an object with base64 data URIs.

This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.

If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.

Arguments:
  • obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
  • langfuse_client: Langfuse client instance used to fetch media content.
  • resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
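  • max_depth: Optional. Default is 10. The maximum depth to traverse the object.
  • content_fetch_timeout_seconds: Optional. Default is 10. Timeout in seconds for each request that fetches media content.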
Returns:

A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

Example:

obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }

result = LangfuseMedia.resolve_media_references(obj=obj, langfuse_client=langfuse_client, resolve_with="base64_data_uri")

Result:

{ "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...", "nested": { "pdf": "data:application/pdf;base64,JVBERi0xLjcK..." } }

@dataclass(frozen=True)
class LangfuseMediaReference:
@dataclass(frozen=True)
class LangfuseMediaReference:
    """Resolved reference to media stored in Langfuse."""

    media_id: str
    content_type: str
    url: str
    url_expiry: Optional[str] = None
    content_length: Optional[int] = None
    reference_string: Optional[str] = None

    def is_url_expired(self) -> bool:
        """Return whether the signed URL is already expired.

        A missing ``url_expiry`` and a value that cannot be parsed as an ISO
        timestamp both count as "not expired". A timestamp without a UTC
        offset is interpreted as UTC.
        """
        raw_expiry = self.url_expiry
        if raw_expiry is None:
            return False

        try:
            # Rewrite a "Z" suffix as an explicit offset before parsing
            expires_at = datetime.fromisoformat(raw_expiry.replace("Z", "+00:00"))
        except ValueError:
            return False

        if expires_at.tzinfo is None:
            expires_at = expires_at.replace(tzinfo=timezone.utc)

        return datetime.now(timezone.utc) >= expires_at

    def fetch_bytes(
        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
    ) -> bytes:
        """Fetch the media content from the signed URL.

        Args:
            timeout: Request timeout in seconds.
            client: Optional httpx client to use for the request. Pass this to
                honor custom transport settings (proxy, CA bundle, mTLS) — in
                particular when multiple Langfuse clients are configured, since
                the SDK cannot otherwise tell which client produced this
                reference. When omitted, the client returned by
                ``LangfuseResourceManager.get_singleton_httpx_client()`` is
                used if there is one, otherwise ``httpx.get`` is called directly.

        Returns:
            The response body as bytes.

        Raises:
            Whatever ``raise_for_status()`` raises for an error response.
        """
        # Imported here rather than at module level
        from langfuse._client.resource_manager import LangfuseResourceManager

        session = client or LangfuseResourceManager.get_singleton_httpx_client()
        if session is not None:
            response = session.get(self.url, timeout=timeout)
        else:
            response = httpx.get(self.url, timeout=timeout)
        response.raise_for_status()

        return response.content

    def fetch_base64(
        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
    ) -> str:
        """Fetch media and return raw base64 without a data URI prefix.

        See :meth:`fetch_bytes` for the ``client`` argument.
        """
        payload = self.fetch_bytes(timeout=timeout, client=client)
        return base64.b64encode(payload).decode()

    def fetch_data_uri(
        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
    ) -> str:
        """Fetch media and return it as a data URI.

        See :meth:`fetch_bytes` for the ``client`` argument.
        """
        encoded = self.fetch_base64(timeout=timeout, client=client)
        return f"data:{self.content_type};base64,{encoded}"

Resolved reference to media stored in Langfuse.

LangfuseMediaReference( media_id: str, content_type: str, url: str, url_expiry: Optional[str] = None, content_length: Optional[int] = None, reference_string: Optional[str] = None)
media_id: str
content_type: str
url: str
url_expiry: Optional[str] = None
content_length: Optional[int] = None
reference_string: Optional[str] = None
def is_url_expired(self) -> bool:
35    def is_url_expired(self) -> bool:
36        """Return whether the signed URL is already expired."""
37        if self.url_expiry is None:
38            return False
39
40        expiry = self.url_expiry.replace("Z", "+00:00")
41
42        try:
43            expiry_datetime = datetime.fromisoformat(expiry)
44        except ValueError:
45            return False
46
47        if expiry_datetime.tzinfo is None:
48            expiry_datetime = expiry_datetime.replace(tzinfo=timezone.utc)
49
50        return expiry_datetime <= datetime.now(timezone.utc)

Return whether the signed URL is already expired.

def fetch_bytes( self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None) -> bytes:
52    def fetch_bytes(
53        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
54    ) -> bytes:
55        """Fetch the media content from the signed URL.
56
57        Args:
58            timeout: Request timeout in seconds.
59            client: Optional httpx client to use for the request. Pass this to
60                honor custom transport settings (proxy, CA bundle, mTLS) — in
61                particular when multiple Langfuse clients are configured, since
62                the SDK cannot otherwise tell which client produced this
63                reference. When omitted, the single configured client is used,
64                falling back to a default httpx client.
65        """
66        from langfuse._client.resource_manager import LangfuseResourceManager
67
68        httpx_client = client or LangfuseResourceManager.get_singleton_httpx_client()
69        response = (
70            httpx_client.get(self.url, timeout=timeout)
71            if httpx_client is not None
72            else httpx.get(self.url, timeout=timeout)
73        )
74        response.raise_for_status()
75
76        return response.content

Fetch the media content from the signed URL.

Arguments:
  • timeout: Request timeout in seconds.
  • client: Optional httpx client to use for the request. Pass this to honor custom transport settings (proxy, CA bundle, mTLS) — in particular when multiple Langfuse clients are configured, since the SDK cannot otherwise tell which client produced this reference. When omitted, the single configured client is used, falling back to a default httpx client.
def fetch_base64( self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None) -> str:
78    def fetch_base64(
79        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
80    ) -> str:
81        """Fetch media and return raw base64 without a data URI prefix.
82
83        See :meth:`fetch_bytes` for the ``client`` argument.
84        """
85        return base64.b64encode(
86            self.fetch_bytes(timeout=timeout, client=client)
87        ).decode()

Fetch media and return raw base64 without a data URI prefix.

See fetch_bytes() for the client argument.

def fetch_data_uri( self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None) -> str:
89    def fetch_data_uri(
90        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
91    ) -> str:
92        """Fetch media and return it as a data URI.
93
94        See :meth:`fetch_bytes` for the ``client`` argument.
95        """
96        return f"data:{self.content_type};base64,{self.fetch_base64(timeout=timeout, client=client)}"

Fetch media and return it as a data URI.

See fetch_bytes() for the client argument.

def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 65def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 66    """Get or create a Langfuse client instance.
 67
 68    Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups,
 69    providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
 70
 71    Behavior:
 72    - Single project: Returns existing client or creates new one
 73    - Multi-project: Requires public_key to return specific client
 74    - No public_key in multi-project: Returns disabled client to prevent data leakage
 75
 76    The function uses a singleton pattern per public_key to conserve resources and maintain state.
 77
 78    Args:
 79        public_key (Optional[str]): Project identifier
 80            - With key: Returns client for that project
 81            - Without key: Returns single client or disabled client if multiple exist
 82
 83    Returns:
 84        Langfuse: Client instance in one of three states:
 85            1. Client for specified public_key
 86            2. Default client for single-project setup
 87            3. Disabled client when multiple projects exist without key
 88
 89    Security:
 90        Disables tracing when multiple projects exist without explicit key to prevent
 91        cross-project data leakage. Multi-project setups are experimental.
 92
 93    Example:
 94        ```python
 95        # Single project
 96        client = get_client()  # Default client
 97
 98        # In multi-project usage:
 99        client_a = get_client(public_key="project_a_key")  # Returns project A's client
100        client_b = get_client(public_key="project_b_key")  # Returns project B's client
101
102        # Without specific key in multi-project setup:
103        client = get_client()  # Returns disabled client for safety
104        ```
105    """
106    with LangfuseResourceManager._lock:
107        active_instances = LangfuseResourceManager._instances
108
109        # If no explicit public_key provided, check execution context
110        if not public_key:
111            public_key = _current_public_key.get(None)
112
113        if not public_key:
114            if len(active_instances) == 0:
115                # No clients initialized yet, create default instance
116                return Langfuse()
117
118            if len(active_instances) == 1:
119                # Only one client exists, safe to use without specifying key
120                instance = list(active_instances.values())[0]
121
122                # Initialize with the credentials bound to the instance
123                # This is important if the original instance was instantiated
124                # via constructor arguments
125                return _create_client_from_instance(instance)
126
127            else:
128                # Multiple clients exist but no key specified - disable tracing
129                # to prevent cross-project data leakage
130                langfuse_logger.warning(
131                    "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
132                )
133                return Langfuse(
134                    tracing_enabled=False, public_key="fake", secret_key="fake"
135                )
136
137        else:
138            # Specific key provided, look up existing instance
139            target_instance: Optional[LangfuseResourceManager] = active_instances.get(
140                public_key, None
141            )
142
143            if target_instance is None:
144                # No instance found with this key - client not initialized properly
145                langfuse_logger.warning(
146                    f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function."
147                )
148                return Langfuse(
149                    tracing_enabled=False, public_key="fake", secret_key="fake"
150                )
151
152            # target_instance is guaranteed to be not None at this point
153            return _create_client_from_instance(target_instance, public_key)

Get or create a Langfuse client instance.

Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.

Behavior:

  • Single project: Returns existing client or creates new one
  • Multi-project: Requires public_key to return specific client
  • No public_key in multi-project: Returns disabled client to prevent data leakage

The function uses a singleton pattern per public_key to conserve resources and maintain state.

Arguments:
  • public_key (Optional[str]): Project identifier
    • With key: Returns client for that project
    • Without key: Returns single client or disabled client if multiple exist
Returns:

Langfuse: Client instance in one of three states:
  1. Client for specified public_key
  2. Default client for single-project setup
  3. Disabled client when multiple projects exist without key

Security:

Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.

Example:
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
def observe( func: Optional[~F] = None, *, name: Optional[str] = None, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], NoneType] = None, capture_input: Optional[bool] = None, capture_output: Optional[bool] = None, transform_to_string: Optional[Callable[[Iterable], str]] = None) -> Union[~F, Callable[[~F], ~F]]:
 88    def observe(
 89        self,
 90        func: Optional[F] = None,
 91        *,
 92        name: Optional[str] = None,
 93        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
 94        capture_input: Optional[bool] = None,
 95        capture_output: Optional[bool] = None,
 96        transform_to_string: Optional[Callable[[Iterable], str]] = None,
 97    ) -> Union[F, Callable[[F], F]]:
 98        """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
 99
100        This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates
101        spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator
102        intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
103
104        Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application,
105        enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
106
107        Args:
108            func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
109            name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
110            as_type (Optional[Literal]): Set the observation type. Supported values:
111                    "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail".
112                    Observation types are highlighted in the Langfuse UI for filtering and visualization.
113                    The types "generation" and "embedding" create a span on which additional attributes such as model metrics
114                    can be set.
                    An unrecognized value is logged as a warning and replaced by "span".
               capture_input (Optional[bool]): Per-function override for input capture, forwarded to the wrapper.
                       When None, the LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED environment variable decides
                       (capture is enabled unless the variable is set to "false" or "0", case-insensitively).
               capture_output (Optional[bool]): Per-function override for output capture, forwarded to the wrapper.
                       When None, the same environment variable fallback as for capture_input applies.
               transform_to_string (Optional[Callable[[Iterable], str]]): Forwarded unchanged to the sync/async wrapper.
                       NOTE(review): presumably used to turn an iterable (e.g. generator) result into the recorded
                       output string - confirm against _sync_observe/_async_observe.
115
116        Returns:
117            Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
118
119        Example:
120            For general function tracing with automatic naming:
121            ```python
122            @observe()
123            def process_user_request(user_id, query):
124                # Function is automatically traced with name "process_user_request"
125                return get_response(query)
126            ```
127
128            For language model generation tracking:
129            ```python
130            @observe(name="answer-generation", as_type="generation")
131            async def generate_answer(query):
132                # Creates a generation-type span with extended LLM metrics
133                response = await openai.chat.completions.create(
134                    model="gpt-4",
135                    messages=[{"role": "user", "content": query}]
136                )
137                return response.choices[0].message.content
138            ```
139
140            For trace context propagation between functions:
141            ```python
142            @observe()
143            def main_process():
144                # Parent span is created
145                return sub_process()  # Child span automatically connected to parent
146
147            @observe()
148            def sub_process():
149                # Automatically becomes a child span of main_process
150                return "result"
151            ```
152
153        Raises:
154            Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
155
156        Notes:
157            - The decorator preserves the original function's signature, docstring, and return type.
158            - Proper parent-child relationships between spans are automatically maintained.
159            - Special keyword arguments can be passed to control tracing:
160              - langfuse_trace_id: Explicitly set the trace ID for this function call
161              - langfuse_parent_observation_id: Explicitly set the parent span ID
162              - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
163            - For async functions, the decorator returns an async function wrapper.
164            - For sync functions, the decorator returns a synchronous wrapper.
165        """
166        valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
167        if as_type is not None and as_type not in valid_types:
168            logger.warning(
169                f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'."
170            )
171            as_type = "span"
172
           # IO capture defaults to on; only the values "false" or "0" (any case) in the
           # environment variable turn it off. Explicit capture_input/capture_output
           # arguments below take precedence over this environment default.
173        function_io_capture_enabled = os.environ.get(
174            LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True"
175        ).lower() not in ("false", "0")
176
177        should_capture_input = (
178            capture_input if capture_input is not None else function_io_capture_enabled
179        )
180
181        should_capture_output = (
182            capture_output
183            if capture_output is not None
184            else function_io_capture_enabled
185        )
186
187        def decorator(func: F) -> F:
               # Pick the async or sync wrapper based on whether func is a coroutine function.
188            return (
189                self._async_observe(
190                    func,
191                    name=name,
192                    as_type=as_type,
193                    capture_input=should_capture_input,
194                    capture_output=should_capture_output,
195                    transform_to_string=transform_to_string,
196                )
197                if asyncio.iscoroutinefunction(func)
198                else self._sync_observe(
199                    func,
200                    name=name,
201                    as_type=as_type,
202                    capture_input=should_capture_input,
203                    capture_output=should_capture_output,
204                    transform_to_string=transform_to_string,
205                )
206            )
207
208        """Handle decorator with or without parentheses.
209
210        This logic enables the decorator to work both with and without parentheses:
211        - @observe - Python passes the function directly to the decorator
212        - @observe() - Python calls the decorator first, which must return a function decorator
213
214        When called without arguments (@observe), the func parameter contains the function to decorate,
215        so we directly apply the decorator to it. When called with parentheses (@observe()),
216        func is None, so we return the decorator function itself for Python to apply in the next step.
217        """
218        if func is None:
219            return decorator
220        else:
221            return decorator(func)

Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.

This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.

Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.

Arguments:
  • func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
  • name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
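  • as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set. An unrecognized value is logged as a warning and replaced by "span".
  • capture_input (Optional[bool]): Per-function override for input capture. When None, the LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED environment variable decides (capture is enabled unless the variable is set to "false" or "0", case-insensitively).
  • capture_output (Optional[bool]): Per-function override for output capture. When None, the same environment variable fallback as for capture_input applies.
  • transform_to_string (Optional[Callable[[Iterable], str]]): Optional callable that receives an iterable and returns a string; it is forwarded unchanged to the sync/async wrapper.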
Returns:

Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.

Example:

For general function tracing with automatic naming:

@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)

For language model generation tracking:

@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content

For trace context propagation between functions:

@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
Raises:
  • Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
  • The decorator preserves the original function's signature, docstring, and return type.
  • Proper parent-child relationships between spans are automatically maintained.
  • Special keyword arguments can be passed to control tracing:
    • langfuse_trace_id: Explicitly set the trace ID for this function call
    • langfuse_parent_observation_id: Explicitly set the parent span ID
    • langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
  • For async functions, the decorator returns an async function wrapper.
  • For sync functions, the decorator returns a synchronous wrapper.
def propagate_attributes( *, user_id: Optional[str] = None, session_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, version: Optional[str] = None, tags: Optional[List[str]] = None, trace_name: Optional[str] = None, environment: Optional[str] = None, as_baggage: bool = False) -> opentelemetry.util._decorator._AgnosticContextManager[typing.Any]:
 98def propagate_attributes(
 99    *,
100    user_id: Optional[str] = None,
101    session_id: Optional[str] = None,
102    metadata: Optional[Dict[str, Any]] = None,
103    version: Optional[str] = None,
104    tags: Optional[List[str]] = None,
105    trace_name: Optional[str] = None,
106    environment: Optional[str] = None,
107    as_baggage: bool = False,
108) -> _AgnosticContextManager[Any]:
109    """Propagate trace-level attributes to all spans created within this context.
110
111    This context manager sets attributes on the currently active span AND automatically
112    propagates them to all new child spans created within the context. This is the
113    recommended way to set trace-level attributes like user_id, session_id,
114    environment, and metadata dimensions that should be consistently applied across
115    all observations in a trace.
116
117    **IMPORTANT**: Call this as early as possible within your trace/workflow. Only the
118    currently active span and spans created after entering this context will have these
119    attributes. Pre-existing spans will NOT be retroactively updated.
120
121    **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id,
122    filtering by session_id) only include observations that have the attribute set.
123    If you call `propagate_attributes` late in your workflow, earlier spans won't be
124    included in aggregations for that attribute.
125
126    Args:
127        user_id: User identifier to associate with all spans in this context.
128            Must be US-ASCII string, ≤200 characters. Use this to track which user
129            generated each trace and enable e.g. per-user cost/performance analysis.
130        session_id: Session identifier to associate with all spans in this context.
131            Must be US-ASCII string, ≤200 characters. Use this to group related traces
132            within a user session (e.g., a conversation thread, multi-turn interaction).
133        metadata: Additional key-value metadata to propagate to all spans.
134            - Keys must be US-ASCII strings
135            - Values are coerced to strings
136            - Coerced values must be ≤200 characters
137            - Use for dimensions like internal correlating identifiers
138            - AVOID: large payloads or sensitive data
139        version: Version identifier for parts of your application that are independently versioned, e.g. agents
140        tags: List of tags to categorize the group of observations
141        trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters.
142            Use this to set a consistent trace name for all spans created within this context.
143        environment: Langfuse environment to assign to spans created in this context.
144            Must be a lowercase alphanumeric string with optional hyphens or underscores,
145            must be ≤40 characters, and must not start with "langfuse". This maps to
146            the first-class `langfuse.environment` attribute, not to trace metadata.
147            Use it for request-scoped environments, for example when one shared proxy
148            handles calls from dev, staging, qa, and prod. A propagated environment
149            takes precedence over the local client default configured via
150            `Langfuse(environment=...)` or `LANGFUSE_TRACING_ENVIRONMENT` for spans
151            created while this propagation context is active.
152        as_baggage: If True, propagates attributes using OpenTelemetry baggage for
153            cross-process/service propagation. **Security warning**: When enabled,
154            attribute values are added to HTTP headers on ALL outbound requests.
155            This includes `environment` as the `langfuse_environment` baggage entry.
156            Only enable if values are safe to transmit via HTTP headers and you need
157            cross-service tracing. Default: False.
158
159    Returns:
160        Context manager that propagates attributes to all child spans.
161
162    Example:
163        Basic usage with user and session tracking:
164
165        ```python
166        from langfuse import Langfuse
167
168        langfuse = Langfuse()
169
170        # Set attributes early in the trace
171        with langfuse.start_as_current_observation(name="user_workflow") as span:
172            with langfuse.propagate_attributes(
173                user_id="user_123",
174                session_id="session_abc",
175                environment="production",
176                metadata={"experiment": "variant_a"}
177            ):
178                # All spans created here will have user_id, session_id, environment, and metadata
179                with langfuse.start_observation(name="llm_call") as llm_span:
180                    # This span inherits user_id, session_id, environment, and experiment metadata
181                    ...
182
183                with langfuse.start_generation(name="completion") as gen:
184                    # This span also inherits all attributes
185                    ...
186        ```
187
188        Late propagation (anti-pattern):
189
190        ```python
191        with langfuse.start_as_current_observation(name="workflow") as span:
192            # These spans WON'T have user_id
193            early_span = langfuse.start_observation(name="early_work")
194            early_span.end()
195
196            # Set attributes in the middle
197            with langfuse.propagate_attributes(user_id="user_123"):
198                # Only spans created AFTER this point will have user_id
199                late_span = langfuse.start_observation(name="late_work")
200                late_span.end()
201
202            # Result: Aggregations by user_id will miss "early_work" span
203        ```
204
205        Cross-service propagation with baggage (advanced):
206
207        ```python
208        # Service A - originating service
209        with langfuse.start_as_current_observation(name="api_request"):
210            with langfuse.propagate_attributes(
211                user_id="user_123",
212                session_id="session_abc",
213                environment="staging",
214                as_baggage=True  # Propagate via HTTP headers
215            ):
216                # Make HTTP request to Service B
217                response = requests.get("https://service-b.example.com/api")
218                # user_id, session_id, and environment are now in HTTP headers
219
220        # Service B - downstream service
221        # OpenTelemetry will automatically extract baggage from HTTP headers
222        # and propagate attributes to spans in Service B. If Service B has a local
223        # Langfuse environment configured, the propagated environment wins for
224        # spans created within this context.
225        ```
226
227    Note:
228        - **Validation**: Attribute values (user_id, session_id, version, tags,
229          trace_name) must be strings ≤200 characters. Environment must also match
230          Langfuse's environment format: lowercase alphanumeric with optional
231          hyphens or underscores, must be ≤40 characters, and it must not start with "langfuse". Metadata
232          values are coerced to strings before the 200 character limit is applied.
233          Invalid values will be dropped with a warning logged.
234        - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood,
235          making it compatible with other OTel-instrumented libraries.
236
237    Raises:
238        No exceptions are raised. Invalid values are logged as warnings and dropped.
239    """
240    return _propagate_attributes(
241        user_id=user_id,
242        session_id=session_id,
243        metadata=metadata,
244        version=version,
245        tags=tags,
246        trace_name=trace_name,
247        environment=environment,
248        as_baggage=as_baggage,
249    )

Propagate trace-level attributes to all spans created within this context.

This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, environment, and metadata dimensions that should be consistently applied across all observations in a trace.

IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.

Why this matters: Langfuse aggregation queries (e.g., total cost by user_id, filtering by session_id) only include observations that have the attribute set. If you call propagate_attributes late in your workflow, earlier spans won't be included in aggregations for that attribute.

Arguments:
  • user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
  • session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
  • metadata: Additional key-value metadata to propagate to all spans.
    • Keys must be US-ASCII strings
    • Values are coerced to strings
    • Coerced values must be ≤200 characters
    • Use for dimensions like internal correlating identifiers
    • AVOID: large payloads or sensitive data
  • version: Version identifier for parts of your application that are independently versioned, e.g. agents
  • tags: List of tags to categorize the group of observations
  • trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
  • environment: Langfuse environment to assign to spans created in this context. Must be a lowercase alphanumeric string with optional hyphens or underscores, must be ≤40 characters, and must not start with "langfuse". This maps to the first-class langfuse.environment attribute, not to trace metadata. Use it for request-scoped environments, for example when one shared proxy handles calls from dev, staging, qa, and prod. A propagated environment takes precedence over the local client default configured via Langfuse(environment=...) or LANGFUSE_TRACING_ENVIRONMENT for spans created while this propagation context is active.
  • as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. This includes environment as the langfuse_environment baggage entry. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:

Context manager that propagates attributes to all child spans.

Example:

Basic usage with user and session tracking:

from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_observation(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        environment="production",
        metadata={"experiment": "variant_a"}
    ):
        # All spans created here will have user_id, session_id, environment, and metadata
        with langfuse.start_observation(name="llm_call") as llm_span:
            # This span inherits user_id, session_id, environment, and experiment metadata
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...

Late propagation (anti-pattern):

with langfuse.start_as_current_observation(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_observation(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_observation(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span

Cross-service propagation with baggage (advanced):

# Service A - originating service
with langfuse.start_as_current_observation(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        environment="staging",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id, session_id, and environment are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate attributes to spans in Service B. If Service B has a local
# Langfuse environment configured, the propagated environment wins for
# spans created within this context.
Note:
  • Validation: Attribute values (user_id, session_id, version, tags, trace_name) must be strings ≤200 characters. Environment must also match Langfuse's environment format: lowercase alphanumeric with optional hyphens or underscores, must be ≤40 characters, and it must not start with "langfuse". Metadata values are coerced to strings before the 200 character limit is applied. Invalid values will be dropped with a warning logged.
  • OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
  • No exceptions are raised. Invalid values are logged as warnings and dropped.
ObservationTypeLiteral = typing.Union[typing.Literal['generation', 'embedding'], typing.Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], typing.Literal['event']]
class LangfuseSpan(langfuse._client.span.LangfuseObservationWrapper):
1267class LangfuseSpan(LangfuseObservationWrapper):
1268    """Standard span implementation for general operations in Langfuse.
1269
1270    This class represents a general-purpose span that can be used to trace
1271    any operation in your application. It extends the base LangfuseObservationWrapper
1272    with specific methods for creating child spans, generations, and updating
1273    span-specific attributes. If possible, use a more specific type for
1274    better observability and insights.
1275    """
1276
1277    def __init__(
1278        self,
1279        *,
1280        otel_span: otel_trace_api.Span,
1281        langfuse_client: "Langfuse",
1282        input: Optional[Any] = None,
1283        output: Optional[Any] = None,
1284        metadata: Optional[Any] = None,
1285        environment: Optional[str] = None,
1286        release: Optional[str] = None,
1287        version: Optional[str] = None,
1288        level: Optional[SpanLevel] = None,
1289        status_message: Optional[str] = None,
1290    ):
1291        """Initialize a new LangfuseSpan.
1292
1293        Args:
1294            otel_span: The OpenTelemetry span to wrap
1295            langfuse_client: Reference to the parent Langfuse client
1296            input: Input data for the span (any JSON-serializable object)
1297            output: Output data from the span (any JSON-serializable object)
1298            metadata: Additional metadata to associate with the span
1299            environment: The tracing environment
1300            release: Release identifier for the application
1301            version: Version identifier for the code or component
1302            level: Importance level of the span ("DEBUG", "DEFAULT", "WARNING", or "ERROR")
1303            status_message: Optional status message for the span
1304        """
            # All arguments are passed through to the base wrapper; the only thing this
            # subclass fixes is the observation type, which is always "span".
1305        super().__init__(
1306            otel_span=otel_span,
1307            as_type="span",
1308            langfuse_client=langfuse_client,
1309            input=input,
1310            output=output,
1311            metadata=metadata,
1312            environment=environment,
1313            release=release,
1314            version=version,
1315            level=level,
1316            status_message=status_message,
1317        )

Standard span implementation for general operations in Langfuse.

This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.

LangfuseSpan( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1277    def __init__(
1278        self,
1279        *,
1280        otel_span: otel_trace_api.Span,
1281        langfuse_client: "Langfuse",
1282        input: Optional[Any] = None,
1283        output: Optional[Any] = None,
1284        metadata: Optional[Any] = None,
1285        environment: Optional[str] = None,
1286        release: Optional[str] = None,
1287        version: Optional[str] = None,
1288        level: Optional[SpanLevel] = None,
1289        status_message: Optional[str] = None,
1290    ):
1291        """Initialize a new LangfuseSpan.
1292
1293        Args:
1294            otel_span: The OpenTelemetry span to wrap
1295            langfuse_client: Reference to the parent Langfuse client
1296            input: Input data for the span (any JSON-serializable object)
1297            output: Output data from the span (any JSON-serializable object)
1298            metadata: Additional metadata to associate with the span
1299            environment: The tracing environment
1300            release: Release identifier for the application
1301            version: Version identifier for the code or component
1302            level: Importance level of the span ("DEBUG", "DEFAULT", "WARNING", or "ERROR")
1303            status_message: Optional status message for the span
1304        """
            # All arguments are passed through to the base wrapper; the only thing this
            # constructor fixes is the observation type, which is always "span".
1305        super().__init__(
1306            otel_span=otel_span,
1307            as_type="span",
1308            langfuse_client=langfuse_client,
1309            input=input,
1310            output=output,
1311            metadata=metadata,
1312            environment=environment,
1313            release=release,
1314            version=version,
1315            level=level,
1316            status_message=status_message,
1317        )

Initialize a new LangfuseSpan.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the span (any JSON-serializable object)
  • output: Output data from the span (any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the code or component
  • level: Importance level of the span ("DEBUG", "DEFAULT", "WARNING", or "ERROR")
  • status_message: Optional status message for the span
class LangfuseGeneration(langfuse._client.span.LangfuseObservationWrapper):
1320class LangfuseGeneration(LangfuseObservationWrapper):
1321    """Specialized span implementation for AI model generations in Langfuse.
1322
1323    This class represents a generation span specifically designed for tracking
1324    AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized
1325    attributes for model details, token usage, and costs.
1326    """
1327
1328    def __init__(
1329        self,
1330        *,
1331        otel_span: otel_trace_api.Span,
1332        langfuse_client: "Langfuse",
1333        input: Optional[Any] = None,
1334        output: Optional[Any] = None,
1335        metadata: Optional[Any] = None,
1336        environment: Optional[str] = None,
1337        release: Optional[str] = None,
1338        version: Optional[str] = None,
1339        level: Optional[SpanLevel] = None,
1340        status_message: Optional[str] = None,
1341        completion_start_time: Optional[datetime] = None,
1342        model: Optional[str] = None,
1343        model_parameters: Optional[Dict[str, MapValue]] = None,
1344        usage_details: Optional[Dict[str, int]] = None,
1345        cost_details: Optional[Dict[str, float]] = None,
1346        prompt: Optional[PromptClient] = None,
1347    ):
1348        """Initialize a new LangfuseGeneration span (forwards all arguments to the base wrapper with as_type="generation").
1349
1350        Args:
1351            otel_span: The OpenTelemetry span to wrap
1352            langfuse_client: Reference to the parent Langfuse client
1353            input: Input data for the generation (e.g., prompts)
1354            output: Output from the generation (e.g., completions)
1355            metadata: Additional metadata to associate with the generation
1356            environment: The tracing environment
1357            release: Release identifier for the application
1358            version: Version identifier for the model or component
1359            level: Importance level of the generation (DEBUG, DEFAULT, WARNING, or ERROR)
1360            status_message: Optional status message for the generation
1361            completion_start_time: When the model started generating the response
1362            model: Name/identifier of the AI model used (e.g., "gpt-4")
1363            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1364            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1365            cost_details: Cost information for the model call
1366            prompt: Associated prompt template from Langfuse prompt management
1367        """
1368        super().__init__(
1369            as_type="generation",
1370            otel_span=otel_span,
1371            langfuse_client=langfuse_client,
1372            input=input,
1373            output=output,
1374            metadata=metadata,
1375            environment=environment,
1376            release=release,
1377            version=version,
1378            level=level,
1379            status_message=status_message,
1380            completion_start_time=completion_start_time,
1381            model=model,
1382            model_parameters=model_parameters,
1383            usage_details=usage_details,
1384            cost_details=cost_details,
1385            prompt=prompt,
1386        )

Specialized span implementation for AI model generations in Langfuse.

This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.

LangfuseGeneration( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None)
1328    def __init__(
1329        self,
1330        *,
1331        otel_span: otel_trace_api.Span,
1332        langfuse_client: "Langfuse",
1333        input: Optional[Any] = None,
1334        output: Optional[Any] = None,
1335        metadata: Optional[Any] = None,
1336        environment: Optional[str] = None,
1337        release: Optional[str] = None,
1338        version: Optional[str] = None,
1339        level: Optional[SpanLevel] = None,
1340        status_message: Optional[str] = None,
1341        completion_start_time: Optional[datetime] = None,
1342        model: Optional[str] = None,
1343        model_parameters: Optional[Dict[str, MapValue]] = None,
1344        usage_details: Optional[Dict[str, int]] = None,
1345        cost_details: Optional[Dict[str, float]] = None,
1346        prompt: Optional[PromptClient] = None,
1347    ):
1348        """Initialize a new LangfuseGeneration span (forwards all arguments to the base wrapper with as_type="generation").
1349
1350        Args:
1351            otel_span: The OpenTelemetry span to wrap
1352            langfuse_client: Reference to the parent Langfuse client
1353            input: Input data for the generation (e.g., prompts)
1354            output: Output from the generation (e.g., completions)
1355            metadata: Additional metadata to associate with the generation
1356            environment: The tracing environment
1357            release: Release identifier for the application
1358            version: Version identifier for the model or component
1359            level: Importance level of the generation (DEBUG, DEFAULT, WARNING, or ERROR)
1360            status_message: Optional status message for the generation
1361            completion_start_time: When the model started generating the response
1362            model: Name/identifier of the AI model used (e.g., "gpt-4")
1363            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1364            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1365            cost_details: Cost information for the model call
1366            prompt: Associated prompt template from Langfuse prompt management
1367        """
1368        super().__init__(
1369            as_type="generation",
1370            otel_span=otel_span,
1371            langfuse_client=langfuse_client,
1372            input=input,
1373            output=output,
1374            metadata=metadata,
1375            environment=environment,
1376            release=release,
1377            version=version,
1378            level=level,
1379            status_message=status_message,
1380            completion_start_time=completion_start_time,
1381            model=model,
1382            model_parameters=model_parameters,
1383            usage_details=usage_details,
1384            cost_details=cost_details,
1385            prompt=prompt,
1386        )

Initialize a new LangfuseGeneration span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the generation (e.g., prompts)
  • output: Output from the generation (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the model or component
  • level: Importance level of the generation (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
class LangfuseEvent(langfuse._client.span.LangfuseObservationWrapper):
1389class LangfuseEvent(LangfuseObservationWrapper):
1390    """Specialized span implementation for Langfuse Events."""
1391
1392    def __init__(
1393        self,
1394        *,
1395        otel_span: otel_trace_api.Span,
1396        langfuse_client: "Langfuse",
1397        input: Optional[Any] = None,
1398        output: Optional[Any] = None,
1399        metadata: Optional[Any] = None,
1400        environment: Optional[str] = None,
1401        release: Optional[str] = None,
1402        version: Optional[str] = None,
1403        level: Optional[SpanLevel] = None,
1404        status_message: Optional[str] = None,
1405    ):
1406        """Initialize a new LangfuseEvent span (forwards all arguments to the base wrapper with as_type="event").
1407
1408        Args:
1409            otel_span: The OpenTelemetry span to wrap
1410            langfuse_client: Reference to the parent Langfuse client
1411            input: Input data for the event
1412            output: Output from the event
1413            metadata: Additional metadata to associate with the event
1414            environment: The tracing environment
1415            release: Release identifier for the application
1416            version: Version identifier for the code or component
1417            level: Importance level of the event (DEBUG, DEFAULT, WARNING, or ERROR)
1418            status_message: Optional status message for the event
1419        """
1420        super().__init__(
1421            otel_span=otel_span,
1422            as_type="event",
1423            langfuse_client=langfuse_client,
1424            input=input,
1425            output=output,
1426            metadata=metadata,
1427            environment=environment,
1428            release=release,
1429            version=version,
1430            level=level,
1431            status_message=status_message,
1432        )
1433
1434    def update(
1435        self,
1436        *,
1437        name: Optional[str] = None,
1438        input: Optional[Any] = None,
1439        output: Optional[Any] = None,
1440        metadata: Optional[Any] = None,
1441        version: Optional[str] = None,
1442        level: Optional[SpanLevel] = None,
1443        status_message: Optional[str] = None,
1444        completion_start_time: Optional[datetime] = None,
1445        model: Optional[str] = None,
1446        model_parameters: Optional[Dict[str, MapValue]] = None,
1447        usage_details: Optional[Dict[str, int]] = None,
1448        cost_details: Optional[Dict[str, float]] = None,
1449        prompt: Optional[PromptClient] = None,
1450        **kwargs: Any,
1451    ) -> "LangfuseEvent":
1452        """Update is not allowed for LangfuseEvent because events cannot be updated.
1453
1454        This method ignores every argument, logs a warning, and returns self without making changes.
1455
1456        Returns:
1457            self: Returns the unchanged LangfuseEvent instance
1458        """
1459        langfuse_logger.warning(
1460            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1461        )
1462        return self

Specialized span implementation for Langfuse Events.

LangfuseEvent( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1392    def __init__(
1393        self,
1394        *,
1395        otel_span: otel_trace_api.Span,
1396        langfuse_client: "Langfuse",
1397        input: Optional[Any] = None,
1398        output: Optional[Any] = None,
1399        metadata: Optional[Any] = None,
1400        environment: Optional[str] = None,
1401        release: Optional[str] = None,
1402        version: Optional[str] = None,
1403        level: Optional[SpanLevel] = None,
1404        status_message: Optional[str] = None,
1405    ):
1406        """Initialize a new LangfuseEvent span (forwards all arguments to the base wrapper with as_type="event").
1407
1408        Args:
1409            otel_span: The OpenTelemetry span to wrap
1410            langfuse_client: Reference to the parent Langfuse client
1411            input: Input data for the event
1412            output: Output from the event
1413            metadata: Additional metadata to associate with the event
1414            environment: The tracing environment
1415            release: Release identifier for the application
1416            version: Version identifier for the code or component
1417            level: Importance level of the event (DEBUG, DEFAULT, WARNING, or ERROR)
1418            status_message: Optional status message for the event
1419        """
1420        super().__init__(
1421            otel_span=otel_span,
1422            as_type="event",
1423            langfuse_client=langfuse_client,
1424            input=input,
1425            output=output,
1426            metadata=metadata,
1427            environment=environment,
1428            release=release,
1429            version=version,
1430            level=level,
1431            status_message=status_message,
1432        )

Initialize a new LangfuseEvent span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the event
  • output: Output from the event
  • metadata: Additional metadata to associate with the event
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the code or component
  • level: Importance level of the event (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the event
def update( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, **kwargs: Any) -> LangfuseEvent:
1434    def update(
1435        self,
1436        *,
1437        name: Optional[str] = None,
1438        input: Optional[Any] = None,
1439        output: Optional[Any] = None,
1440        metadata: Optional[Any] = None,
1441        version: Optional[str] = None,
1442        level: Optional[SpanLevel] = None,
1443        status_message: Optional[str] = None,
1444        completion_start_time: Optional[datetime] = None,
1445        model: Optional[str] = None,
1446        model_parameters: Optional[Dict[str, MapValue]] = None,
1447        usage_details: Optional[Dict[str, int]] = None,
1448        cost_details: Optional[Dict[str, float]] = None,
1449        prompt: Optional[PromptClient] = None,
1450        **kwargs: Any,
1451    ) -> "LangfuseEvent":
1452        """Update is not allowed for LangfuseEvent because events cannot be updated.
1453
1454        This method ignores every argument, logs a warning, and returns self without making changes.
1455
1456        Returns:
1457            self: Returns the unchanged LangfuseEvent instance
1458        """
1459        langfuse_logger.warning(
1460            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1461        )
1462        return self

Update is not allowed for LangfuseEvent because events cannot be updated.

This method logs a warning and returns self without making changes.

Returns:

self: Returns the unchanged LangfuseEvent instance

class LangfuseOtelSpanAttributes:
28class LangfuseOtelSpanAttributes:
29    # Trace-level attribute keys (user and session IDs use the un-prefixed "user.id" / "session.id" keys)
30    TRACE_NAME = "langfuse.trace.name"
31    TRACE_USER_ID = "user.id"
32    TRACE_SESSION_ID = "session.id"
33    TRACE_TAGS = "langfuse.trace.tags"
34    TRACE_PUBLIC = "langfuse.trace.public"
35    TRACE_METADATA = "langfuse.trace.metadata"
36    TRACE_INPUT = "langfuse.trace.input"
37    TRACE_OUTPUT = "langfuse.trace.output"
38
39    # Observation-level attribute keys
40    OBSERVATION_TYPE = "langfuse.observation.type"
41    OBSERVATION_METADATA = "langfuse.observation.metadata"
42    OBSERVATION_LEVEL = "langfuse.observation.level"
43    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
44    OBSERVATION_INPUT = "langfuse.observation.input"
45    OBSERVATION_OUTPUT = "langfuse.observation.output"
46
47    # Attribute keys for observations of type generation (model, usage, cost, prompt)
48    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
49    OBSERVATION_MODEL = "langfuse.observation.model.name"
50    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
51    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
52    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
53    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
54    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"
55
56    # General attribute keys (environment, release, version)
57    ENVIRONMENT = "langfuse.environment"
58    RELEASE = "langfuse.release"
59    VERSION = "langfuse.version"
60
61    # Internal marker keys under the "langfuse.internal." prefix
62    AS_ROOT = "langfuse.internal.as_root"
63    IS_APP_ROOT = "langfuse.internal.is_app_root"
64
65    # Experiment attribute keys under the "langfuse.experiment." prefix
66    EXPERIMENT_ID = "langfuse.experiment.id"
67    EXPERIMENT_NAME = "langfuse.experiment.name"
68    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
69    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
70    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
71    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
72    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
73    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
74    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
TRACE_NAME = 'langfuse.trace.name'
TRACE_USER_ID = 'user.id'
TRACE_SESSION_ID = 'session.id'
TRACE_TAGS = 'langfuse.trace.tags'
TRACE_PUBLIC = 'langfuse.trace.public'
TRACE_METADATA = 'langfuse.trace.metadata'
TRACE_INPUT = 'langfuse.trace.input'
TRACE_OUTPUT = 'langfuse.trace.output'
OBSERVATION_TYPE = 'langfuse.observation.type'
OBSERVATION_METADATA = 'langfuse.observation.metadata'
OBSERVATION_LEVEL = 'langfuse.observation.level'
OBSERVATION_STATUS_MESSAGE = 'langfuse.observation.status_message'
OBSERVATION_INPUT = 'langfuse.observation.input'
OBSERVATION_OUTPUT = 'langfuse.observation.output'
OBSERVATION_COMPLETION_START_TIME = 'langfuse.observation.completion_start_time'
OBSERVATION_MODEL = 'langfuse.observation.model.name'
OBSERVATION_MODEL_PARAMETERS = 'langfuse.observation.model.parameters'
OBSERVATION_USAGE_DETAILS = 'langfuse.observation.usage_details'
OBSERVATION_COST_DETAILS = 'langfuse.observation.cost_details'
OBSERVATION_PROMPT_NAME = 'langfuse.observation.prompt.name'
OBSERVATION_PROMPT_VERSION = 'langfuse.observation.prompt.version'
ENVIRONMENT = 'langfuse.environment'
RELEASE = 'langfuse.release'
VERSION = 'langfuse.version'
AS_ROOT = 'langfuse.internal.as_root'
IS_APP_ROOT = 'langfuse.internal.is_app_root'
EXPERIMENT_ID = 'langfuse.experiment.id'
EXPERIMENT_NAME = 'langfuse.experiment.name'
EXPERIMENT_DESCRIPTION = 'langfuse.experiment.description'
EXPERIMENT_METADATA = 'langfuse.experiment.metadata'
EXPERIMENT_DATASET_ID = 'langfuse.experiment.dataset.id'
EXPERIMENT_ITEM_ID = 'langfuse.experiment.item.id'
EXPERIMENT_ITEM_EXPECTED_OUTPUT = 'langfuse.experiment.item.expected_output'
EXPERIMENT_ITEM_METADATA = 'langfuse.experiment.item.metadata'
EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = 'langfuse.experiment.item.root_observation_id'
class LangfuseAgent(langfuse._client.span.LangfuseObservationWrapper):
1465class LangfuseAgent(LangfuseObservationWrapper):
1466    """Agent observation for reasoning blocks that act on tools using LLM guidance."""
1467
1468    def __init__(self, **kwargs: Any) -> None:
1469        """Initialize a new LangfuseAgent span; any caller-supplied ``as_type`` is overwritten with "agent"."""
1470        kwargs["as_type"] = "agent"
1471        super().__init__(**kwargs)

Agent observation for reasoning blocks that act on tools using LLM guidance.

LangfuseAgent(**kwargs: Any)
1468    def __init__(self, **kwargs: Any) -> None:
1469        """Initialize a new LangfuseAgent span; any caller-supplied ``as_type`` is overwritten with "agent"."""
1470        kwargs["as_type"] = "agent"
1471        super().__init__(**kwargs)

Initialize a new LangfuseAgent span.

class LangfuseTool(langfuse._client.span.LangfuseObservationWrapper):
1474class LangfuseTool(LangfuseObservationWrapper):
1475    """Tool observation representing external tool calls, e.g., calling a weather API."""
1476
1477    def __init__(self, **kwargs: Any) -> None:
1478        """Initialize a new LangfuseTool span; any caller-supplied ``as_type`` is overwritten with "tool"."""
1479        kwargs["as_type"] = "tool"
1480        super().__init__(**kwargs)

Tool observation representing external tool calls, e.g., calling a weather API.

LangfuseTool(**kwargs: Any)
1477    def __init__(self, **kwargs: Any) -> None:
1478        """Initialize a new LangfuseTool span; any caller-supplied ``as_type`` is overwritten with "tool"."""
1479        kwargs["as_type"] = "tool"
1480        super().__init__(**kwargs)

Initialize a new LangfuseTool span.

class LangfuseChain(langfuse._client.span.LangfuseObservationWrapper):
1483class LangfuseChain(LangfuseObservationWrapper):
1484    """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM."""
1485
1486    def __init__(self, **kwargs: Any) -> None:
1487        """Initialize a new LangfuseChain span; any caller-supplied ``as_type`` is overwritten with "chain"."""
1488        kwargs["as_type"] = "chain"
1489        super().__init__(**kwargs)

Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.

LangfuseChain(**kwargs: Any)
1486    def __init__(self, **kwargs: Any) -> None:
1487        """Initialize a new LangfuseChain span; any caller-supplied ``as_type`` is overwritten with "chain"."""
1488        kwargs["as_type"] = "chain"
1489        super().__init__(**kwargs)

Initialize a new LangfuseChain span.

class LangfuseEmbedding(langfuse._client.span.LangfuseObservationWrapper):
1501class LangfuseEmbedding(LangfuseObservationWrapper):
1502    """Embedding observation for LLM embedding calls, typically used before retrieval."""
1503
1504    def __init__(self, **kwargs: Any) -> None:
1505        """Initialize a new LangfuseEmbedding span; any caller-supplied ``as_type`` is overwritten with "embedding"."""
1506        kwargs["as_type"] = "embedding"
1507        super().__init__(**kwargs)

Embedding observation for LLM embedding calls, typically used before retrieval.

LangfuseEmbedding(**kwargs: Any)
1504    def __init__(self, **kwargs: Any) -> None:
1505        """Initialize a new LangfuseEmbedding span; any caller-supplied ``as_type`` is overwritten with "embedding"."""
1506        kwargs["as_type"] = "embedding"
1507        super().__init__(**kwargs)

Initialize a new LangfuseEmbedding span.

class LangfuseEvaluator(langfuse._client.span.LangfuseObservationWrapper):
1510class LangfuseEvaluator(LangfuseObservationWrapper):
1511    """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs."""
1512
1513    def __init__(self, **kwargs: Any) -> None:
1514        """Initialize a new LangfuseEvaluator span; any caller-supplied ``as_type`` is overwritten with "evaluator"."""
1515        kwargs["as_type"] = "evaluator"
1516        super().__init__(**kwargs)

Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.

LangfuseEvaluator(**kwargs: Any)
1513    def __init__(self, **kwargs: Any) -> None:
1514        """Initialize a new LangfuseEvaluator span; any caller-supplied ``as_type`` is overwritten with "evaluator"."""
1515        kwargs["as_type"] = "evaluator"
1516        super().__init__(**kwargs)

Initialize a new LangfuseEvaluator span.

class LangfuseRetriever(langfuse._client.span.LangfuseObservationWrapper):
1492class LangfuseRetriever(LangfuseObservationWrapper):
1493    """Retriever observation for data retrieval steps, e.g. vector store or database queries."""
1494
1495    def __init__(self, **kwargs: Any) -> None:
1496        """Initialize a new LangfuseRetriever span; any caller-supplied ``as_type`` is overwritten with "retriever"."""
1497        kwargs["as_type"] = "retriever"
1498        super().__init__(**kwargs)

Retriever observation for data retrieval steps, e.g. vector store or database queries.

LangfuseRetriever(**kwargs: Any)
1495    def __init__(self, **kwargs: Any) -> None:
1496        """Initialize a new LangfuseRetriever span; any caller-supplied ``as_type`` is overwritten with "retriever"."""
1497        kwargs["as_type"] = "retriever"
1498        super().__init__(**kwargs)

Initialize a new LangfuseRetriever span.

class LangfuseGuardrail(langfuse._client.span.LangfuseObservationWrapper):
1519class LangfuseGuardrail(LangfuseObservationWrapper):
1520    """Guardrail observation for protection e.g. against jailbreaks or offensive content."""
1521
1522    def __init__(self, **kwargs: Any) -> None:
1523        """Initialize a new LangfuseGuardrail span; any caller-supplied ``as_type`` is overwritten with "guardrail"."""
1524        kwargs["as_type"] = "guardrail"
1525        super().__init__(**kwargs)

Guardrail observation for protection e.g. against jailbreaks or offensive content.

LangfuseGuardrail(**kwargs: Any)
1522    def __init__(self, **kwargs: Any) -> None:
1523        """Initialize a new LangfuseGuardrail span; any caller-supplied ``as_type`` is overwritten with "guardrail"."""
1524        kwargs["as_type"] = "guardrail"
1525        super().__init__(**kwargs)

Initialize a new LangfuseGuardrail span.

class Evaluation:
101class Evaluation:
102    """Represents an evaluation result for an experiment item or an entire experiment run.
103
104    This class provides a strongly-typed way to create evaluation results in evaluator functions.
105    Users must use keyword arguments when instantiating this class.
106
107    Attributes:
108        name: Unique identifier for the evaluation metric. Should be descriptive
109            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
110            Used for aggregation and comparison across experiment runs.
111        value: The evaluation score or result. Can be:
112            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
113            - String: For categorical results like "positive", "negative", "neutral"
114            - Boolean: For binary assessments like "passes_safety_check"
115        comment: Optional human-readable explanation of the evaluation result.
116            Useful for providing context, explaining scoring rationale, or noting
117            special conditions. Displayed in Langfuse UI for interpretability.
118        metadata: Optional structured metadata about the evaluation process.
119            Can include confidence scores, intermediate calculations, model versions,
120            or any other relevant technical details.
121        data_type: Optional score data type. Required if value is not NUMERIC.
122            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
123        config_id: Optional Langfuse score config ID.
124
125    Examples:
126        Basic accuracy evaluation:
127        ```python
128        from langfuse import Evaluation
129
130        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
131            if not expected_output:
132                return Evaluation(name="accuracy", value=0, comment="No expected output")
133
134            is_correct = output.strip().lower() == expected_output.strip().lower()
135            return Evaluation(
136                name="accuracy",
137                value=1.0 if is_correct else 0.0,
138                comment="Correct answer" if is_correct else "Incorrect answer"
139            )
140        ```
141
142        Multi-metric evaluator:
143        ```python
144        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
145            return [
146                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
147                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting", data_type="BOOLEAN"),
148                Evaluation(
149                    name="quality",
150                    value=0.85,
151                    comment="High quality response",
152                    metadata={"confidence": 0.92, "model": "gpt-4"}
153                )
154            ]
155        ```
156
157        Categorical evaluation:
158        ```python
159        def sentiment_evaluator(*, input, output, **kwargs):
160            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
161            return Evaluation(
162                name="sentiment",
163                value=sentiment,
164                comment=f"Response expresses {sentiment} sentiment",
165                data_type="CATEGORICAL"
166            )
167        ```
168
169        Failed evaluation with error handling:
170        ```python
171        def external_api_evaluator(*, input, output, **kwargs):
172            try:
173                score = external_api.evaluate(output)
174                return Evaluation(name="external_score", value=score)
175            except Exception as e:
176                return Evaluation(
177                    name="external_score",
178                    value=0,
179                    comment=f"API unavailable: {e}",
180                    metadata={"error": str(e), "retry_count": 3}
181                )
182        ```
183
184    Note:
185        All arguments must be passed as keywords. Positional arguments are not allowed
186        to ensure code clarity and prevent errors from argument reordering.
187    """
188
189    def __init__(
190        self,
191        *,
192        name: str,
193        value: Union[int, float, str, bool],
194        comment: Optional[str] = None,
195        metadata: Optional[Dict[str, Any]] = None,
196        data_type: Optional[ExperimentScoreType] = None,
197        config_id: Optional[str] = None,
198    ):
199        """Initialize an Evaluation by storing each argument on the instance unchanged.
200
201        Args:
202            name: Unique identifier for the evaluation metric.
203            value: The evaluation score or result.
204            comment: Optional human-readable explanation of the result.
205            metadata: Optional structured metadata about the evaluation process.
206            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
207            config_id: Optional Langfuse score config ID.
208
209        Note:
210            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
211        """
212        self.name = name
213        self.value = value
214        self.comment = comment
215        self.metadata = metadata
216        self.data_type = data_type
217        self.config_id = config_id

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=0, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=0,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, config_id: Optional[str] = None)
189    def __init__(
190        self,
191        *,
192        name: str,
193        value: Union[int, float, str, bool],
194        comment: Optional[str] = None,
195        metadata: Optional[Dict[str, Any]] = None,
196        data_type: Optional[ExperimentScoreType] = None,
197        config_id: Optional[str] = None,
198    ):
199        """Initialize an Evaluation with the provided data.
200
201        Args:
202            name: Unique identifier for the evaluation metric.
203            value: The evaluation score or result.
204            comment: Optional human-readable explanation of the result.
205            metadata: Optional structured metadata about the evaluation process.
206            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
207            config_id: Optional Langfuse score config ID.
208
209        Note:
210            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
211        """
212        self.name = name
213        self.value = value
214        self.comment = comment
215        self.metadata = metadata
216        self.data_type = data_type
217        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
class EvaluatorInputs:
 38class EvaluatorInputs:
 39    """Input data structure for evaluators, returned by mapper functions.
 40
 41    This class provides a strongly-typed container for transforming API response
 42    objects (traces, observations) into the standardized format expected
 43    by evaluator functions. It ensures consistent access to input, output, expected
 44    output, and metadata regardless of the source entity type.
 45
 46    Attributes:
 47        input: The input data that was provided to generate the output being evaluated.
 48            For traces, this might be the initial prompt or request. For observations,
 49            this could be the span's input. The exact meaning depends on your use case.
 50        output: The actual output that was produced and needs to be evaluated.
 51            For traces, this is typically the final response. For observations,
 52            this might be the generation output or span result.
 53        expected_output: Optional ground truth or expected result for comparison.
 54            Used by evaluators to assess correctness. May be None if no ground truth
 55            is available for the entity being evaluated.
 56        metadata: Optional structured metadata providing additional context for evaluation.
 57            Can include information about the entity, execution context, user attributes,
 58            or any other relevant data that evaluators might use.
 59
 60    Examples:
 61        Simple mapper for traces:
 62        ```python
 63        from langfuse import EvaluatorInputs
 64
 65        def trace_mapper(trace):
 66            return EvaluatorInputs(
 67                input=trace.input,
 68                output=trace.output,
 69                expected_output=None,  # No ground truth available
 70                metadata={"user_id": trace.user_id, "tags": trace.tags}
 71            )
 72        ```
 73
 74        Mapper for observations extracting specific fields:
 75        ```python
 76        def observation_mapper(observation):
 77            # Extract input/output from observation's data
 78            input_data = observation.input if hasattr(observation, 'input') else None
 79            output_data = observation.output if hasattr(observation, 'output') else None
 80
 81            return EvaluatorInputs(
 82                input=input_data,
 83                output=output_data,
 84                expected_output=None,
 85                metadata={
 86                    "observation_type": observation.type,
 87                    "model": observation.model,
 88                    "latency_ms": observation.end_time - observation.start_time
 89                }
 90            )
 91        ```
 92
 93
 94    Note:
 95        All arguments must be passed as keywords when instantiating this class.
 96    """
 97
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Input data structure for evaluators, returned by mapper functions.

This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.

Attributes:
  • input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
  • output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
  • expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
  • metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:

Simple mapper for traces:

from langfuse import EvaluatorInputs

def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,  # No ground truth available
        metadata={"user_id": trace.user_id, "tags": trace.tags}
    )

Mapper for observations extracting specific fields:

def observation_mapper(observation):
    # Extract input/output from observation's data
    input_data = observation.input if hasattr(observation, 'input') else None
    output_data = observation.output if hasattr(observation, 'output') else None

    return EvaluatorInputs(
        input=input_data,
        output=output_data,
        expected_output=None,
        metadata={
            "observation_type": observation.type,
            "model": observation.model,
            "latency_ms": observation.end_time - observation.start_time
        }
    )

Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorInputs( *, input: Any, output: Any, expected_output: Any = None, metadata: Optional[Dict[str, Any]] = None)
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Initialize EvaluatorInputs with the provided data.

Arguments:
  • input: The input data for evaluation.
  • output: The output data to be evaluated.
  • expected_output: Optional ground truth for comparison.
  • metadata: Optional additional context for evaluation.
Note:

All arguments must be provided as keywords.

input
output
expected_output
metadata
class MapperFunction(typing.Protocol):
123class MapperFunction(Protocol):
124    """Protocol defining the interface for mapper functions in batch evaluation.
125
126    Mapper functions transform API response objects (traces or observations)
127    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
128    allows you to define how to extract and structure evaluation data from different
129    entity types.
130
131    Mapper functions:
132    - Must accept the item to map (a trace or an observation) as the keyword-only argument `item`
133    - Must return an EvaluatorInputs instance with input, output, expected_output, metadata
134    - Can be either synchronous or asynchronous
135    - Should handle missing or malformed data gracefully
136    """
137
138    def __call__(
139        self,
140        *,
141        item: Union["TraceWithFullDetails", "ObservationsView"],
142        **kwargs: Dict[str, Any],
143    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
144        """Transform an API response object into evaluator inputs.
145
146        This method defines how to extract evaluation-relevant data from the raw
147        API response object. The implementation should map entity-specific fields
148        to the standardized input/output/expected_output/metadata structure.
149
150        Args:
151            item: The API response object to transform. The type depends on the scope:
152                - TraceWithFullDetails: When evaluating traces
153                - ObservationsView: When evaluating observations
154
155        Returns:
156            EvaluatorInputs: A structured container with:
157                - input: The input data that generated the output
158                - output: The output to be evaluated
159                - expected_output: Optional ground truth for comparison
160                - metadata: Optional additional context
161
162            Can return either a direct EvaluatorInputs instance or an awaitable
163            (for async mappers that need to fetch additional data).
164
165        Examples:
166            Basic trace mapper:
167            ```python
168            def map_trace(trace):
169                return EvaluatorInputs(
170                    input=trace.input,
171                    output=trace.output,
172                    expected_output=None,
173                    metadata={"trace_id": trace.id, "user": trace.user_id}
174                )
175            ```
176
177            Observation mapper with conditional logic:
178            ```python
179            def map_observation(observation):
180                # Extract fields based on observation type
181                if observation.type == "GENERATION":
182                    input_data = observation.input
183                    output_data = observation.output
184                else:
185                    # For other types, use different fields
186                    input_data = observation.metadata.get("input")
187                    output_data = observation.metadata.get("output")
188
189                return EvaluatorInputs(
190                    input=input_data,
191                    output=output_data,
192                    expected_output=None,
193                    metadata={"obs_id": observation.id, "type": observation.type}
194                )
195            ```
196
197            Async mapper (if additional processing needed):
198            ```python
199            async def map_trace_async(trace):
200                # Could do async processing here if needed
201                processed_output = await some_async_transformation(trace.output)
202
203                return EvaluatorInputs(
204                    input=trace.input,
205                    output=processed_output,
206                    expected_output=None,
207                    metadata={"trace_id": trace.id}
208                )
209            ```
210        """
211        ...

Protocol defining the interface for mapper functions in batch evaluation.

Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.

Mapper functions:

  • Must accept the item to map (a trace or an observation) as the keyword-only argument "item"
  • Must return an EvaluatorInputs instance with input, output, expected_output, metadata
  • Can be either synchronous or asynchronous
  • Should handle missing or malformed data gracefully
MapperFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class CompositeEvaluatorFunction(typing.Protocol):
214class CompositeEvaluatorFunction(Protocol):
215    """Protocol defining the interface for composite evaluator functions.
216
217    Composite evaluators create aggregate scores from multiple item-level evaluations.
218    This is commonly used to compute weighted averages, combined metrics, or other
219    composite assessments based on individual evaluation results.
220
221    Composite evaluators:
222    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
223      plus the list of evaluations
224    - Return either a single Evaluation, a list of Evaluations, or a dict
225    - Can be either synchronous or asynchronous
226    - Have access to both raw item data and evaluation results
227    """
228
229    def __call__(
230        self,
231        *,
232        input: Optional[Any] = None,
233        output: Optional[Any] = None,
234        expected_output: Optional[Any] = None,
235        metadata: Optional[Dict[str, Any]] = None,
236        evaluations: List[Evaluation],
237        **kwargs: Dict[str, Any],
238    ) -> Union[
239        Evaluation,
240        List[Evaluation],
241        Dict[str, Any],
242        Awaitable[Evaluation],
243        Awaitable[List[Evaluation]],
244        Awaitable[Dict[str, Any]],
245    ]:
246        r"""Create a composite evaluation from item-level evaluation results.
247
248        This method combines multiple evaluation scores into a single composite metric.
249        Common use cases include weighted averages, pass/fail decisions based on multiple
250        criteria, or custom scoring logic that considers multiple dimensions.
251
252        Args:
253            input: The input data that was provided to the system being evaluated.
254            output: The output generated by the system being evaluated.
255            expected_output: The expected/reference output for comparison (if available).
256            metadata: Additional metadata about the evaluation context.
257            evaluations: List of evaluation results from item-level evaluators.
258                Each evaluation contains name, value, comment, and metadata.
259
260        Returns:
261            Can return any of:
262            - Evaluation: A single composite evaluation result
263            - List[Evaluation]: Multiple composite evaluations
264            - Dict: A dict that will be converted to an Evaluation
265                - name: Identifier for the composite metric (e.g., "composite_score")
266                - value: The computed composite value
267                - comment: Optional explanation of how the score was computed
268                - metadata: Optional details about the composition logic
269
270            Can return either a direct Evaluation instance or an awaitable
271            (for async composite evaluators).
272
273        Examples:
274            Simple weighted average:
275            ```python
276            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
277                weights = {
278                    "accuracy": 0.5,
279                    "relevance": 0.3,
280                    "safety": 0.2
281                }
282
283                total_score = 0.0
284                total_weight = 0.0
285
286                for eval in evaluations:
287                    if eval.name in weights and isinstance(eval.value, (int, float)):
288                        total_score += eval.value * weights[eval.name]
289                        total_weight += weights[eval.name]
290
291                final_score = total_score / total_weight if total_weight > 0 else 0.0
292
293                return Evaluation(
294                    name="composite_score",
295                    value=final_score,
296                    comment=f"Weighted average of {len(evaluations)} metrics"
297                )
298            ```
299
300            Pass/fail composite based on thresholds:
301            ```python
302            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
303                # Must pass all criteria
304                thresholds = {
305                    "accuracy": 0.7,
306                    "safety": 0.9,
307                    "relevance": 0.6
308                }
309
310                passes = True
311                failing_metrics = []
312
313                for metric, threshold in thresholds.items():
314                    eval_result = next((e for e in evaluations if e.name == metric), None)
315                    if eval_result and isinstance(eval_result.value, (int, float)):
316                        if eval_result.value < threshold:
317                            passes = False
318                            failing_metrics.append(metric)
319
320                return Evaluation(
321                    name="passes_all_checks",
322                    value=passes,
323                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
324                    data_type="BOOLEAN"
325                )
326            ```
327
328            Async composite with external scoring:
329            ```python
330            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
331                # Use LLM to synthesize multiple evaluation results
332                eval_summary = "\n".join(
333                    f"- {e.name}: {e.value}" for e in evaluations
334                )
335
336                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
337                prompt += f"For the output: {output}\n"
338                prompt += "Provide an overall quality score from 0-1."
339
340                response = await openai.chat.completions.create(
341                    model="gpt-4",
342                    messages=[{"role": "user", "content": prompt}]
343                )
344
345                score = float(response.choices[0].message.content.strip())
346
347                return Evaluation(
348                    name="llm_composite_score",
349                    value=score,
350                    comment="LLM-synthesized composite score"
351                )
352            ```
353
354            Context-aware composite:
355            ```python
356            def context_composite(*, input, output, expected_output, metadata, evaluations):
357                # Adjust weighting based on metadata
358                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}
359
360                # If metadata indicates high importance, prioritize accuracy
361                if metadata and metadata.get('importance') == 'high':
362                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
363                else:
364                    weights = base_weights
365
366                total = sum(
367                    e.value * weights.get(e.name, 0)
368                    for e in evaluations
369                    if isinstance(e.value, (int, float))
370                )
371
372                return Evaluation(
373                    name="weighted_composite",
374                    value=total,
375                    comment="Context-aware weighted composite"
376                )
377            ```
378        """
379        ...

Protocol defining the interface for composite evaluator functions.

Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.

Composite evaluators:

  • Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
  • Return either a single Evaluation, a list of Evaluations, or a dict
  • Can be either synchronous or asynchronous
  • Have access to both raw item data and evaluation results
CompositeEvaluatorFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class EvaluatorStats:
382class EvaluatorStats:
383    """Statistics for a single evaluator's performance during batch evaluation.
384
385    This class tracks detailed metrics about how a specific evaluator performed
386    across all items in a batch evaluation run. It helps identify evaluator issues,
387    understand reliability, and optimize evaluation pipelines.
388
389    Attributes:
390        name: The name of the evaluator function (extracted from __name__).
391        total_runs: Total number of times the evaluator was invoked.
392        successful_runs: Number of times the evaluator completed successfully.
393        failed_runs: Number of times the evaluator raised an exception or failed.
394        total_scores_created: Total number of evaluation scores created by this evaluator.
395            Can be higher than successful_runs if the evaluator returns multiple scores.
396
397    Examples:
398        Accessing evaluator stats from batch evaluation result:
399        ```python
400        result = client.run_batched_evaluation(...)
401
402        for stats in result.evaluator_stats:
403            print(f"Evaluator: {stats.name}")
404            print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
405            print(f"  Scores created: {stats.total_scores_created}")
406
407            if stats.failed_runs > 0:
408                print(f"  ⚠️  Failed {stats.failed_runs} times")
409        ```
410
411        Identifying problematic evaluators:
412        ```python
413        result = client.run_batched_evaluation(...)
414
415        # Find evaluators with high failure rates
416        for stats in result.evaluator_stats:
417            failure_rate = stats.failed_runs / stats.total_runs
418            if failure_rate > 0.1:  # More than 10% failures
419                print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
420                print(f"    Consider debugging or removing this evaluator")
421        ```
422
423    Note:
424        All arguments must be passed as keywords when instantiating this class.
425    """
426
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Statistics for a single evaluator's performance during batch evaluation.

This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.

Attributes:
  • name: The name of the evaluator function (extracted from __name__).
  • total_runs: Total number of times the evaluator was invoked.
  • successful_runs: Number of times the evaluator completed successfully.
  • failed_runs: Number of times the evaluator raised an exception or failed.
  • total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:

Accessing evaluator stats from batch evaluation result:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    print(f"Evaluator: {stats.name}")
    print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️  Failed {stats.failed_runs} times")

Identifying problematic evaluators:

result = client.run_batched_evaluation(...)

# Find evaluators with high failure rates
for stats in result.evaluator_stats:
    failure_rate = stats.failed_runs / stats.total_runs
    if failure_rate > 0.1:  # More than 10% failures
        print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
        print(f"    Consider debugging or removing this evaluator")
Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorStats( *, name: str, total_runs: int = 0, successful_runs: int = 0, failed_runs: int = 0, total_scores_created: int = 0)
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Initialize EvaluatorStats with the provided metrics.

Arguments:
  • name: The evaluator function name.
  • total_runs: Total number of evaluator invocations.
  • successful_runs: Number of successful completions.
  • failed_runs: Number of failures.
  • total_scores_created: Total scores created by this evaluator.
Note:

All arguments must be provided as keywords.

name
total_runs
successful_runs
failed_runs
total_scores_created
class BatchEvaluationResumeToken:
455class BatchEvaluationResumeToken:
456    """Token for resuming a failed batch evaluation run.
457
458    This class encapsulates all the information needed to resume a batch evaluation
459    that was interrupted or failed partway through. It uses timestamp-based filtering
460    to avoid re-processing items that were already evaluated, even if the underlying
461    dataset changed between runs.
462
463    Attributes:
464        scope: The type of items being evaluated ("traces", "observations").
465        filter: The original JSON filter string used to query items.
466        last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item.
467            Used to construct a filter that only fetches items after this timestamp.
468        last_processed_id: The ID of the last successfully processed item, for reference.
469        items_processed: Count of items successfully processed before interruption.
470
471    Examples:
472        Resuming a failed batch evaluation:
473        ```python
474        # Initial run that fails partway through
475        try:
476            result = client.run_batched_evaluation(
477                scope="traces",
478                mapper=my_mapper,
479                evaluators=[evaluator1, evaluator2],
480                filter='{"tags": ["production"]}',
481                max_items=10000
482            )
483        except Exception as e:
484            print(f"Evaluation failed: {e}")
485
486            # Save the resume token
487            if result.resume_token:
488                # Store resume token for later (e.g., in a file or database)
489                import json
490                with open("resume_token.json", "w") as f:
491                    json.dump({
492                        "scope": result.resume_token.scope,
493                        "filter": result.resume_token.filter,
494                        "last_timestamp": result.resume_token.last_processed_timestamp,
495                        "last_id": result.resume_token.last_processed_id,
496                        "items_done": result.resume_token.items_processed
497                    }, f)
498
499        # Later, resume from where it left off
500        with open("resume_token.json") as f:
501            token_data = json.load(f)
502
503        resume_token = BatchEvaluationResumeToken(
504            scope=token_data["scope"],
505            filter=token_data["filter"],
506            last_processed_timestamp=token_data["last_timestamp"],
507            last_processed_id=token_data["last_id"],
508            items_processed=token_data["items_done"]
509        )
510
511        # Resume the evaluation
512        result = client.run_batched_evaluation(
513            scope="traces",
514            mapper=my_mapper,
515            evaluators=[evaluator1, evaluator2],
516            resume_from=resume_token
517        )
518
519        print(f"Processed {result.total_items_processed} additional items")
520        ```
521
522        Handling partial completion:
523        ```python
524        result = client.run_batched_evaluation(...)
525
526        if not result.completed:
527            if result.resume_token:
528                print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
529                print(f"Last item: {result.resume_token.last_processed_id}")
530                print(f"Resume from: {result.resume_token.last_processed_timestamp}")
531
532                # Optionally retry automatically
533                print("Retrying...")
534                result = client.run_batched_evaluation(
535                    scope=result.resume_token.scope,
536                    mapper=my_mapper,
537                    evaluators=my_evaluators,
538                    resume_from=result.resume_token
539                )
540        ```
541
542    Note:
543        All arguments must be passed as keywords when instantiating this class.
544        Because resuming filters on the timestamp, items added after the initial run
545        started whose timestamps fall before last_processed_timestamp are skipped on
546        resume. This is intentional: it avoids duplicates and keeps evaluation consistent.
547    """
548
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Token for resuming a failed batch evaluation run.

This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.

Attributes:
  • scope: The type of items being evaluated ("traces", "observations").
  • filter: The original JSON filter string used to query items.
  • last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
  • last_processed_id: The ID of the last successfully processed item, for reference.
  • items_processed: Count of items successfully processed before interruption.
Examples:

Resuming a failed batch evaluation:

# Initial run that fails partway through
try:
    result = client.run_batched_evaluation(
        scope="traces",
        mapper=my_mapper,
        evaluators=[evaluator1, evaluator2],
        filter='{"tags": ["production"]}',
        max_items=10000
    )
except Exception as e:
    print(f"Evaluation failed: {e}")

    # Save the resume token
    if result.resume_token:
        # Store resume token for later (e.g., in a file or database)
        import json
        with open("resume_token.json", "w") as f:
            json.dump({
                "scope": result.resume_token.scope,
                "filter": result.resume_token.filter,
                "last_timestamp": result.resume_token.last_processed_timestamp,
                "last_id": result.resume_token.last_processed_id,
                "items_done": result.resume_token.items_processed
            }, f)

# Later, resume from where it left off
with open("resume_token.json") as f:
    token_data = json.load(f)

resume_token = BatchEvaluationResumeToken(
    scope=token_data["scope"],
    filter=token_data["filter"],
    last_processed_timestamp=token_data["last_timestamp"],
    last_processed_id=token_data["last_id"],
    items_processed=token_data["items_done"]
)

# Resume the evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    resume_from=resume_token
)

print(f"Processed {result.total_items_processed} additional items")

Handling partial completion:

result = client.run_batched_evaluation(...)

if not result.completed:
    if result.resume_token:
        print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
        print(f"Last item: {result.resume_token.last_processed_id}")
        print(f"Resume from: {result.resume_token.last_processed_timestamp}")

        # Optionally retry automatically
        print("Retrying...")
        result = client.run_batched_evaluation(
            scope=result.resume_token.scope,
            mapper=my_mapper,
            evaluators=my_evaluators,
            resume_from=result.resume_token
        )
Note:

All arguments must be passed as keywords when instantiating this class. Because resuming filters on the timestamp, items added after the initial run started whose timestamps fall before last_processed_timestamp are skipped on resume. This is intentional: it avoids duplicates and keeps evaluation consistent.

BatchEvaluationResumeToken( *, scope: str, filter: Optional[str], last_processed_timestamp: str, last_processed_id: str, items_processed: int)
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Initialize BatchEvaluationResumeToken with the provided state.

Arguments:
  • scope: The scope type ("traces", "observations").
  • filter: The original JSON filter string.
  • last_processed_timestamp: ISO 8601 timestamp of last processed item.
  • last_processed_id: ID of last processed item.
  • items_processed: Count of items processed before interruption.
Note:

All arguments must be provided as keywords.

scope
filter
last_processed_timestamp
last_processed_id
items_processed
class BatchEvaluationResult:
577class BatchEvaluationResult:
578    r"""Complete result structure for batch evaluation execution.
579
580    This class encapsulates comprehensive statistics and metadata about a batch
581    evaluation run, including counts, evaluator-specific metrics, timing information,
582    error details, and resume capability.
583
584    Attributes:
585        total_items_fetched: Total number of items fetched from the API.
586        total_items_processed: Number of items successfully evaluated.
587        total_items_failed: Number of items that failed during evaluation.
588        total_scores_created: Total scores created by all item-level evaluators.
589        total_composite_scores_created: Scores created by the composite evaluator.
590        total_evaluations_failed: Number of individual evaluator failures across all items.
591        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
592        resume_token: Token for resuming if evaluation was interrupted (None if completed).
593        completed: True if all items were processed, False if stopped early or failed.
594        duration_seconds: Total time taken to execute the batch evaluation.
595        failed_item_ids: List of IDs for items that failed evaluation.
596        error_summary: Dictionary mapping error types to occurrence counts.
597        has_more_items: True if max_items limit was reached but more items exist.
598        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
599
600    Examples:
601        Basic result inspection:
602        ```python
603        result = client.run_batched_evaluation(...)
604
605        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
606        print(f"Scores created: {result.total_scores_created}")
607        print(f"Duration: {result.duration_seconds:.2f}s")
608        print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
609        ```
610
611        Detailed analysis with evaluator stats:
612        ```python
613        result = client.run_batched_evaluation(...)
614
615        print(f"\n📊 Batch Evaluation Results")
616        print(f"{'='*50}")
617        print(f"Items processed: {result.total_items_processed}")
618        print(f"Items failed: {result.total_items_failed}")
619        print(f"Scores created: {result.total_scores_created}")
620
621        if result.total_composite_scores_created > 0:
622            print(f"Composite scores: {result.total_composite_scores_created}")
623
624        print(f"\n📈 Evaluator Performance:")
625        for stats in result.evaluator_stats:
626            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
627            print(f"\n  {stats.name}:")
628            print(f"    Success rate: {success_rate:.1%}")
629            print(f"    Scores created: {stats.total_scores_created}")
630            if stats.failed_runs > 0:
631                print(f"    ⚠️  Failures: {stats.failed_runs}")
632
633        if result.error_summary:
634            print(f"\n⚠️  Errors encountered:")
635            for error_type, count in result.error_summary.items():
636                print(f"    {error_type}: {count}")
637        ```
638
639        Handling incomplete runs:
640        ```python
641        result = client.run_batched_evaluation(...)
642
643        if not result.completed:
644            print("⚠️  Evaluation incomplete!")
645
646            if result.resume_token:
647                print(f"Processed {result.resume_token.items_processed} items before failure")
648                print(f"Use resume_from parameter to continue from:")
649                print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
650                print(f"  Last ID: {result.resume_token.last_processed_id}")
651
652        if result.has_more_items:
653            print(f"ℹ️  More items available beyond max_items limit")
654        ```
655
656        Performance monitoring:
657        ```python
658        result = client.run_batched_evaluation(...)
659
660        items_per_second = result.total_items_processed / result.duration_seconds
661        avg_scores_per_item = result.total_scores_created / result.total_items_processed
662
663        print(f"Performance metrics:")
664        print(f"  Throughput: {items_per_second:.2f} items/second")
665        print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
666        print(f"  Total duration: {result.duration_seconds:.2f}s")
667
668        if result.total_evaluations_failed > 0:
669            failure_rate = result.total_evaluations_failed / (
670                result.total_items_processed * len(result.evaluator_stats)
671            )
672            print(f"  Evaluation failure rate: {failure_rate:.1%}")
673        ```
674
675    Note:
676        All arguments must be passed as keywords when instantiating this class.
677    """
678
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations
732
733    def __str__(self) -> str:
734        """Return a formatted string representation of the batch evaluation results.
735
736        Returns:
737            A multi-line string with a summary of the evaluation results.
738        """
739        lines = []
740        lines.append("=" * 60)
741        lines.append("Batch Evaluation Results")
742        lines.append("=" * 60)
743
744        # Summary statistics
745        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
746        lines.append(f"Duration: {self.duration_seconds:.2f}s")
747        lines.append(f"\nItems fetched: {self.total_items_fetched}")
748        lines.append(f"Items processed: {self.total_items_processed}")
749
750        if self.total_items_failed > 0:
751            lines.append(f"Items failed: {self.total_items_failed}")
752
753        # Success rate
754        if self.total_items_fetched > 0:
755            success_rate = self.total_items_processed / self.total_items_fetched * 100
756            lines.append(f"Success rate: {success_rate:.1f}%")
757
758        # Scores created
759        lines.append(f"\nScores created: {self.total_scores_created}")
760        if self.total_composite_scores_created > 0:
761            lines.append(f"Composite scores: {self.total_composite_scores_created}")
762
763        total_scores = self.total_scores_created + self.total_composite_scores_created
764        lines.append(f"Total scores: {total_scores}")
765
766        # Evaluator statistics
767        if self.evaluator_stats:
768            lines.append("\nEvaluator Performance:")
769            for stats in self.evaluator_stats:
770                lines.append(f"  {stats.name}:")
771                if stats.total_runs > 0:
772                    success_rate = (
773                        stats.successful_runs / stats.total_runs * 100
774                        if stats.total_runs > 0
775                        else 0
776                    )
777                    lines.append(
778                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
779                        f"({success_rate:.1f}% success)"
780                    )
781                    lines.append(f"    Scores created: {stats.total_scores_created}")
782                    if stats.failed_runs > 0:
783                        lines.append(f"    Failed runs: {stats.failed_runs}")
784
785        # Performance metrics
786        if self.total_items_processed > 0 and self.duration_seconds > 0:
787            items_per_sec = self.total_items_processed / self.duration_seconds
788            lines.append("\nPerformance:")
789            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
790            if self.total_scores_created > 0:
791                avg_scores = self.total_scores_created / self.total_items_processed
792                lines.append(f"  Avg scores per item: {avg_scores:.2f}")
793
794        # Errors and warnings
795        if self.error_summary:
796            lines.append("\nErrors encountered:")
797            for error_type, count in self.error_summary.items():
798                lines.append(f"  {error_type}: {count}")
799
800        # Incomplete run information
801        if not self.completed:
802            lines.append("\nWarning: Evaluation incomplete")
803            if self.resume_token:
804                lines.append(
805                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
806                )
807                lines.append(f"  Items processed: {self.resume_token.items_processed}")
808                lines.append("  Use resume_from parameter to continue")
809
810        if self.has_more_items:
811            lines.append("\nNote: More items available beyond max_items limit")
812
813        lines.append("=" * 60)
814        return "\n".join(lines)

Complete result structure for batch evaluation execution.

This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.

Attributes:
  • total_items_fetched: Total number of items fetched from the API.
  • total_items_processed: Number of items successfully evaluated.
  • total_items_failed: Number of items that failed during evaluation.
  • total_scores_created: Total scores created by all item-level evaluators.
  • total_composite_scores_created: Scores created by the composite evaluator.
  • total_evaluations_failed: Number of individual evaluator failures across all items.
  • evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
  • resume_token: Token for resuming if evaluation was interrupted (None if completed).
  • completed: True if all items were processed, False if stopped early or failed.
  • duration_seconds: Total time taken to execute the batch evaluation.
  • failed_item_ids: List of IDs for items that failed evaluation.
  • error_summary: Dictionary mapping error types to occurrence counts.
  • has_more_items: True if max_items limit was reached but more items exist.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:

Basic result inspection:

result = client.run_batched_evaluation(...)

print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
print(f"Scores created: {result.total_scores_created}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")

Detailed analysis with evaluator stats:

result = client.run_batched_evaluation(...)

print(f"\n📊 Batch Evaluation Results")
print(f"{'='*50}")
print(f"Items processed: {result.total_items_processed}")
print(f"Items failed: {result.total_items_failed}")
print(f"Scores created: {result.total_scores_created}")

if result.total_composite_scores_created > 0:
    print(f"Composite scores: {result.total_composite_scores_created}")

print(f"\n📈 Evaluator Performance:")
for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
    print(f"\n  {stats.name}:")
    print(f"    Success rate: {success_rate:.1%}")
    print(f"    Scores created: {stats.total_scores_created}")
    if stats.failed_runs > 0:
        print(f"    ⚠️  Failures: {stats.failed_runs}")

if result.error_summary:
    print(f"\n⚠️  Errors encountered:")
    for error_type, count in result.error_summary.items():
        print(f"    {error_type}: {count}")

Handling incomplete runs:

result = client.run_batched_evaluation(...)

if not result.completed:
    print("⚠️  Evaluation incomplete!")

    if result.resume_token:
        print(f"Processed {result.resume_token.items_processed} items before failure")
        print(f"Use resume_from parameter to continue from:")
        print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
        print(f"  Last ID: {result.resume_token.last_processed_id}")

if result.has_more_items:
    print(f"ℹ️  More items available beyond max_items limit")

Performance monitoring:

result = client.run_batched_evaluation(...)

items_per_second = result.total_items_processed / result.duration_seconds
avg_scores_per_item = result.total_scores_created / result.total_items_processed

print(f"Performance metrics:")
print(f"  Throughput: {items_per_second:.2f} items/second")
print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
print(f"  Total duration: {result.duration_seconds:.2f}s")

if result.total_evaluations_failed > 0:
    failure_rate = result.total_evaluations_failed / (
        result.total_items_processed * len(result.evaluator_stats)
    )
    print(f"  Evaluation failure rate: {failure_rate:.1%}")
Note:

All arguments must be passed as keywords when instantiating this class.

BatchEvaluationResult( *, total_items_fetched: int, total_items_processed: int, total_items_failed: int, total_scores_created: int, total_composite_scores_created: int, total_evaluations_failed: int, evaluator_stats: List[EvaluatorStats], resume_token: Optional[BatchEvaluationResumeToken], completed: bool, duration_seconds: float, failed_item_ids: List[str], error_summary: Dict[str, int], has_more_items: bool, item_evaluations: Dict[str, List[Evaluation]])
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations

Initialize BatchEvaluationResult with comprehensive statistics.

Arguments:
  • total_items_fetched: Total items fetched from API.
  • total_items_processed: Items successfully evaluated.
  • total_items_failed: Items that failed evaluation.
  • total_scores_created: Scores from item-level evaluators.
  • total_composite_scores_created: Scores from composite evaluator.
  • total_evaluations_failed: Individual evaluator failures.
  • evaluator_stats: Per-evaluator statistics.
  • resume_token: Token for resuming (None if completed).
  • completed: Whether all items were processed.
  • duration_seconds: Total execution time.
  • failed_item_ids: IDs of failed items.
  • error_summary: Error types and counts.
  • has_more_items: Whether more items exist beyond max_items.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:

All arguments must be provided as keywords.

total_items_fetched
total_items_processed
total_items_failed
total_scores_created
total_composite_scores_created
total_evaluations_failed
evaluator_stats
resume_token
completed
duration_seconds
failed_item_ids
error_summary
has_more_items
item_evaluations
class RunnerContext:
class RunnerContext:
    """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action builds a
    ``RunnerContext`` before invoking the user's ``experiment(context)``
    function. Defaults set here (dataset, metadata tags) are applied when
    the user omits them on the :meth:`run_experiment` call; users can
    override any default by passing the corresponding argument explicitly.
    """

    def __init__(
        self,
        *,
        client: "Langfuse",
        data: Optional[ExperimentData] = None,
        dataset_version: Optional[datetime] = None,
        metadata: Optional[Dict[str, str]] = None,
    ):
        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.

        Typically called by the ``langfuse/experiment-action`` GitHub Action,
        not by end users directly. Every field except ``client`` is optional:
        a field left as ``None`` contributes no default. Only ``data`` is
        mandatory overall, so when it is ``None`` here it must be supplied on
        the :meth:`run_experiment` call.

        Args:
            client: Initialized Langfuse SDK client used to execute the
                experiment. The action creates this from the
                ``langfuse_public_key`` / ``langfuse_secret_key`` /
                ``langfuse_base_url`` inputs.
            data: Default dataset items to run the experiment on. Accepts
                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
                Injected by the action when ``dataset_name`` is configured.
                If ``None``, the user must pass ``data=`` to
                :meth:`run_experiment`.
            dataset_version: Optional pinned dataset version. Injected by the
                action when ``dataset_version`` is configured.
            metadata: Default metadata attached to every experiment trace and
                the dataset run. The action injects GitHub-sourced tags (SHA,
                PR link, workflow run link, branch, GH user, etc.). Merged
                with any ``metadata`` passed to :meth:`run_experiment`, with
                user-supplied keys winning on collision.
        """
        self.client = client
        self.data = data
        self.dataset_version = dataset_version
        self.metadata = metadata

    def run_experiment(
        self,
        *,
        name: str,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
        data: Optional[ExperimentData] = None,
        task: TaskFunction,
        evaluators: Optional[List[EvaluatorFunction]] = None,
        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
        run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
        max_concurrency: int = 50,
        metadata: Optional[Dict[str, str]] = None,
        _dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Run an experiment through the wrapped client, applying context defaults.

        Resolution rules applied before delegating to
        ``self.client.run_experiment``:

        - ``data``: the call-site value wins; otherwise the context's ``data``.
        - ``_dataset_version``: the call-site value wins; otherwise the
          context's ``dataset_version``.
        - ``metadata``: the context and call-site dictionaries are merged,
          with call-site keys winning on collision. ``None`` is forwarded
          only when both are ``None``.
        - ``evaluators`` / ``run_evaluators``: ``None`` (the default) is
          forwarded as a fresh empty list.

        ``name``, ``run_name``, ``description``, ``task``,
        ``composite_evaluator`` and ``max_concurrency`` are forwarded
        unchanged.

        Returns:
            Whatever ``self.client.run_experiment`` returns.

        Raises:
            ValueError: If ``data`` is ``None`` on both the call and the
                context.
        """
        resolved_data = data if data is not None else self.data
        if resolved_data is None:
            raise ValueError(
                "`data` must be provided either on the RunnerContext or the run_experiment call"
            )

        resolved_dataset_version = (
            _dataset_version if _dataset_version is not None else self.dataset_version
        )

        merged_metadata: Optional[Dict[str, str]]
        if self.metadata is None and metadata is None:
            merged_metadata = None
        else:
            # Call-site keys are unpacked last so they override context keys.
            merged_metadata = {**(self.metadata or {}), **(metadata or {})}

        return self.client.run_experiment(
            name=name,
            run_name=run_name,
            description=description,
            data=resolved_data,
            task=task,
            # A new list per call: no list object is shared between
            # invocations, unlike a mutable default argument.
            evaluators=[] if evaluators is None else evaluators,
            composite_evaluator=composite_evaluator,
            run_evaluators=[] if run_evaluators is None else run_evaluators,
            max_concurrency=max_concurrency,
            metadata=merged_metadata,
            _dataset_version=resolved_dataset_version,
        )

Wraps Langfuse.run_experiment() with CI-injected defaults.

Intended for use with the langfuse/experiment-action GitHub Action (https://github.com/langfuse/experiment-action). The action builds a RunnerContext before invoking the user's experiment(context) function. Defaults set here (dataset, metadata tags) are applied when the user omits them on the run_experiment() call; users can override any default by passing the corresponding argument explicitly.

RunnerContext( *, client: Langfuse, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem], NoneType] = None, dataset_version: Optional[datetime.datetime] = None, metadata: Optional[Dict[str, str]] = None)
1073    def __init__(
1074        self,
1075        *,
1076        client: "Langfuse",
1077        data: Optional[ExperimentData] = None,
1078        dataset_version: Optional[datetime] = None,
1079        metadata: Optional[Dict[str, str]] = None,
1080    ):
1081        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.
1082
1083        Typically called by the ``langfuse/experiment-action`` GitHub Action,
1084        not by end users directly. Every field except ``client`` is optional:
1085        fields left as ``None`` simply mean the corresponding argument must be
1086        supplied on the :meth:`run_experiment` call.
1087
1088        Args:
1089            client: Initialized Langfuse SDK client used to execute the
1090                experiment. The action creates this from the
1091                ``langfuse_public_key`` / ``langfuse_secret_key`` /
1092                ``langfuse_base_url`` inputs.
1093            data: Default dataset items to run the experiment on. Accepts
1094                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
1095                Injected by the action when ``dataset_name`` is configured.
1096                If ``None``, the user must pass ``data=`` to
1097                :meth:`run_experiment`.
1098            dataset_version: Optional pinned dataset version. Injected by the
1099                action when ``dataset_version`` is configured.
1100            metadata: Default metadata attached to every experiment trace and
1101                the dataset run. The action injects GitHub-sourced tags (SHA,
1102                PR link, workflow run link, branch, GH user, etc.). Merged
1103                with any ``metadata`` passed to :meth:`run_experiment`, with
1104                user-supplied keys winning on collision.
1105        """
1106        self.client = client
1107        self.data = data
1108        self.dataset_version = dataset_version
1109        self.metadata = metadata

Build a RunnerContext populated with defaults for run_experiment.

Typically called by the langfuse/experiment-action GitHub Action, not by end users directly. Every field except client is optional: fields left as None simply mean the corresponding argument must be supplied on the run_experiment() call.

Arguments:
  • client: Initialized Langfuse SDK client used to execute the experiment. The action creates this from the langfuse_public_key / langfuse_secret_key / langfuse_base_url inputs.
  • data: Default dataset items to run the experiment on. Accepts either List[LocalExperimentItem] or List[DatasetItem]. Injected by the action when dataset_name is configured. If None, the user must pass data= to run_experiment().
  • dataset_version: Optional pinned dataset version. Injected by the action when dataset_version is configured.
  • metadata: Default metadata attached to every experiment trace and the dataset run. The action injects GitHub-sourced tags (SHA, PR link, workflow run link, branch, GH user, etc.). Merged with any metadata passed to run_experiment(), with user-supplied keys winning on collision.
client
data
dataset_version
metadata
def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem], NoneType] = None, task: langfuse.experiment.TaskFunction, evaluators: List[langfuse.experiment.EvaluatorFunction] = [], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, run_evaluators: List[langfuse.experiment.RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None, _dataset_version: Optional[datetime.datetime] = None) -> langfuse.experiment.ExperimentResult:
1111    def run_experiment(
1112        self,
1113        *,
1114        name: str,
1115        run_name: Optional[str] = None,
1116        description: Optional[str] = None,
1117        data: Optional[ExperimentData] = None,
1118        task: TaskFunction,
1119        evaluators: List[EvaluatorFunction] = [],
1120        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
1121        run_evaluators: List[RunEvaluatorFunction] = [],
1122        max_concurrency: int = 50,
1123        metadata: Optional[Dict[str, str]] = None,
1124        _dataset_version: Optional[datetime] = None,
1125    ) -> ExperimentResult:
1126        resolved_data = data if data is not None else self.data
1127        if resolved_data is None:
1128            raise ValueError(
1129                "`data` must be provided either on the RunnerContext or the run_experiment call"
1130            )
1131
1132        resolved_dataset_version = (
1133            _dataset_version if _dataset_version is not None else self.dataset_version
1134        )
1135
1136        merged_metadata: Optional[Dict[str, str]]
1137        if self.metadata is None and metadata is None:
1138            merged_metadata = None
1139        else:
1140            merged_metadata = {**(self.metadata or {}), **(metadata or {})}
1141
1142        return self.client.run_experiment(
1143            name=name,
1144            run_name=run_name,
1145            description=description,
1146            data=resolved_data,
1147            task=task,
1148            evaluators=evaluators,
1149            composite_evaluator=composite_evaluator,
1150            run_evaluators=run_evaluators,
1151            max_concurrency=max_concurrency,
1152            metadata=merged_metadata,
1153            _dataset_version=resolved_dataset_version,
1154        )
class RegressionError(builtins.Exception):
class RegressionError(Exception):
    """Raised by a user's ``experiment`` function to signal a CI gate failure.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action catches this
    exception and, when ``should_fail_on_error`` is enabled, fails the
    workflow run and renders a callout in the PR comment using
    ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.

    Callers choose one of three forms:

    - ``RegressionError(result=r)`` — minimal, generic message.
    - ``RegressionError(result=r, message="...")`` — free-form message.
    - ``RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)`` —
      structured; ``metric`` and ``value`` must be provided together so the
      action can render a targeted callout without ``None`` placeholders.
      ``threshold`` is optional and is only mentioned when provided.
    """

    @overload
    def __init__(self, *, result: "ExperimentResult") -> None: ...
    @overload
    def __init__(self, *, result: "ExperimentResult", message: str) -> None: ...
    @overload
    def __init__(
        self,
        *,
        result: "ExperimentResult",
        metric: str,
        value: float,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ) -> None: ...
    def __init__(
        self,
        *,
        result: "ExperimentResult",
        metric: Optional[str] = None,
        value: Optional[float] = None,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ):
        """Store the regression details and build the exception message.

        Message selection, in priority order:

        1. ``message`` when it is not ``None``.
        2. ``"Regression on `<metric>`: <value>"`` when both ``metric`` and
           ``value`` are given, followed by ``" (threshold <threshold>)"``
           only when ``threshold`` is not ``None``.
        3. ``"Experiment regression detected"`` otherwise, including the case
           where only one of ``metric`` / ``value`` is given.

        Args:
            result: Experiment result the regression refers to; stored as
                ``self.result``.
            metric: Name of the regressed metric; stored as ``self.metric``.
            value: Observed metric value; stored as ``self.value``.
            threshold: Threshold the value was compared against; stored as
                ``self.threshold``.
            message: Free-form message that overrides the generated one.
        """
        self.result = result
        self.metric = metric
        self.value = value
        self.threshold = threshold
        if message is not None:
            formatted = message
        elif metric is not None and value is not None:
            formatted = f"Regression on `{metric}`: {value}"
            # Omit the threshold clause instead of printing "(threshold None)".
            if threshold is not None:
                formatted += f" (threshold {threshold})"
        else:
            formatted = "Experiment regression detected"
        super().__init__(formatted)

Raised by a user's experiment function to signal a CI gate failure.

Intended for use with the langfuse/experiment-action GitHub Action (https://github.com/langfuse/experiment-action). The action catches this exception and, when should_fail_on_error is enabled, fails the workflow run and renders a callout in the PR comment using metric/value/threshold if supplied, otherwise str(exc).

Callers choose one of three forms:

  • RegressionError(result=r) — minimal, generic message.
  • RegressionError(result=r, message="...") — free-form message.
  • RegressionError(result=r, metric="acc", value=0.7, threshold=0.9) — structured; metric and value must be provided together so the action can render a targeted callout without None placeholders.
RegressionError( *, result: langfuse.experiment.ExperimentResult, metric: Optional[str] = None, value: Optional[float] = None, threshold: Optional[float] = None, message: Optional[str] = None)
1189    def __init__(
1190        self,
1191        *,
1192        result: ExperimentResult,
1193        metric: Optional[str] = None,
1194        value: Optional[float] = None,
1195        threshold: Optional[float] = None,
1196        message: Optional[str] = None,
1197    ):
1198        self.result = result
1199        self.metric = metric
1200        self.value = value
1201        self.threshold = threshold
1202        if message is not None:
1203            formatted = message
1204        elif metric is not None and value is not None:
1205            formatted = f"Regression on `{metric}`: {value} (threshold {threshold})"
1206        else:
1207            formatted = "Experiment regression detected"
1208        super().__init__(formatted)
result
metric
value
threshold
__version__ = '4.12.0'
def is_default_export_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
 98def is_default_export_span(span: ReadableSpan) -> bool:
 99    """Return whether a span should be exported by default."""
100    return (
101        is_langfuse_span(span) or is_genai_span(span) or is_known_llm_instrumentor(span)
102    )

Return whether a span should be exported by default.

def is_langfuse_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
def is_langfuse_span(span: ReadableSpan) -> bool:
    """Return whether the span was created by the Langfuse SDK tracer.

    True only when the span has an instrumentation scope and that scope's
    name equals ``LANGFUSE_TRACER_NAME``.
    """
    scope = span.instrumentation_scope
    if scope is None:
        return False
    return scope.name == LANGFUSE_TRACER_NAME

Return whether the span was created by the Langfuse SDK tracer.

def is_genai_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
def is_genai_span(span: ReadableSpan) -> bool:
    """Return whether the span has any ``gen_ai.*`` semantic convention attribute.

    The check is a plain prefix match: any string attribute key starting
    with ``"gen_ai"`` counts. Spans without attributes, and non-string keys,
    never match.
    """
    attributes = span.attributes
    if attributes is None:
        return False

    for key in attributes.keys():
        if isinstance(key, str) and key.startswith("gen_ai"):
            return True
    return False

Return whether the span has any gen_ai.* semantic convention attribute.

def is_known_llm_instrumentor(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
def is_known_llm_instrumentor(span: ReadableSpan) -> bool:
    """Return whether the span comes from a known LLM instrumentation scope.

    The span's instrumentation scope name is tested against every entry of
    ``KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES`` with
    ``_matches_scope_prefix``; the first match wins. A span without an
    instrumentation scope never matches.
    """
    scope = span.instrumentation_scope
    if scope is None:
        return False

    name = scope.name
    for known_prefix in KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES:
        if _matches_scope_prefix(name, known_prefix):
            return True
    return False

Return whether the span comes from a known LLM instrumentation scope.

# Instrumentation scope name prefixes that is_known_llm_instrumentor() tests a
# span's scope name against (via _matches_scope_prefix). This is a frozenset,
# so the element order shown here carries no meaning.
KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES = frozenset({'opentelemetry.instrumentation.together', 'haystack', 'opentelemetry.instrumentation.watsonx', 'opentelemetry.instrumentation.mistralai', 'opentelemetry.instrumentation.replicate', 'opentelemetry.instrumentation.google_generativeai', 'opentelemetry.instrumentation.cohere', 'opentelemetry.instrumentation.sagemaker', 'opentelemetry.instrumentation.langchain', 'pydantic-ai', 'langfuse-sdk', 'langsmith', 'opentelemetry.instrumentation.agno', 'opentelemetry.instrumentation.bedrock', 'vllm', 'opentelemetry.instrumentation.openai_v2', 'strands-agents', 'autogen-core', 'opentelemetry.instrumentation.anthropic', 'opentelemetry.instrumentation.haystack', 'opentelemetry.instrumentation.writer', 'openinference', 'opentelemetry.instrumentation.llamaindex', 'opentelemetry.instrumentation.crewai', 'opentelemetry.instrumentation.openai_agents', 'opentelemetry.instrumentation.groq', 'ai', 'agent_framework', 'opentelemetry.instrumentation.openai', 'opentelemetry.instrumentation.transformers', 'opentelemetry.instrumentation.vertexai', 'opentelemetry.instrumentation.ollama', 'opentelemetry.instrumentation.alephalpha', 'opentelemetry.instrumentation.voyageai', 'litellm'})
class MaskOtelSpansFunction(typing.Protocol):
class MaskOtelSpansFunction(Protocol):
    """Function protocol for export-stage OpenTelemetry span masking.

    `mask_otel_spans` runs after Langfuse decides which spans this client should
    export and after export-stage media handling has converted supported media
    payloads into Langfuse media references. It affects only the spans exported
    by this Langfuse client. If the same OpenTelemetry spans are sent to another
    exporter, that exporter receives its own unmodified copy.

    The function is synchronous. It usually runs on the OpenTelemetry batch span
    processor worker thread; during `flush()` and shutdown it may run on the
    caller thread. Keep it deterministic and fast, and avoid relying on request
    locals, the current active span, or async I/O.

    Return `None` to leave the whole batch unchanged, or return
    `MaskOtelSpansResult` with sparse patches for the spans that should change.

    Example:
        ```python
        from typing import Optional

        from langfuse import Langfuse
        from langfuse.types import (
            MaskOtelSpansParams,
            MaskOtelSpansResult,
            OtelSpanPatch,
        )

        def mask_otel_spans(
            *, params: MaskOtelSpansParams
        ) -> Optional[MaskOtelSpansResult]:
            patches = {}

            for identifier, span in params.spans.items():
                if span.instrumentation_scope_name == "openai":
                    patches[identifier] = OtelSpanPatch(
                        delete_attributes=(
                            "gen_ai.prompt.0.content",
                            "gen_ai.completion.0.content",
                        ),
                        set_attributes={"masking.applied": True},
                    )

            return MaskOtelSpansResult(span_patches=patches)

        langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
        ```
    """

    # Structural call signature: `params` is keyword-only, and the return is
    # either a MaskOtelSpansResult or None.
    def __call__(
        self, *, params: MaskOtelSpansParams
    ) -> Optional[MaskOtelSpansResult]: ...

Function protocol for export-stage OpenTelemetry span masking.

mask_otel_spans runs after Langfuse decides which spans this client should export and after export-stage media handling has converted supported media payloads into Langfuse media references. It affects only the spans exported by this Langfuse client. If the same OpenTelemetry spans are sent to another exporter, that exporter receives its own unmodified copy.

The function is synchronous. It usually runs on the OpenTelemetry batch span processor worker thread; during flush() and shutdown it may run on the caller thread. Keep it deterministic and fast, and avoid relying on request locals, the current active span, or async I/O.

Return None to leave the whole batch unchanged, or return MaskOtelSpansResult with sparse patches for the spans that should change.

Example:
from typing import Optional

from langfuse import Langfuse
from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)

def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}

    for identifier, span in params.spans.items():
        if span.instrumentation_scope_name == "openai":
            patches[identifier] = OtelSpanPatch(
                delete_attributes=(
                    "gen_ai.prompt.0.content",
                    "gen_ai.completion.0.content",
                ),
                set_attributes={"masking.applied": True},
            )

    return MaskOtelSpansResult(span_patches=patches)

langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
MaskOtelSpansFunction(*args, **kwargs)
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` that rejects protocols and self-replaces on subclasses.

    Raises ``TypeError`` when the instance's class is itself a protocol
    (``cls._is_protocol``). For any other class that still has this function
    as its ``__init__``, it walks the MRO, installs the first real
    ``__init__`` it finds on the class, and then calls it with the original
    arguments.
    """
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        # Bases that do not define `__init__` in their own namespace are
        # treated like the placeholder and skipped.
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    cls.__init__(self, *args, **kwargs)
@dataclass(frozen=True)
class MaskOtelSpansParams:
@dataclass(frozen=True)
class MaskOtelSpansParams:
    """Input passed to an export-stage OpenTelemetry span masking function.

    A single call receives one OpenTelemetry export batch, not necessarily a
    complete trace, request, or Langfuse observation tree. Batch contents depend
    on OpenTelemetry span processor settings such as `flush_at`,
    `flush_interval`, explicit `flush()`, and shutdown.

    Example:
        ```python
        from typing import Optional

        from langfuse.types import (
            MaskOtelSpansParams,
            MaskOtelSpansResult,
            OtelSpanPatch,
        )

        def mask_otel_spans(
            *, params: MaskOtelSpansParams
        ) -> Optional[MaskOtelSpansResult]:
            patches = {}

            for identifier, span in params.spans.items():
                if "http.request.header.authorization" in span.attributes:
                    patches[identifier] = OtelSpanPatch(
                        delete_attributes=("http.request.header.authorization",),
                        set_attributes={"security.redacted": True},
                    )

            return MaskOtelSpansResult(span_patches=patches)
        ```

    Attributes:
        spans: Read-only mapping from stable span identifiers to span snapshots.
            Return patches using keys from this mapping.
    """

    # The batch being exported: one OtelSpanData snapshot per identifier.
    spans: Mapping[OtelSpanIdentifier, OtelSpanData]

Input passed to an export-stage OpenTelemetry span masking function.

A single call receives one OpenTelemetry export batch, not necessarily a complete trace, request, or Langfuse observation tree. Batch contents depend on OpenTelemetry span processor settings such as flush_at, flush_interval, explicit flush(), and shutdown.

Example:
from typing import Optional

from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)

def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}

    for identifier, span in params.spans.items():
        if "http.request.header.authorization" in span.attributes:
            patches[identifier] = OtelSpanPatch(
                delete_attributes=("http.request.header.authorization",),
                set_attributes={"security.redacted": True},
            )

    return MaskOtelSpansResult(span_patches=patches)
Attributes:
  • spans: Read-only mapping from stable span identifiers to span snapshots. Return patches using keys from this mapping.
MaskOtelSpansParams( spans: Mapping[OtelSpanIdentifier, OtelSpanData])
spans: Mapping[OtelSpanIdentifier, OtelSpanData]
@dataclass(frozen=True)
class MaskOtelSpansResult:
@dataclass(frozen=True)
class MaskOtelSpansResult:
    """Patches returned by a `mask_otel_spans` function.

    Omit spans that do not need changes. A mapping value of `None` also leaves
    that span unchanged. Returning an invalid patch to drop a span is not a
    supported API; use `should_export_span` when you need span-level export
    filtering.

    If `mask_otel_spans` raises or returns an object that is not a
    `MaskOtelSpansResult`, Langfuse drops the whole export batch. If one
    individual `OtelSpanPatch` is invalid, Langfuse drops only that span from
    the export batch.

    Attributes:
        span_patches: Mapping from identifiers in `MaskOtelSpansParams.spans` to
            sparse attribute patches.
    """

    # Defaults to an empty read-only mapping (a MappingProxyType over a fresh
    # dict per instance), so `MaskOtelSpansResult()` carries no patches.
    span_patches: Mapping[OtelSpanIdentifier, Optional[OtelSpanPatch]] = field(
        default_factory=lambda: MappingProxyType({})
    )

Patches returned by a mask_otel_spans function.

Omit spans that do not need changes. A mapping value of None also leaves that span unchanged. Returning an invalid patch to drop a span is not a supported API; use should_export_span when you need span-level export filtering.

If mask_otel_spans raises or returns an object that is not a MaskOtelSpansResult, Langfuse drops the whole export batch. If one individual OtelSpanPatch is invalid, Langfuse drops only that span from the export batch.

Attributes:
  • span_patches: Mapping from identifiers in MaskOtelSpansParams.spans to sparse attribute patches.
MaskOtelSpansResult( span_patches: Mapping[OtelSpanIdentifier, Optional[OtelSpanPatch]] = <factory>)
span_patches: Mapping[OtelSpanIdentifier, Optional[OtelSpanPatch]]
@dataclass(frozen=True)
class OtelSpanData:
@dataclass(frozen=True)
class OtelSpanData:
    """Read-only OpenTelemetry span snapshot passed to `mask_otel_spans`.

    The snapshot contains the span data that Langfuse is about to export after
    the SDK has applied `should_export_span` filtering and export-stage media
    processing. The mappings are immutable views and mutating them is not
    supported; return an `OtelSpanPatch` to change exported attributes.

    `mask_otel_spans` can only change span attributes. It cannot change the
    span name, IDs, parent relationship, resource attributes, events, links, or
    instrumentation scope.

    Attributes:
        trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
        span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
        parent_span_id: Lowercase hexadecimal parent span ID, or `None` for a
            root span or when the parent is not available.
        name: OpenTelemetry span name.
        instrumentation_scope_name: Name of the instrumentation scope that
            emitted the span, for example `openai` or `langfuse`.
        instrumentation_scope_version: Version of the instrumentation scope, if
            the instrumentation library provided one.
        attributes: Read-only attributes that will be exported unless patched.
            Values use OpenTelemetry `AttributeValue` types: strings, booleans,
            numbers, or homogeneous sequences of those scalar values.
        resource_attributes: Read-only resource attributes from the span's
            OpenTelemetry resource. These are available for decisions only and
            cannot be patched through `mask_otel_spans`.
    """

    # Identity and position in the trace.
    trace_id: str
    span_id: str
    parent_span_id: Optional[str]
    name: str
    # Instrumentation scope that emitted the span.
    instrumentation_scope_name: Optional[str]
    instrumentation_scope_version: Optional[str]
    # Attribute snapshots; only `attributes` can be changed, via OtelSpanPatch.
    attributes: Mapping[str, AttributeValue]
    resource_attributes: Mapping[str, AttributeValue]

Read-only OpenTelemetry span snapshot passed to mask_otel_spans.

The snapshot contains the span data that Langfuse is about to export after the SDK has applied should_export_span filtering and export-stage media processing. The mappings are immutable views and mutating them is not supported; return an OtelSpanPatch to change exported attributes.

mask_otel_spans can only change span attributes. It cannot change the span name, IDs, parent relationship, resource attributes, events, links, or instrumentation scope.

Attributes:
  • trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
  • span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
  • parent_span_id: Lowercase hexadecimal parent span ID, or None for a root span or when the parent is not available.
  • name: OpenTelemetry span name.
  • instrumentation_scope_name: Name of the instrumentation scope that emitted the span, for example openai or langfuse.
  • instrumentation_scope_version: Version of the instrumentation scope, if the instrumentation library provided one.
  • attributes: Read-only attributes that will be exported unless patched. Values use OpenTelemetry AttributeValue types: strings, booleans, numbers, or homogeneous sequences of those scalar values.
  • resource_attributes: Read-only resource attributes from the span's OpenTelemetry resource. These are available for decisions only and cannot be patched through mask_otel_spans.
OtelSpanData( trace_id: str, span_id: str, parent_span_id: Optional[str], name: str, instrumentation_scope_name: Optional[str], instrumentation_scope_version: Optional[str], attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]], resource_attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]])
trace_id: str
span_id: str
parent_span_id: Optional[str]
name: str
instrumentation_scope_name: Optional[str]
instrumentation_scope_version: Optional[str]
attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]]
resource_attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]]
@dataclass(frozen=True)
class OtelSpanIdentifier:
@dataclass(frozen=True)
class OtelSpanIdentifier:
    """Identify a single OpenTelemetry span within one masking batch.

    Instances serve as the keys under which per-span patches are returned.
    Because the dataclass is frozen it is hashable and compares by value;
    even so, the recommended approach is to hand back the very identifier
    object received via `MaskOtelSpansParams.spans` rather than constructing
    an equivalent one.

    Attributes:
        trace_id: OpenTelemetry trace ID as 32 lowercase hexadecimal
            characters.
        span_id: OpenTelemetry span ID as 16 lowercase hexadecimal
            characters.
    """

    trace_id: str
    span_id: str

Stable key for one OpenTelemetry span in a masking batch.

Use this object as the key when returning a patch for a span. It is a frozen, hashable dataclass, so the safest pattern is to reuse the exact identifier object from MaskOtelSpansParams.spans instead of rebuilding it.

Attributes:
  • trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
  • span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
OtelSpanIdentifier(trace_id: str, span_id: str)
trace_id: str
span_id: str
@dataclass(frozen=True)
class OtelSpanPatch:
@dataclass(frozen=True)
class OtelSpanPatch:
    """Describe how one span's attributes should change prior to export.

    A patch is sparse, so only attributes that need to change are listed.
    Removal happens before assignment: Langfuse first drops every key in
    `delete_attributes` and afterwards writes `set_attributes`. A key that
    appears in both therefore ends up exported with its `set_attributes`
    value.

    Every value has to be a valid OpenTelemetry attribute value, i.e. a
    string, boolean, integer, float, or a homogeneous sequence of one of
    those scalar types. When a value does not satisfy this, Langfuse leaves
    that attribute out of the export instead of emitting an invalid span.

    Example:
        ```python
        OtelSpanPatch(
            delete_attributes=("gen_ai.prompt.0.content",),
            set_attributes={
                "gen_ai.prompt.redacted": True,
                "app.masking.rule": "drop_prompt_text",
            },
        )
        ```

    Attributes:
        set_attributes: Values to add to, or overwrite on, the exported span.
        delete_attributes: Keys to strip from the exported span.
    """

    # Default: a fresh read-only view over an empty dict for each instance.
    set_attributes: Mapping[str, AttributeValue] = field(
        default_factory=lambda: MappingProxyType({}),
    )
    # Default: an empty tuple.
    delete_attributes: Sequence[str] = field(default_factory=tuple)

Attribute changes to apply to one OpenTelemetry span before export.

Patches are sparse: include only the attributes that should change. Langfuse deletes delete_attributes first and then applies set_attributes, so a key present in both fields is exported with the value from set_attributes.

Attribute values must be valid OpenTelemetry attributes: strings, booleans, integers, floats, or homogeneous sequences of those scalar types. If one value is not valid for OpenTelemetry, Langfuse removes that attribute from the export rather than sending an invalid span.

Example:
OtelSpanPatch(
    delete_attributes=("gen_ai.prompt.0.content",),
    set_attributes={
        "gen_ai.prompt.redacted": True,
        "app.masking.rule": "drop_prompt_text",
    },
)
Attributes:
  • set_attributes: Attribute values to add or replace on the exported span.
  • delete_attributes: Attribute keys to remove from the exported span.
OtelSpanPatch( set_attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]] = <factory>, delete_attributes: Sequence[str] = <factory>)
set_attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]]
delete_attributes: Sequence[str]