langfuse

 1""".. include:: ../README.md"""
 2
 3from langfuse.batch_evaluation import (
 4    BatchEvaluationResult,
 5    BatchEvaluationResumeToken,
 6    CompositeEvaluatorFunction,
 7    EvaluatorInputs,
 8    EvaluatorStats,
 9    MapperFunction,
10)
11from langfuse.experiment import Evaluation, RegressionError, RunnerContext
12
13from ._client import client as _client_module
14from ._client.attributes import LangfuseOtelSpanAttributes
15from ._client.constants import ObservationTypeLiteral
16from ._client.get_client import get_client
17from ._client.observe import observe
18from ._client.propagation import propagate_attributes
19from ._client.span import (
20    LangfuseAgent,
21    LangfuseChain,
22    LangfuseEmbedding,
23    LangfuseEvaluator,
24    LangfuseEvent,
25    LangfuseGeneration,
26    LangfuseGuardrail,
27    LangfuseRetriever,
28    LangfuseSpan,
29    LangfuseTool,
30)
31from ._version import __version__
32from .media import LangfuseMedia, LangfuseMediaReference
33from .span_filter import (
34    KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES,
35    is_default_export_span,
36    is_genai_span,
37    is_known_llm_instrumentor,
38    is_langfuse_span,
39)
40from .types import (
41    MaskOtelSpansFunction,
42    MaskOtelSpansParams,
43    MaskOtelSpansResult,
44    OtelSpanData,
45    OtelSpanIdentifier,
46    OtelSpanPatch,
47)
48
# Re-export the client class at the package root so that
# `from langfuse import Langfuse` resolves to `_client.client.Langfuse`.
Langfuse = _client_module.Langfuse

# Public names of the package, i.e. what `from langfuse import *` binds.
__all__ = [
    "Langfuse",
    "LangfuseMedia",
    "LangfuseMediaReference",
    "get_client",
    "observe",
    "propagate_attributes",
    "ObservationTypeLiteral",
    "LangfuseSpan",
    "LangfuseGeneration",
    "LangfuseEvent",
    "LangfuseOtelSpanAttributes",
    "LangfuseAgent",
    "LangfuseTool",
    "LangfuseChain",
    "LangfuseEmbedding",
    "LangfuseEvaluator",
    "LangfuseRetriever",
    "LangfuseGuardrail",
    "Evaluation",
    "EvaluatorInputs",
    "MapperFunction",
    "CompositeEvaluatorFunction",
    "EvaluatorStats",
    "BatchEvaluationResumeToken",
    "BatchEvaluationResult",
    "RunnerContext",
    "RegressionError",
    "__version__",
    "is_default_export_span",
    "is_langfuse_span",
    "is_genai_span",
    "is_known_llm_instrumentor",
    "KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES",
    "MaskOtelSpansFunction",
    "MaskOtelSpansParams",
    "MaskOtelSpansResult",
    "OtelSpanData",
    "OtelSpanIdentifier",
    "OtelSpanPatch",
    # NOTE(review): "experiment" and "api" are not bound by a name import in this
    # module; presumably they resolve as submodules of the package - confirm.
    "experiment",
    "api",
]
class Langfuse:
 156class Langfuse:
 157    """Main client for Langfuse tracing and platform features.
 158
 159    This class provides an interface for creating and managing traces, spans,
 160    and generations in Langfuse as well as interacting with the Langfuse API.
 161
 162    The client features a thread-safe singleton pattern for each unique public API key,
 163    ensuring consistent trace context propagation across your application. It implements
 164    efficient batching of spans with configurable flush settings and includes background
 165    thread management for media uploads and score ingestion.
 166
 167    Configuration is flexible through either direct parameters or environment variables,
 168    with graceful fallbacks and runtime configuration updates.
 169
 170    Attributes:
 171        api: Synchronous API client for Langfuse backend communication
 172        async_api: Asynchronous API client for Langfuse backend communication
 173        _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
 174
 175    Parameters:
 176        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
 177        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
 178        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
 179        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
 180        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
 181        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
 182            **Fork safety**: ``httpx.Client`` is thread-safe but not process-safe. When using
 183            ``fork()``-based servers (e.g. Gunicorn with ``--preload``), the SDK automatically
 184            recreates its internally-managed HTTP client in child processes after fork. A custom
 185            ``httpx_client`` is intentionally left as-is (the fork-inherited copy is reused), so
 186            you retain the opportunity to handle process-safety yourself — for example by
 187            registering your own ``os.register_at_fork(after_in_child=...)`` handler to close and
 188            reopen connections on the custom client.
 189        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
 190        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
 191        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
 192        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
 193        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
 194        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
 195        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
 196        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
 197        mask (Optional[MaskFunction]): Function to mask sensitive data synchronously when Langfuse SDK attributes are created. This applies only to data set through Langfuse SDK APIs such as `start_observation()`, `update()`, and `set_trace_io()`.
 198        mask_otel_spans (Optional[MaskOtelSpansFunction]): Synchronous export-stage hook for masking raw OpenTelemetry span attributes before this Langfuse client sends them to Langfuse. Use this for spans created by third-party OpenTelemetry instrumentations, or when you need to inspect final span attributes after export filtering and Langfuse media handling. It does not modify spans already exported through other OpenTelemetry exporters.
 199
 200            The hook receives one OpenTelemetry export batch. A batch is not guaranteed to contain a complete trace, request, or Langfuse observation tree. The hook usually runs on the OpenTelemetry batch span processor worker thread; during `flush()` and shutdown it may run on the caller thread. Keep it synchronous, deterministic, and fast.
 201
 202            Return `None` to leave the batch unchanged. Return `MaskOtelSpansResult` with `OtelSpanPatch` values to delete or replace attributes on selected spans. If the hook raises or returns an invalid batch result, Langfuse drops the whole export batch. If one returned span patch is invalid, Langfuse drops only that span from the Langfuse export.
 203
 204            Example:
 205                ```python
 206                from typing import Optional
 207
 208                from langfuse import Langfuse
 209                from langfuse.types import (
 210                    MaskOtelSpansParams,
 211                    MaskOtelSpansResult,
 212                    OtelSpanPatch,
 213                )
 214
 215                def mask_otel_spans(
 216                    *, params: MaskOtelSpansParams
 217                ) -> Optional[MaskOtelSpansResult]:
 218                    patches = {}
 219
 220                    for identifier, span in params.spans.items():
 221                        if "gen_ai.prompt.0.content" in span.attributes:
 222                            patches[identifier] = OtelSpanPatch(
 223                                delete_attributes=("gen_ai.prompt.0.content",),
 224                                set_attributes={"masking.applied": True},
 225                            )
 226
 227                    return MaskOtelSpansResult(span_patches=patches)
 228
 229                langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
 230                ```
 231        blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:
 232            ```python
 233            from langfuse.span_filter import is_default_export_span
 234            blocked = {"sqlite", "requests"}
 235
 236            should_export_span = lambda span: (
 237                is_default_export_span(span)
 238                and (
 239                    span.instrumentation_scope is None
 240                    or span.instrumentation_scope.name not in blocked
 241                )
 242            )
 243            ```
 244        should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
 245        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If `span_exporter` is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
 246        tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
 247        id_generator (Optional[IdGenerator]): OpenTelemetry ID generator to use when Langfuse creates its own TracerProvider. If omitted, the OpenTelemetry SDK default is used. If `tracer_provider` is provided, or an OpenTelemetry TracerProvider is already registered globally, configure the ID generator on that provider instead.
 248        span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire `base_url`, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include `x-langfuse-ingestion-version=4` on the exporter to enable real time processing of exported spans.
 249
 250    Example:
 251        ```python
 252        from langfuse import Langfuse
 253
 254        # Initialize the client (reads from env vars if not provided)
 255        langfuse = Langfuse(
 256            public_key="your-public-key",
 257            secret_key="your-secret-key",
 258            base_url="https://cloud.langfuse.com",  # Optional, default shown
 259        )
 260
 261        # Create a trace span
 262        with langfuse.start_as_current_observation(name="process-query") as span:
 263            # Your application code here
 264
 265            # Create a nested generation span for an LLM call
 266            with span.start_as_current_generation(
 267                name="generate-response",
 268                model="gpt-4",
 269                input={"query": "Tell me about AI"},
 270                model_parameters={"temperature": 0.7, "max_tokens": 500}
 271            ) as generation:
 272                # Generate response here
 273                response = "AI is a field of computer science..."
 274
 275                generation.update(
 276                    output=response,
 277                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
 278                    cost_details={"total_cost": 0.0023}
 279                )
 280
 281                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
 282                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
 283        ```
 284    """
 285
    # Resource manager created in __init__; stays None when __init__ returns
    # early because no public or secret key could be resolved.
    _resources: Optional[LangfuseResourceManager] = None
    # Masking callable; __init__ copies it from the resource manager.
    _mask: Optional[MaskFunction] = None
    # Tracer used to start spans; __init__ assigns either the resource manager's
    # tracer or an OpenTelemetry NoOpTracer.
    _otel_tracer: otel_trace_api.Tracer
 289
 290    def __init__(
 291        self,
 292        *,
 293        public_key: Optional[str] = None,
 294        secret_key: Optional[str] = None,
 295        base_url: Optional[str] = None,
 296        host: Optional[str] = None,
 297        timeout: Optional[int] = None,
 298        httpx_client: Optional[httpx.Client] = None,
 299        debug: bool = False,
 300        tracing_enabled: Optional[bool] = True,
 301        flush_at: Optional[int] = None,
 302        flush_interval: Optional[float] = None,
 303        environment: Optional[str] = None,
 304        release: Optional[str] = None,
 305        media_upload_thread_count: Optional[int] = None,
 306        sample_rate: Optional[float] = None,
 307        mask: Optional[MaskFunction] = None,
 308        mask_otel_spans: Optional[MaskOtelSpansFunction] = None,
 309        blocked_instrumentation_scopes: Optional[List[str]] = None,
 310        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
 311        additional_headers: Optional[Dict[str, str]] = None,
 312        tracer_provider: Optional[TracerProvider] = None,
 313        id_generator: Optional[IdGenerator] = None,
 314        span_exporter: Optional[SpanExporter] = None,
 315    ):
 316        self._base_url = (
 317            base_url
 318            or os.environ.get(LANGFUSE_BASE_URL)
 319            or host
 320            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
 321        )
 322        self._environment = environment or cast(
 323            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
 324        )
 325        self._release = (
 326            release
 327            or os.environ.get(LANGFUSE_RELEASE, None)
 328            or get_common_release_envs()
 329        )
 330        self._project_id: Optional[str] = None
 331        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
 332        if not 0.0 <= sample_rate <= 1.0:
 333            raise ValueError(
 334                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
 335            )
 336
 337        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
 338
 339        self._tracing_enabled = (
 340            tracing_enabled
 341            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
 342        )
 343        if not self._tracing_enabled:
 344            langfuse_logger.info(
 345                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
 346            )
 347
 348        debug = (
 349            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
 350        )
 351        if debug:
 352            logging.basicConfig(
 353                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 354            )
 355            langfuse_logger.setLevel(logging.DEBUG)
 356
 357        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
 358        if public_key is None:
 359            langfuse_logger.warning(
 360                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
 361                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
 362            )
 363            self._otel_tracer = otel_trace_api.NoOpTracer()
 364            return
 365
 366        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
 367        if secret_key is None:
 368            langfuse_logger.warning(
 369                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
 370                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
 371            )
 372            self._otel_tracer = otel_trace_api.NoOpTracer()
 373            return
 374
 375        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
 376            langfuse_logger.warning(
 377                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
 378            )
 379
 380        if blocked_instrumentation_scopes is not None:
 381            warnings.warn(
 382                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
 383                "Use `should_export_span` instead. Example: "
 384                "from langfuse.span_filter import is_default_export_span; "
 385                'blocked={"scope"}; should_export_span=lambda span: '
 386                "is_default_export_span(span) and (span.instrumentation_scope is None or "
 387                "span.instrumentation_scope.name not in blocked).",
 388                DeprecationWarning,
 389                stacklevel=2,
 390            )
 391
 392        # Initialize api and tracer if requirements are met
 393        self._resources = LangfuseResourceManager(
 394            public_key=public_key,
 395            secret_key=secret_key,
 396            base_url=self._base_url,
 397            timeout=timeout,
 398            environment=self._environment,
 399            release=release,
 400            flush_at=flush_at,
 401            flush_interval=flush_interval,
 402            httpx_client=httpx_client,
 403            media_upload_thread_count=media_upload_thread_count,
 404            sample_rate=sample_rate,
 405            mask=mask,
 406            mask_otel_spans=mask_otel_spans,
 407            tracing_enabled=self._tracing_enabled,
 408            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
 409            should_export_span=should_export_span,
 410            additional_headers=additional_headers,
 411            tracer_provider=tracer_provider,
 412            id_generator=id_generator,
 413            span_exporter=span_exporter,
 414        )
 415        self._mask = self._resources.mask
 416
 417        self._otel_tracer = (
 418            self._resources.tracer
 419            if self._tracing_enabled and self._resources.tracer is not None
 420            else otel_trace_api.NoOpTracer()
 421        )
 422
 423    @property
 424    def api(self) -> LangfuseAPI:
 425        if self._resources is None:
 426            raise AttributeError("Langfuse client is not initialized")
 427
 428        return self._resources.api
 429
 430    @api.setter
 431    def api(self, value: LangfuseAPI) -> None:
 432        if self._resources is None:
 433            raise AttributeError("Langfuse client is not initialized")
 434
 435        self._resources.api = value
 436
 437    @property
 438    def async_api(self) -> AsyncLangfuseAPI:
 439        if self._resources is None:
 440            raise AttributeError("Langfuse client is not initialized")
 441
 442        return self._resources.async_api
 443
 444    @async_api.setter
 445    def async_api(self, value: AsyncLangfuseAPI) -> None:
 446        if self._resources is None:
 447            raise AttributeError("Langfuse client is not initialized")
 448
 449        self._resources.async_api = value
 450
    # Typing-only overload: as_type="generation" is typed to return
    # LangfuseGeneration and additionally accepts completion_start_time, model,
    # model_parameters, usage_details, cost_details and prompt.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseGeneration: ...
 471
    # Typing-only overload: as_type="span" (also the default when as_type is
    # omitted) is typed to return LangfuseSpan.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseSpan: ...
 486
    # Typing-only overload: as_type="agent" is typed to return LangfuseAgent.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseAgent: ...
 501
    # Typing-only overload: as_type="tool" is typed to return LangfuseTool.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["tool"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseTool: ...
 516
    # Typing-only overload: as_type="chain" is typed to return LangfuseChain.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseChain: ...
 531
    # Typing-only overload: as_type="retriever" is typed to return
    # LangfuseRetriever.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseRetriever: ...
 546
    # Typing-only overload: as_type="evaluator" is typed to return
    # LangfuseEvaluator.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseEvaluator: ...
 561
    # Typing-only overload: as_type="embedding" is typed to return
    # LangfuseEmbedding and accepts the same extra fields as the "generation"
    # overload (completion_start_time, model, model_parameters, usage_details,
    # cost_details, prompt).
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseEmbedding: ...
 582
    # Typing-only overload: as_type="guardrail" is typed to return
    # LangfuseGuardrail.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseGuardrail: ...
 597
 598    def start_observation(
 599        self,
 600        *,
 601        trace_context: Optional[TraceContext] = None,
 602        name: str,
 603        as_type: ObservationTypeLiteralNoEvent = "span",
 604        input: Optional[Any] = None,
 605        output: Optional[Any] = None,
 606        metadata: Optional[Any] = None,
 607        version: Optional[str] = None,
 608        level: Optional[SpanLevel] = None,
 609        status_message: Optional[str] = None,
 610        completion_start_time: Optional[datetime] = None,
 611        model: Optional[str] = None,
 612        model_parameters: Optional[Dict[str, MapValue]] = None,
 613        usage_details: Optional[Dict[str, int]] = None,
 614        cost_details: Optional[Dict[str, float]] = None,
 615        prompt: Optional[PromptClient] = None,
 616    ) -> Union[
 617        LangfuseSpan,
 618        LangfuseGeneration,
 619        LangfuseAgent,
 620        LangfuseTool,
 621        LangfuseChain,
 622        LangfuseRetriever,
 623        LangfuseEvaluator,
 624        LangfuseEmbedding,
 625        LangfuseGuardrail,
 626    ]:
 627        """Create a new observation of the specified type.
 628
 629        This method creates a new observation but does not set it as the current span in the
 630        context. To create and use an observation within a context, use start_as_current_observation().
 631
 632        Args:
 633            trace_context: Optional context for connecting to an existing trace
 634            name: Name of the observation
 635            as_type: Type of observation to create (defaults to "span")
 636            input: Input data for the operation
 637            output: Output data from the operation
 638            metadata: Additional metadata to associate with the observation
 639            version: Version identifier for the code or component
 640            level: Importance level of the observation
 641            status_message: Optional status message for the observation
 642            completion_start_time: When the model started generating (for generation types)
 643            model: Name/identifier of the AI model used (for generation types)
 644            model_parameters: Parameters used for the model (for generation types)
 645            usage_details: Token usage information (for generation types)
 646            cost_details: Cost information (for generation types)
 647            prompt: Associated prompt template (for generation types)
 648
 649        Returns:
 650            An observation object of the appropriate type that must be ended with .end()
 651        """
 652        if trace_context:
 653            trace_id = trace_context.get("trace_id", None)
 654            parent_span_id = trace_context.get("parent_span_id", None)
 655
 656            if trace_id:
 657                remote_parent_span = self._create_remote_parent_span(
 658                    trace_id=trace_id, parent_span_id=parent_span_id
 659                )
 660
 661                with otel_trace_api.use_span(
 662                    cast(otel_trace_api.Span, remote_parent_span)
 663                ):
 664                    otel_span = self._otel_tracer.start_span(name=name)
 665                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
 666
 667                    return self._create_observation_from_otel_span(
 668                        otel_span=otel_span,
 669                        as_type=as_type,
 670                        input=input,
 671                        output=output,
 672                        metadata=metadata,
 673                        version=version,
 674                        level=level,
 675                        status_message=status_message,
 676                        completion_start_time=completion_start_time,
 677                        model=model,
 678                        model_parameters=model_parameters,
 679                        usage_details=usage_details,
 680                        cost_details=cost_details,
 681                        prompt=prompt,
 682                    )
 683
 684        otel_span = self._otel_tracer.start_span(name=name)
 685
 686        return self._create_observation_from_otel_span(
 687            otel_span=otel_span,
 688            as_type=as_type,
 689            input=input,
 690            output=output,
 691            metadata=metadata,
 692            version=version,
 693            level=level,
 694            status_message=status_message,
 695            completion_start_time=completion_start_time,
 696            model=model,
 697            model_parameters=model_parameters,
 698            usage_details=usage_details,
 699            cost_details=cost_details,
 700            prompt=prompt,
 701        )
 702
 703    def _create_observation_from_otel_span(
 704        self,
 705        *,
 706        otel_span: otel_trace_api.Span,
 707        as_type: ObservationTypeLiteralNoEvent,
 708        input: Optional[Any] = None,
 709        output: Optional[Any] = None,
 710        metadata: Optional[Any] = None,
 711        version: Optional[str] = None,
 712        level: Optional[SpanLevel] = None,
 713        status_message: Optional[str] = None,
 714        completion_start_time: Optional[datetime] = None,
 715        model: Optional[str] = None,
 716        model_parameters: Optional[Dict[str, MapValue]] = None,
 717        usage_details: Optional[Dict[str, int]] = None,
 718        cost_details: Optional[Dict[str, float]] = None,
 719        prompt: Optional[PromptClient] = None,
 720    ) -> Union[
 721        LangfuseSpan,
 722        LangfuseGeneration,
 723        LangfuseAgent,
 724        LangfuseTool,
 725        LangfuseChain,
 726        LangfuseRetriever,
 727        LangfuseEvaluator,
 728        LangfuseEmbedding,
 729        LangfuseGuardrail,
 730    ]:
 731        """Create the appropriate observation type from an OTEL span."""
 732        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 733            observation_class = self._get_span_class(as_type)
 734            # Type ignore to prevent overloads of internal _get_span_class function,
 735            # issue is that LangfuseEvent could be returned and that classes have diff. args
 736            return observation_class(  # type: ignore[return-value,call-arg]
 737                otel_span=otel_span,
 738                langfuse_client=self,
 739                environment=self._environment,
 740                release=self._release,
 741                input=input,
 742                output=output,
 743                metadata=metadata,
 744                version=version,
 745                level=level,
 746                status_message=status_message,
 747                completion_start_time=completion_start_time,
 748                model=model,
 749                model_parameters=model_parameters,
 750                usage_details=usage_details,
 751                cost_details=cost_details,
 752                prompt=prompt,
 753            )
 754        else:
 755            # For other types (e.g. span, guardrail), create appropriate class without generation properties
 756            observation_class = self._get_span_class(as_type)
 757            # Type ignore to prevent overloads of internal _get_span_class function,
 758            # issue is that LangfuseEvent could be returned and that classes have diff. args
 759            return observation_class(  # type: ignore[return-value,call-arg]
 760                otel_span=otel_span,
 761                langfuse_client=self,
 762                environment=self._environment,
 763                release=self._release,
 764                input=input,
 765                output=output,
 766                metadata=metadata,
 767                version=version,
 768                level=level,
 769                status_message=status_message,
 770            )
 771            # span._observation_type = as_type
 772            # span._otel_span.set_attribute("langfuse.observation.type", as_type)
 773            # return span
 774
    # Typing overloads for start_as_current_observation. Each overload pins a
    # literal `as_type` to the concrete observation class yielded by the returned
    # context manager, so type checkers can narrow the result. Only the
    # "generation" and "embedding" overloads expose the model-related parameters.

    # as_type="generation" -> LangfuseGeneration (accepts model-related parameters).
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGeneration]: ...

    # as_type="span" (the default when as_type is omitted) -> LangfuseSpan.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseSpan]: ...

    # as_type="agent" -> LangfuseAgent.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseAgent]: ...

    # as_type="tool" -> LangfuseTool.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["tool"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseTool]: ...

    # as_type="chain" -> LangfuseChain.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseChain]: ...

    # as_type="retriever" -> LangfuseRetriever.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseRetriever]: ...

    # as_type="evaluator" -> LangfuseEvaluator.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEvaluator]: ...

    # as_type="embedding" -> LangfuseEmbedding (accepts model-related parameters).
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEmbedding]: ...

    # as_type="guardrail" -> LangfuseGuardrail.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGuardrail]: ...
 930
 931    def start_as_current_observation(
 932        self,
 933        *,
 934        trace_context: Optional[TraceContext] = None,
 935        name: str,
 936        as_type: ObservationTypeLiteralNoEvent = "span",
 937        input: Optional[Any] = None,
 938        output: Optional[Any] = None,
 939        metadata: Optional[Any] = None,
 940        version: Optional[str] = None,
 941        level: Optional[SpanLevel] = None,
 942        status_message: Optional[str] = None,
 943        completion_start_time: Optional[datetime] = None,
 944        model: Optional[str] = None,
 945        model_parameters: Optional[Dict[str, MapValue]] = None,
 946        usage_details: Optional[Dict[str, int]] = None,
 947        cost_details: Optional[Dict[str, float]] = None,
 948        prompt: Optional[PromptClient] = None,
 949        end_on_exit: Optional[bool] = None,
 950    ) -> Union[
 951        _AgnosticContextManager[LangfuseGeneration],
 952        _AgnosticContextManager[LangfuseSpan],
 953        _AgnosticContextManager[LangfuseAgent],
 954        _AgnosticContextManager[LangfuseTool],
 955        _AgnosticContextManager[LangfuseChain],
 956        _AgnosticContextManager[LangfuseRetriever],
 957        _AgnosticContextManager[LangfuseEvaluator],
 958        _AgnosticContextManager[LangfuseEmbedding],
 959        _AgnosticContextManager[LangfuseGuardrail],
 960    ]:
 961        """Create a new observation and set it as the current span in a context manager.
 962
 963        This method creates a new observation of the specified type and sets it as the
 964        current span within a context manager. Use this method with a 'with' statement to
 965        automatically handle the observation lifecycle within a code block.
 966
 967        The created observation will be the child of the current span in the context.
 968
 969        Args:
 970            trace_context: Optional context for connecting to an existing trace
 971            name: Name of the observation (e.g., function or operation name)
 972            as_type: Type of observation to create (defaults to "span")
 973            input: Input data for the operation (can be any JSON-serializable object)
 974            output: Output data from the operation (can be any JSON-serializable object)
 975            metadata: Additional metadata to associate with the observation
 976            version: Version identifier for the code or component
 977            level: Importance level of the observation (info, warning, error)
 978            status_message: Optional status message for the observation
 979            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 980
 981            The following parameters are available when as_type is: "generation" or "embedding".
 982            completion_start_time: When the model started generating the response
 983            model: Name/identifier of the AI model used (e.g., "gpt-4")
 984            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 985            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 986            cost_details: Cost information for the model call
 987            prompt: Associated prompt template from Langfuse prompt management
 988
 989        Returns:
 990            A context manager that yields the appropriate observation type based on as_type
 991
 992        Example:
 993            ```python
 994            # Create a span
 995            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
 996                # Do work
 997                result = process_data()
 998                span.update(output=result)
 999
1000                # Create a child span automatically
1001                with span.start_as_current_observation(name="sub-operation") as child_span:
1002                    # Do sub-operation work
1003                    child_span.update(output="sub-result")
1004
1005            # Create a tool observation
1006            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
1007                # Do tool work
1008                results = search_web(query)
1009                tool.update(output=results)
1010
1011            # Create a generation observation
1012            with langfuse.start_as_current_observation(
1013                name="answer-generation",
1014                as_type="generation",
1015                model="gpt-4"
1016            ) as generation:
1017                # Generate answer
1018                response = llm.generate(...)
1019                generation.update(output=response)
1020            ```
1021        """
1022        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
1023            if trace_context:
1024                trace_id = trace_context.get("trace_id", None)
1025                parent_span_id = trace_context.get("parent_span_id", None)
1026
1027                if trace_id:
1028                    remote_parent_span = self._create_remote_parent_span(
1029                        trace_id=trace_id, parent_span_id=parent_span_id
1030                    )
1031
1032                    return cast(
1033                        Union[
1034                            _AgnosticContextManager[LangfuseGeneration],
1035                            _AgnosticContextManager[LangfuseEmbedding],
1036                        ],
1037                        self._create_span_with_parent_context(
1038                            as_type=as_type,
1039                            name=name,
1040                            remote_parent_span=remote_parent_span,
1041                            parent=None,
1042                            end_on_exit=end_on_exit,
1043                            input=input,
1044                            output=output,
1045                            metadata=metadata,
1046                            version=version,
1047                            level=level,
1048                            status_message=status_message,
1049                            completion_start_time=completion_start_time,
1050                            model=model,
1051                            model_parameters=model_parameters,
1052                            usage_details=usage_details,
1053                            cost_details=cost_details,
1054                            prompt=prompt,
1055                        ),
1056                    )
1057
1058            return cast(
1059                Union[
1060                    _AgnosticContextManager[LangfuseGeneration],
1061                    _AgnosticContextManager[LangfuseEmbedding],
1062                ],
1063                self._start_as_current_otel_span_with_processed_media(
1064                    as_type=as_type,
1065                    name=name,
1066                    end_on_exit=end_on_exit,
1067                    input=input,
1068                    output=output,
1069                    metadata=metadata,
1070                    version=version,
1071                    level=level,
1072                    status_message=status_message,
1073                    completion_start_time=completion_start_time,
1074                    model=model,
1075                    model_parameters=model_parameters,
1076                    usage_details=usage_details,
1077                    cost_details=cost_details,
1078                    prompt=prompt,
1079                ),
1080            )
1081
1082        if as_type in get_observation_types_list(ObservationTypeSpanLike):
1083            if trace_context:
1084                trace_id = trace_context.get("trace_id", None)
1085                parent_span_id = trace_context.get("parent_span_id", None)
1086
1087                if trace_id:
1088                    remote_parent_span = self._create_remote_parent_span(
1089                        trace_id=trace_id, parent_span_id=parent_span_id
1090                    )
1091
1092                    return cast(
1093                        Union[
1094                            _AgnosticContextManager[LangfuseSpan],
1095                            _AgnosticContextManager[LangfuseAgent],
1096                            _AgnosticContextManager[LangfuseTool],
1097                            _AgnosticContextManager[LangfuseChain],
1098                            _AgnosticContextManager[LangfuseRetriever],
1099                            _AgnosticContextManager[LangfuseEvaluator],
1100                            _AgnosticContextManager[LangfuseGuardrail],
1101                        ],
1102                        self._create_span_with_parent_context(
1103                            as_type=as_type,
1104                            name=name,
1105                            remote_parent_span=remote_parent_span,
1106                            parent=None,
1107                            end_on_exit=end_on_exit,
1108                            input=input,
1109                            output=output,
1110                            metadata=metadata,
1111                            version=version,
1112                            level=level,
1113                            status_message=status_message,
1114                        ),
1115                    )
1116
1117            return cast(
1118                Union[
1119                    _AgnosticContextManager[LangfuseSpan],
1120                    _AgnosticContextManager[LangfuseAgent],
1121                    _AgnosticContextManager[LangfuseTool],
1122                    _AgnosticContextManager[LangfuseChain],
1123                    _AgnosticContextManager[LangfuseRetriever],
1124                    _AgnosticContextManager[LangfuseEvaluator],
1125                    _AgnosticContextManager[LangfuseGuardrail],
1126                ],
1127                self._start_as_current_otel_span_with_processed_media(
1128                    as_type=as_type,
1129                    name=name,
1130                    end_on_exit=end_on_exit,
1131                    input=input,
1132                    output=output,
1133                    metadata=metadata,
1134                    version=version,
1135                    level=level,
1136                    status_message=status_message,
1137                ),
1138            )
1139
1140        # This should never be reached since all valid types are handled above
1141        langfuse_logger.warning(
1142            f"Unknown observation type: {as_type}, falling back to span"
1143        )
1144        return self._start_as_current_otel_span_with_processed_media(
1145            as_type="span",
1146            name=name,
1147            end_on_exit=end_on_exit,
1148            input=input,
1149            output=output,
1150            metadata=metadata,
1151            version=version,
1152            level=level,
1153            status_message=status_message,
1154        )
1155
1156    def _get_span_class(
1157        self,
1158        as_type: str,
1159    ) -> Union[
1160        Type[LangfuseAgent],
1161        Type[LangfuseTool],
1162        Type[LangfuseChain],
1163        Type[LangfuseRetriever],
1164        Type[LangfuseEvaluator],
1165        Type[LangfuseEmbedding],
1166        Type[LangfuseGuardrail],
1167        Type[LangfuseGeneration],
1168        Type[LangfuseEvent],
1169        Type[LangfuseSpan],
1170    ]:
1171        """Get the appropriate span class based on as_type."""
1172        normalized_type = as_type.lower()
1173
1174        if normalized_type == "agent":
1175            return LangfuseAgent
1176        elif normalized_type == "tool":
1177            return LangfuseTool
1178        elif normalized_type == "chain":
1179            return LangfuseChain
1180        elif normalized_type == "retriever":
1181            return LangfuseRetriever
1182        elif normalized_type == "evaluator":
1183            return LangfuseEvaluator
1184        elif normalized_type == "embedding":
1185            return LangfuseEmbedding
1186        elif normalized_type == "guardrail":
1187            return LangfuseGuardrail
1188        elif normalized_type == "generation":
1189            return LangfuseGeneration
1190        elif normalized_type == "event":
1191            return LangfuseEvent
1192        elif normalized_type == "span":
1193            return LangfuseSpan
1194        else:
1195            return LangfuseSpan
1196
1197    @staticmethod
1198    def _get_observation_type_from_otel_span(otel_span: otel_trace_api.Span) -> str:
1199        if not otel_span.is_recording():
1200            return "span"
1201
1202        attributes = getattr(otel_span, "attributes", None)
1203        if attributes is None or not hasattr(attributes, "get"):
1204            return "span"
1205
1206        observation_type = attributes.get(
1207            LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1208        )
1209
1210        return observation_type if isinstance(observation_type, str) else "span"
1211
1212    @_agnosticcontextmanager
1213    def _create_span_with_parent_context(
1214        self,
1215        *,
1216        name: str,
1217        parent: Optional[otel_trace_api.Span] = None,
1218        remote_parent_span: Optional[otel_trace_api.Span] = None,
1219        as_type: ObservationTypeLiteralNoEvent,
1220        end_on_exit: Optional[bool] = None,
1221        input: Optional[Any] = None,
1222        output: Optional[Any] = None,
1223        metadata: Optional[Any] = None,
1224        version: Optional[str] = None,
1225        level: Optional[SpanLevel] = None,
1226        status_message: Optional[str] = None,
1227        completion_start_time: Optional[datetime] = None,
1228        model: Optional[str] = None,
1229        model_parameters: Optional[Dict[str, MapValue]] = None,
1230        usage_details: Optional[Dict[str, int]] = None,
1231        cost_details: Optional[Dict[str, float]] = None,
1232        prompt: Optional[PromptClient] = None,
1233    ) -> Any:
1234        parent_span = parent or cast(otel_trace_api.Span, remote_parent_span)
1235
1236        with otel_trace_api.use_span(parent_span):
1237            with self._start_as_current_otel_span_with_processed_media(
1238                name=name,
1239                as_type=as_type,
1240                end_on_exit=end_on_exit,
1241                input=input,
1242                output=output,
1243                metadata=metadata,
1244                version=version,
1245                level=level,
1246                status_message=status_message,
1247                completion_start_time=completion_start_time,
1248                model=model,
1249                model_parameters=model_parameters,
1250                usage_details=usage_details,
1251                cost_details=cost_details,
1252                prompt=prompt,
1253            ) as langfuse_span:
1254                if remote_parent_span is not None:
1255                    langfuse_span._otel_span.set_attribute(
1256                        LangfuseOtelSpanAttributes.AS_ROOT, True
1257                    )
1258
1259                yield langfuse_span
1260
1261    @_agnosticcontextmanager
1262    def _start_as_current_otel_span_with_processed_media(
1263        self,
1264        *,
1265        name: str,
1266        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
1267        end_on_exit: Optional[bool] = None,
1268        input: Optional[Any] = None,
1269        output: Optional[Any] = None,
1270        metadata: Optional[Any] = None,
1271        version: Optional[str] = None,
1272        level: Optional[SpanLevel] = None,
1273        status_message: Optional[str] = None,
1274        completion_start_time: Optional[datetime] = None,
1275        model: Optional[str] = None,
1276        model_parameters: Optional[Dict[str, MapValue]] = None,
1277        usage_details: Optional[Dict[str, int]] = None,
1278        cost_details: Optional[Dict[str, float]] = None,
1279        prompt: Optional[PromptClient] = None,
1280    ) -> Any:
1281        with self._otel_tracer.start_as_current_span(
1282            name=name,
1283            end_on_exit=end_on_exit if end_on_exit is not None else True,
1284        ) as otel_span:
1285            baggage_token = None
1286
1287            if otel_span.is_recording():
1288                context_with_app_root_claim = _set_langfuse_trace_id_in_baggage(
1289                    trace_id=self._get_otel_trace_id(otel_span),
1290                    context=otel_context_api.get_current(),
1291                )
1292                baggage_token = otel_context_api.attach(context_with_app_root_claim)
1293
1294            span_class = self._get_span_class(
1295                as_type or "generation"
1296            )  # default was "generation"
1297
1298            try:
1299                common_args = {
1300                    "otel_span": otel_span,
1301                    "langfuse_client": self,
1302                    "environment": self._environment,
1303                    "release": self._release,
1304                    "input": input,
1305                    "output": output,
1306                    "metadata": metadata,
1307                    "version": version,
1308                    "level": level,
1309                    "status_message": status_message,
1310                }
1311
1312                if span_class in [
1313                    LangfuseGeneration,
1314                    LangfuseEmbedding,
1315                ]:
1316                    common_args.update(
1317                        {
1318                            "completion_start_time": completion_start_time,
1319                            "model": model,
1320                            "model_parameters": model_parameters,
1321                            "usage_details": usage_details,
1322                            "cost_details": cost_details,
1323                            "prompt": prompt,
1324                        }
1325                    )
1326                # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed
1327
1328                yield span_class(**common_args)  # type: ignore[arg-type]
1329
1330            finally:
1331                if baggage_token is not None:
1332                    _detach_context_token_safely(baggage_token)
1333
1334    def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]:
1335        current_span = otel_trace_api.get_current_span()
1336
1337        if current_span is otel_trace_api.INVALID_SPAN:
1338            langfuse_logger.warning(
1339                "Context error: No active span in current context. Operations that depend on an active span will be skipped. "
1340                "Ensure spans are created with start_as_current_observation() or that you're operating within an active span context."
1341            )
1342            return None
1343
1344        return current_span
1345
1346    def update_current_generation(
1347        self,
1348        *,
1349        name: Optional[str] = None,
1350        input: Optional[Any] = None,
1351        output: Optional[Any] = None,
1352        metadata: Optional[Any] = None,
1353        version: Optional[str] = None,
1354        level: Optional[SpanLevel] = None,
1355        status_message: Optional[str] = None,
1356        completion_start_time: Optional[datetime] = None,
1357        model: Optional[str] = None,
1358        model_parameters: Optional[Dict[str, MapValue]] = None,
1359        usage_details: Optional[Dict[str, int]] = None,
1360        cost_details: Optional[Dict[str, float]] = None,
1361        prompt: Optional[PromptClient] = None,
1362    ) -> None:
1363        """Update the current active generation span with new information.
1364
1365        This method updates the current generation span in the active context with
1366        additional information. It's useful for adding output, usage stats, or other
1367        details that become available during or after model generation.
1368
1369        Args:
1370            name: The generation name
1371            input: Updated input data for the model
1372            output: Output from the model (e.g., completions)
1373            metadata: Additional metadata to associate with the generation
1374            version: Version identifier for the model or component
1375            level: Importance level of the generation (info, warning, error)
1376            status_message: Optional status message for the generation
1377            completion_start_time: When the model started generating the response
1378            model: Name/identifier of the AI model used (e.g., "gpt-4")
1379            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1380            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1381            cost_details: Cost information for the model call
1382            prompt: Associated prompt template from Langfuse prompt management
1383
1384        Example:
1385            ```python
1386            with langfuse.start_as_current_generation(name="answer-query") as generation:
1387                # Initial setup and API call
1388                response = llm.generate(...)
1389
1390                # Update with results that weren't available at creation time
1391                langfuse.update_current_generation(
1392                    output=response.text,
1393                    usage_details={
1394                        "prompt_tokens": response.usage.prompt_tokens,
1395                        "completion_tokens": response.usage.completion_tokens
1396                    }
1397                )
1398            ```
1399        """
1400        if not self._tracing_enabled:
1401            langfuse_logger.debug(
1402                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1403            )
1404            return
1405
1406        current_otel_span = self._get_current_otel_span()
1407
1408        if current_otel_span is not None:
1409            generation = LangfuseGeneration(
1410                otel_span=current_otel_span, langfuse_client=self
1411            )
1412
1413            if name:
1414                current_otel_span.update_name(name)
1415
1416            generation.update(
1417                input=input,
1418                output=output,
1419                metadata=metadata,
1420                version=version,
1421                level=level,
1422                status_message=status_message,
1423                completion_start_time=completion_start_time,
1424                model=model,
1425                model_parameters=model_parameters,
1426                usage_details=usage_details,
1427                cost_details=cost_details,
1428                prompt=prompt,
1429            )
1430
1431    def update_current_span(
1432        self,
1433        *,
1434        name: Optional[str] = None,
1435        input: Optional[Any] = None,
1436        output: Optional[Any] = None,
1437        metadata: Optional[Any] = None,
1438        version: Optional[str] = None,
1439        level: Optional[SpanLevel] = None,
1440        status_message: Optional[str] = None,
1441    ) -> None:
1442        """Update the current active span with new information.
1443
1444        This method updates the current span in the active context with
1445        additional information. It's useful for adding outputs or metadata
1446        that become available during execution.
1447
1448        Args:
1449            name: The span name
1450            input: Updated input data for the operation
1451            output: Output data from the operation
1452            metadata: Additional metadata to associate with the span
1453            version: Version identifier for the code or component
1454            level: Importance level of the span (info, warning, error)
1455            status_message: Optional status message for the span
1456
1457        Example:
1458            ```python
1459            with langfuse.start_as_current_observation(name="process-data") as span:
1460                # Initial processing
1461                result = process_first_part()
1462
1463                # Update with intermediate results
1464                langfuse.update_current_span(metadata={"intermediate_result": result})
1465
1466                # Continue processing
1467                final_result = process_second_part(result)
1468
1469                # Final update
1470                langfuse.update_current_span(output=final_result)
1471            ```
1472        """
1473        if not self._tracing_enabled:
1474            langfuse_logger.debug(
1475                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1476            )
1477            return
1478
1479        current_otel_span = self._get_current_otel_span()
1480
1481        if current_otel_span is not None:
1482            span_class = self._get_span_class(
1483                self._get_observation_type_from_otel_span(current_otel_span)
1484            )
1485            span = span_class(
1486                otel_span=current_otel_span,
1487                langfuse_client=self,
1488                environment=self._environment,
1489                release=self._release,
1490            )
1491
1492            if name:
1493                current_otel_span.update_name(name)
1494
1495            span.update(
1496                input=input,
1497                output=output,
1498                metadata=metadata,
1499                version=version,
1500                level=level,
1501                status_message=status_message,
1502            )
1503
1504    @deprecated(
1505        "Trace-level input/output is deprecated. "
1506        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
1507        "This method will be removed in a future major version."
1508    )
1509    def set_current_trace_io(
1510        self,
1511        *,
1512        input: Optional[Any] = None,
1513        output: Optional[Any] = None,
1514    ) -> None:
1515        """Set trace-level input and output for the current span's trace.
1516
1517        .. deprecated::
1518            This is a legacy method for backward compatibility with Langfuse platform
1519            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
1520            evaluators). It will be removed in a future major version.
1521
1522            For setting other trace attributes (user_id, session_id, metadata, tags, version),
1523            use :meth:`propagate_attributes` instead.
1524
1525        Args:
1526            input: Input data to associate with the trace.
1527            output: Output data to associate with the trace.
1528        """
1529        if not self._tracing_enabled:
1530            langfuse_logger.debug(
1531                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
1532            )
1533            return
1534
1535        current_otel_span = self._get_current_otel_span()
1536
1537        if current_otel_span is not None and current_otel_span.is_recording():
1538            span_class = self._get_span_class(
1539                self._get_observation_type_from_otel_span(current_otel_span)
1540            )
1541            span = span_class(
1542                otel_span=current_otel_span,
1543                langfuse_client=self,
1544                environment=self._environment,
1545                release=self._release,
1546            )
1547
1548            span.set_trace_io(
1549                input=input,
1550                output=output,
1551            )
1552
1553    def set_current_trace_as_public(self) -> None:
1554        """Make the current trace publicly accessible via its URL.
1555
1556        When a trace is published, anyone with the trace link can view the full trace
1557        without needing to be logged in to Langfuse. This action cannot be undone
1558        programmatically - once published, the entire trace becomes public.
1559
1560        This is a convenience method that publishes the trace from the currently
1561        active span context. Use this when you want to make a trace public from
1562        within a traced function without needing direct access to the span object.
1563        """
1564        if not self._tracing_enabled:
1565            langfuse_logger.debug(
1566                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
1567            )
1568            return
1569
1570        current_otel_span = self._get_current_otel_span()
1571
1572        if current_otel_span is not None and current_otel_span.is_recording():
1573            span_class = self._get_span_class(
1574                self._get_observation_type_from_otel_span(current_otel_span)
1575            )
1576            span = span_class(
1577                otel_span=current_otel_span,
1578                langfuse_client=self,
1579                environment=self._environment,
1580            )
1581
1582            span.set_trace_as_public()
1583
1584    def create_event(
1585        self,
1586        *,
1587        trace_context: Optional[TraceContext] = None,
1588        name: str,
1589        input: Optional[Any] = None,
1590        output: Optional[Any] = None,
1591        metadata: Optional[Any] = None,
1592        version: Optional[str] = None,
1593        level: Optional[SpanLevel] = None,
1594        status_message: Optional[str] = None,
1595    ) -> LangfuseEvent:
1596        """Create a new Langfuse observation of type 'EVENT'.
1597
1598        The created Langfuse Event observation will be the child of the current span in the context.
1599
1600        Args:
1601            trace_context: Optional context for connecting to an existing trace
1602            name: Name of the span (e.g., function or operation name)
1603            input: Input data for the operation (can be any JSON-serializable object)
1604            output: Output data from the operation (can be any JSON-serializable object)
1605            metadata: Additional metadata to associate with the span
1606            version: Version identifier for the code or component
1607            level: Importance level of the span (info, warning, error)
1608            status_message: Optional status message for the span
1609
1610        Returns:
1611            The Langfuse Event object
1612
1613        Example:
1614            ```python
1615            event = langfuse.create_event(name="process-event")
1616            ```
1617        """
1618        timestamp = time_ns()
1619
1620        if trace_context:
1621            trace_id = trace_context.get("trace_id", None)
1622            parent_span_id = trace_context.get("parent_span_id", None)
1623
1624            if trace_id:
1625                remote_parent_span = self._create_remote_parent_span(
1626                    trace_id=trace_id, parent_span_id=parent_span_id
1627                )
1628
1629                with otel_trace_api.use_span(
1630                    cast(otel_trace_api.Span, remote_parent_span)
1631                ):
1632                    otel_span = self._otel_tracer.start_span(
1633                        name=name, start_time=timestamp
1634                    )
1635                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1636
1637                    return cast(
1638                        LangfuseEvent,
1639                        LangfuseEvent(
1640                            otel_span=otel_span,
1641                            langfuse_client=self,
1642                            environment=self._environment,
1643                            release=self._release,
1644                            input=input,
1645                            output=output,
1646                            metadata=metadata,
1647                            version=version,
1648                            level=level,
1649                            status_message=status_message,
1650                        ).end(end_time=timestamp),
1651                    )
1652
1653        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1654
1655        return cast(
1656            LangfuseEvent,
1657            LangfuseEvent(
1658                otel_span=otel_span,
1659                langfuse_client=self,
1660                environment=self._environment,
1661                release=self._release,
1662                input=input,
1663                output=output,
1664                metadata=metadata,
1665                version=version,
1666                level=level,
1667                status_message=status_message,
1668            ).end(end_time=timestamp),
1669        )
1670
1671    def _create_remote_parent_span(
1672        self, *, trace_id: str, parent_span_id: Optional[str]
1673    ) -> Any:
1674        if not self._is_valid_trace_id(trace_id):
1675            langfuse_logger.warning(
1676                f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID."
1677            )
1678
1679        if parent_span_id and not self._is_valid_span_id(parent_span_id):
1680            langfuse_logger.warning(
1681                f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID."
1682            )
1683
1684        int_trace_id = int(trace_id, 16)
1685        int_parent_span_id = (
1686            int(parent_span_id, 16)
1687            if parent_span_id
1688            else RandomIdGenerator().generate_span_id()
1689        )
1690
1691        span_context = otel_trace_api.SpanContext(
1692            trace_id=int_trace_id,
1693            span_id=int_parent_span_id,
1694            trace_flags=otel_trace_api.TraceFlags(0x01),  # mark span as sampled
1695            is_remote=False,
1696        )
1697
1698        return otel_trace_api.NonRecordingSpan(span_context)
1699
1700    def _is_valid_trace_id(self, trace_id: str) -> bool:
1701        pattern = r"^[0-9a-f]{32}$"
1702
1703        return bool(re.match(pattern, trace_id))
1704
1705    def _is_valid_span_id(self, span_id: str) -> bool:
1706        pattern = r"^[0-9a-f]{16}$"
1707
1708        return bool(re.match(pattern, span_id))
1709
1710    def _create_observation_id(self, *, seed: Optional[str] = None) -> str:
1711        """Create a unique observation ID for use with Langfuse.
1712
1713        This method generates a unique observation ID (span ID in OpenTelemetry terms)
1714        for use with various Langfuse APIs. It can either generate a random ID or
1715        create a deterministic ID based on a seed string.
1716
1717        Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes.
1718        This method ensures the generated ID meets this requirement. If you need to
1719        correlate an external ID with a Langfuse observation ID, use the external ID as
1720        the seed to get a valid, deterministic observation ID.
1721
1722        Args:
1723            seed: Optional string to use as a seed for deterministic ID generation.
1724                 If provided, the same seed will always produce the same ID.
1725                 If not provided, a random ID will be generated.
1726
1727        Returns:
1728            A 16-character lowercase hexadecimal string representing the observation ID.
1729
1730        Example:
1731            ```python
1732            # Generate a random observation ID
1733            obs_id = langfuse.create_observation_id()
1734
1735            # Generate a deterministic ID based on a seed
1736            user_obs_id = langfuse.create_observation_id(seed="user-123-feedback")
1737
1738            # Correlate an external item ID with a Langfuse observation ID
1739            item_id = "item-789012"
1740            correlated_obs_id = langfuse.create_observation_id(seed=item_id)
1741
1742            # Use the ID with Langfuse APIs
1743            langfuse.create_score(
1744                name="relevance",
1745                value=0.95,
1746                trace_id=trace_id,
1747                observation_id=obs_id
1748            )
1749            ```
1750        """
1751        if not seed:
1752            span_id_int = RandomIdGenerator().generate_span_id()
1753
1754            return self._format_otel_span_id(span_id_int)
1755
1756        return sha256(seed.encode("utf-8")).digest()[:8].hex()
1757
1758    @staticmethod
1759    def create_trace_id(*, seed: Optional[str] = None) -> str:
1760        """Create a unique trace ID for use with Langfuse.
1761
1762        This method generates a unique trace ID for use with various Langfuse APIs.
1763        It can either generate a random ID or create a deterministic ID based on
1764        a seed string.
1765
1766        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1767        This method ensures the generated ID meets this requirement. If you need to
1768        correlate an external ID with a Langfuse trace ID, use the external ID as the
1769        seed to get a valid, deterministic Langfuse trace ID.
1770
1771        Args:
1772            seed: Optional string to use as a seed for deterministic ID generation.
1773                 If provided, the same seed will always produce the same ID.
1774                 If not provided, a random ID will be generated.
1775
1776        Returns:
1777            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1778
1779        Example:
1780            ```python
1781            # Generate a random trace ID
1782            trace_id = langfuse.create_trace_id()
1783
1784            # Generate a deterministic ID based on a seed
1785            session_trace_id = langfuse.create_trace_id(seed="session-456")
1786
1787            # Correlate an external ID with a Langfuse trace ID
1788            external_id = "external-system-123456"
1789            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1790
1791            # Use the ID with trace context
1792            with langfuse.start_as_current_observation(
1793                name="process-request",
1794                trace_context={"trace_id": trace_id}
1795            ) as span:
1796                # Operation will be part of the specific trace
1797                pass
1798            ```
1799        """
1800        if not seed:
1801            trace_id_int = RandomIdGenerator().generate_trace_id()
1802
1803            return Langfuse._format_otel_trace_id(trace_id_int)
1804
1805        return sha256(seed.encode("utf-8")).digest()[:16].hex()
1806
1807    def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str:
1808        span_context = otel_span.get_span_context()
1809
1810        return self._format_otel_trace_id(span_context.trace_id)
1811
1812    def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str:
1813        span_context = otel_span.get_span_context()
1814
1815        return self._format_otel_span_id(span_context.span_id)
1816
1817    @staticmethod
1818    def _format_otel_span_id(span_id_int: int) -> str:
1819        """Format an integer span ID to a 16-character lowercase hex string.
1820
1821        Internal method to convert an OpenTelemetry integer span ID to the standard
1822        W3C Trace Context format (16-character lowercase hex string).
1823
1824        Args:
1825            span_id_int: 64-bit integer representing a span ID
1826
1827        Returns:
1828            A 16-character lowercase hexadecimal string
1829        """
1830        return format(span_id_int, "016x")
1831
1832    @staticmethod
1833    def _format_otel_trace_id(trace_id_int: int) -> str:
1834        """Format an integer trace ID to a 32-character lowercase hex string.
1835
1836        Internal method to convert an OpenTelemetry integer trace ID to the standard
1837        W3C Trace Context format (32-character lowercase hex string).
1838
1839        Args:
1840            trace_id_int: 128-bit integer representing a trace ID
1841
1842        Returns:
1843            A 32-character lowercase hexadecimal string
1844        """
1845        return format(trace_id_int, "032x")
1846
1847    @overload
1848    def create_score(
1849        self,
1850        *,
1851        name: str,
1852        value: float,
1853        session_id: Optional[str] = None,
1854        dataset_run_id: Optional[str] = None,
1855        trace_id: Optional[str] = None,
1856        observation_id: Optional[str] = None,
1857        score_id: Optional[str] = None,
1858        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1859        comment: Optional[str] = None,
1860        config_id: Optional[str] = None,
1861        metadata: Optional[Any] = None,
1862        timestamp: Optional[datetime] = None,
1863        environment: Optional[str] = None,
1864    ) -> None: ...
1865
1866    @overload
1867    def create_score(
1868        self,
1869        *,
1870        name: str,
1871        value: str,
1872        session_id: Optional[str] = None,
1873        dataset_run_id: Optional[str] = None,
1874        trace_id: Optional[str] = None,
1875        score_id: Optional[str] = None,
1876        observation_id: Optional[str] = None,
1877        data_type: Optional[
1878            Literal["CATEGORICAL", "TEXT", "CORRECTION"]
1879        ] = "CATEGORICAL",
1880        comment: Optional[str] = None,
1881        config_id: Optional[str] = None,
1882        metadata: Optional[Any] = None,
1883        timestamp: Optional[datetime] = None,
1884        environment: Optional[str] = None,
1885    ) -> None: ...
1886
1887    def create_score(
1888        self,
1889        *,
1890        name: str,
1891        value: Union[float, str],
1892        session_id: Optional[str] = None,
1893        dataset_run_id: Optional[str] = None,
1894        trace_id: Optional[str] = None,
1895        observation_id: Optional[str] = None,
1896        score_id: Optional[str] = None,
1897        data_type: Optional[ScoreDataType] = None,
1898        comment: Optional[str] = None,
1899        config_id: Optional[str] = None,
1900        metadata: Optional[Any] = None,
1901        timestamp: Optional[datetime] = None,
1902        environment: Optional[str] = None,
1903    ) -> None:
1904        """Create a score for a specific trace or observation.
1905
1906        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
1907        used to track quality metrics, user feedback, or automated evaluations.
1908
1909        Args:
1910            name: Name of the score (e.g., "relevance", "accuracy")
1911            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
1912            session_id: ID of the Langfuse session to associate the score with
1913            dataset_run_id: ID of the Langfuse dataset run to associate the score with
1914            trace_id: ID of the Langfuse trace to associate the score with
1915            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
1916            score_id: Optional custom ID for the score (auto-generated if not provided)
1917            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
1918            comment: Optional comment or explanation for the score
1919            config_id: Optional ID of a score config defined in Langfuse
1920            metadata: Optional metadata to be attached to the score
1921            timestamp: Optional timestamp for the score (defaults to current UTC time)
1922            environment: Optional environment override for this score. If omitted,
1923                the score uses the client-level environment from
1924                `Langfuse(environment=...)` or `LANGFUSE_TRACING_ENVIRONMENT`.
1925                Langfuse observation wrapper methods pass their resolved span
1926                environment here so scores created via `span.score()` or
1927                `span.score_trace()` stay grouped with the scored observation or
1928                trace, including request-scoped environments propagated with
1929                `propagate_attributes(environment=...)`.
1930
1931        Example:
1932            ```python
1933            # Create a numeric score for accuracy
1934            langfuse.create_score(
1935                name="accuracy",
1936                value=0.92,
1937                trace_id="abcdef1234567890abcdef1234567890",
1938                data_type="NUMERIC",
1939                comment="High accuracy with minor irrelevant details"
1940            )
1941
1942            # Create a categorical score for sentiment
1943            langfuse.create_score(
1944                name="sentiment",
1945                value="positive",
1946                trace_id="abcdef1234567890abcdef1234567890",
1947                observation_id="abcdef1234567890",
1948                data_type="CATEGORICAL"
1949            )
1950            ```
1951        """
1952        if not self._tracing_enabled:
1953            return
1954
1955        score_id = score_id or self._create_observation_id()
1956
1957        try:
1958            new_body = ScoreBody(
1959                id=score_id,
1960                sessionId=session_id,
1961                datasetRunId=dataset_run_id,
1962                traceId=trace_id,
1963                observationId=observation_id,
1964                name=name,
1965                value=value,
1966                dataType=data_type,  # type: ignore
1967                comment=comment,
1968                configId=config_id,
1969                environment=environment or self._environment,
1970                metadata=metadata,
1971            )
1972
1973            event = {
1974                "id": self.create_trace_id(),
1975                "type": "score-create",
1976                "timestamp": timestamp or _get_timestamp(),
1977                "body": new_body,
1978            }
1979
1980            if self._resources is not None:
1981                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
1982                force_sample = (
1983                    not self._is_valid_trace_id(trace_id) if trace_id else True
1984                )
1985
1986                self._resources.add_score_task(
1987                    event,
1988                    force_sample=force_sample,
1989                )
1990
1991        except Exception as e:
1992            langfuse_logger.exception(
1993                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
1994            )
1995
1996    def _create_trace_tags_via_ingestion(
1997        self,
1998        *,
1999        trace_id: str,
2000        tags: List[str],
2001    ) -> None:
2002        """Private helper to enqueue trace tag updates via ingestion API events."""
2003        if not self._tracing_enabled:
2004            return
2005
2006        if len(tags) == 0:
2007            return
2008
2009        try:
2010            new_body = TraceBody(
2011                id=trace_id,
2012                tags=tags,
2013            )
2014
2015            event = {
2016                "id": self.create_trace_id(),
2017                "type": "trace-create",
2018                "timestamp": _get_timestamp(),
2019                "body": new_body,
2020            }
2021
2022            if self._resources is not None:
2023                self._resources.add_trace_task(event)
2024        except Exception as e:
2025            langfuse_logger.exception(
2026                f"Error updating trace tags: Failed to process trace update event for trace_id={trace_id}. Error: {e}"
2027            )
2028
2029    @overload
2030    def score_current_span(
2031        self,
2032        *,
2033        name: str,
2034        value: float,
2035        score_id: Optional[str] = None,
2036        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
2037        comment: Optional[str] = None,
2038        config_id: Optional[str] = None,
2039        metadata: Optional[Any] = None,
2040    ) -> None: ...
2041
2042    @overload
2043    def score_current_span(
2044        self,
2045        *,
2046        name: str,
2047        value: str,
2048        score_id: Optional[str] = None,
2049        data_type: Optional[
2050            Literal["CATEGORICAL", "TEXT", "CORRECTION"]
2051        ] = "CATEGORICAL",
2052        comment: Optional[str] = None,
2053        config_id: Optional[str] = None,
2054        metadata: Optional[Any] = None,
2055    ) -> None: ...
2056
2057    def score_current_span(
2058        self,
2059        *,
2060        name: str,
2061        value: Union[float, str],
2062        score_id: Optional[str] = None,
2063        data_type: Optional[ScoreDataType] = None,
2064        comment: Optional[str] = None,
2065        config_id: Optional[str] = None,
2066        metadata: Optional[Any] = None,
2067    ) -> None:
2068        """Create a score for the current active span.
2069
2070        This method scores the currently active span in the context. It's a convenient
2071        way to score the current operation without needing to know its trace and span IDs.
2072        If the active span has a `langfuse.environment` attribute, including one
2073        set by `propagate_attributes(environment=...)`, the score uses that
2074        environment. Otherwise it uses the client-level environment.
2075
2076        Args:
2077            name: Name of the score (e.g., "relevance", "accuracy")
2078            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
2079            score_id: Optional custom ID for the score (auto-generated if not provided)
2080            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
2081            comment: Optional comment or explanation for the score
2082            config_id: Optional ID of a score config defined in Langfuse
2083            metadata: Optional metadata to be attached to the score
2084
2085        Example:
2086            ```python
2087            with langfuse.start_as_current_generation(name="answer-query") as generation:
2088                # Generate answer
2089                response = generate_answer(...)
2090                generation.update(output=response)
2091
2092                # Score the generation
2093                langfuse.score_current_span(
2094                    name="relevance",
2095                    value=0.85,
2096                    data_type="NUMERIC",
2097                    comment="Mostly relevant but contains some tangential information",
2098                    metadata={"model": "gpt-4", "prompt_version": "v2"}
2099                )
2100            ```
2101        """
2102        current_span = self._get_current_otel_span()
2103
2104        if current_span is not None:
2105            trace_id = self._get_otel_trace_id(current_span)
2106            observation_id = self._get_otel_span_id(current_span)
2107
2108            langfuse_logger.info(
2109                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
2110            )
2111
2112            self.create_score(
2113                trace_id=trace_id,
2114                observation_id=observation_id,
2115                name=name,
2116                value=cast(str, value),
2117                score_id=score_id,
2118                data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
2119                comment=comment,
2120                config_id=config_id,
2121                metadata=metadata,
2122                environment=get_string_span_attribute(
2123                    current_span, LangfuseOtelSpanAttributes.ENVIRONMENT
2124                ),
2125            )
2126
2127    @overload
2128    def score_current_trace(
2129        self,
2130        *,
2131        name: str,
2132        value: float,
2133        score_id: Optional[str] = None,
2134        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
2135        comment: Optional[str] = None,
2136        config_id: Optional[str] = None,
2137        metadata: Optional[Any] = None,
2138    ) -> None: ...
2139
2140    @overload
2141    def score_current_trace(
2142        self,
2143        *,
2144        name: str,
2145        value: str,
2146        score_id: Optional[str] = None,
2147        data_type: Optional[
2148            Literal["CATEGORICAL", "TEXT", "CORRECTION"]
2149        ] = "CATEGORICAL",
2150        comment: Optional[str] = None,
2151        config_id: Optional[str] = None,
2152        metadata: Optional[Any] = None,
2153    ) -> None: ...
2154
2155    def score_current_trace(
2156        self,
2157        *,
2158        name: str,
2159        value: Union[float, str],
2160        score_id: Optional[str] = None,
2161        data_type: Optional[ScoreDataType] = None,
2162        comment: Optional[str] = None,
2163        config_id: Optional[str] = None,
2164        metadata: Optional[Any] = None,
2165    ) -> None:
2166        """Create a score for the current trace.
2167
2168        This method scores the trace of the currently active span. Unlike score_current_span,
2169        this method associates the score with the entire trace rather than a specific span.
2170        It's useful for scoring overall performance or quality of the entire operation.
2171        If the active span has a `langfuse.environment` attribute, including one
2172        set by `propagate_attributes(environment=...)`, the score uses that
2173        environment. Otherwise it uses the client-level environment.
2174
2175        Args:
2176            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2177            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
2178            score_id: Optional custom ID for the score (auto-generated if not provided)
2179            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
2180            comment: Optional comment or explanation for the score
2181            config_id: Optional ID of a score config defined in Langfuse
2182            metadata: Optional metadata to be attached to the score
2183
2184        Example:
2185            ```python
2186            with langfuse.start_as_current_observation(name="process-user-request") as span:
2187                # Process request
2188                result = process_complete_request()
2189                span.update(output=result)
2190
2191                # Score the overall trace
2192                langfuse.score_current_trace(
2193                    name="overall_quality",
2194                    value=0.95,
2195                    data_type="NUMERIC",
2196                    comment="High quality end-to-end response",
2197                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2198                )
2199            ```
2200        """
2201        current_span = self._get_current_otel_span()
2202
2203        if current_span is not None:
2204            trace_id = self._get_otel_trace_id(current_span)
2205
2206            langfuse_logger.info(
2207                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2208            )
2209
2210            self.create_score(
2211                trace_id=trace_id,
2212                name=name,
2213                value=cast(str, value),
2214                score_id=score_id,
2215                data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
2216                comment=comment,
2217                config_id=config_id,
2218                metadata=metadata,
2219                environment=get_string_span_attribute(
2220                    current_span, LangfuseOtelSpanAttributes.ENVIRONMENT
2221                ),
2222            )
2223
2224    def flush(self) -> None:
2225        """Force flush all pending spans and events to the Langfuse API.
2226
2227        This method manually flushes any pending spans, scores, and other events to the
2228        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2229        before proceeding, without waiting for the automatic flush interval.
2230
2231        Example:
2232            ```python
2233            # Record some spans and scores
2234            with langfuse.start_as_current_observation(name="operation") as span:
2235                # Do work...
2236                pass
2237
2238            # Ensure all data is sent to Langfuse before proceeding
2239            langfuse.flush()
2240
2241            # Continue with other work
2242            ```
2243        """
2244        if self._resources is not None:
2245            self._resources.flush()
2246
2247    def shutdown(self) -> None:
2248        """Shut down the Langfuse client and flush all pending data.
2249
2250        This method cleanly shuts down the Langfuse client, ensuring all pending data
2251        is flushed to the API and all background threads are properly terminated.
2252
2253        It's important to call this method when your application is shutting down to
2254        prevent data loss and resource leaks. For most applications, using the client
2255        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2256
2257        Example:
2258            ```python
2259            # Initialize Langfuse
2260            langfuse = Langfuse(public_key="...", secret_key="...")
2261
2262            # Use Langfuse throughout your application
2263            # ...
2264
2265            # When application is shutting down
2266            langfuse.shutdown()
2267            ```
2268        """
2269        if self._resources is not None:
2270            self._resources.shutdown()
2271
2272    def get_current_trace_id(self) -> Optional[str]:
2273        """Get the trace ID of the current active span.
2274
2275        This method retrieves the trace ID from the currently active span in the context.
2276        It can be used to get the trace ID for referencing in logs, external systems,
2277        or for creating related operations.
2278
2279        Returns:
2280            The current trace ID as a 32-character lowercase hexadecimal string,
2281            or None if there is no active span.
2282
2283        Example:
2284            ```python
2285            with langfuse.start_as_current_observation(name="process-request") as span:
2286                # Get the current trace ID for reference
2287                trace_id = langfuse.get_current_trace_id()
2288
2289                # Use it for external correlation
2290                log.info(f"Processing request with trace_id: {trace_id}")
2291
2292                # Or pass to another system
2293                external_system.process(data, trace_id=trace_id)
2294            ```
2295        """
2296        if not self._tracing_enabled:
2297            langfuse_logger.debug(
2298                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2299            )
2300            return None
2301
2302        current_otel_span = self._get_current_otel_span()
2303
2304        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None
2305
2306    def get_current_observation_id(self) -> Optional[str]:
2307        """Get the observation ID (span ID) of the current active span.
2308
2309        This method retrieves the observation ID from the currently active span in the context.
2310        It can be used to get the observation ID for referencing in logs, external systems,
2311        or for creating scores or other related operations.
2312
2313        Returns:
2314            The current observation ID as a 16-character lowercase hexadecimal string,
2315            or None if there is no active span.
2316
2317        Example:
2318            ```python
2319            with langfuse.start_as_current_observation(name="process-user-query") as span:
2320                # Get the current observation ID
2321                observation_id = langfuse.get_current_observation_id()
2322
2323                # Store it for later reference
2324                cache.set(f"query_{query_id}_observation", observation_id)
2325
2326                # Process the query...
2327            ```
2328        """
2329        if not self._tracing_enabled:
2330            langfuse_logger.debug(
2331                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2332            )
2333            return None
2334
2335        current_otel_span = self._get_current_otel_span()
2336
2337        return self._get_otel_span_id(current_otel_span) if current_otel_span else None
2338
2339    def _get_project_id(self) -> Optional[str]:
2340        """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for api keys."""
2341        if not self._project_id:
2342            proj = self.api.projects.get()
2343            if not proj.data or not proj.data[0].id:
2344                return None
2345
2346            self._project_id = proj.data[0].id
2347
2348        return self._project_id
2349
2350    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2351        """Get the URL to view a trace in the Langfuse UI.
2352
2353        This method generates a URL that links directly to a trace in the Langfuse UI.
2354        It's useful for providing links in logs, notifications, or debugging tools.
2355
2356        Args:
2357            trace_id: Optional trace ID to generate a URL for. If not provided,
2358                     the trace ID of the current active span will be used.
2359
2360        Returns:
2361            A URL string pointing to the trace in the Langfuse UI,
2362            or None if the project ID couldn't be retrieved or no trace ID is available.
2363
2364        Example:
2365            ```python
2366            # Get URL for the current trace
2367            with langfuse.start_as_current_observation(name="process-request") as span:
2368                trace_url = langfuse.get_trace_url()
2369                log.info(f"Processing trace: {trace_url}")
2370
2371            # Get URL for a specific trace
2372            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2373            send_notification(f"Review needed for trace: {specific_trace_url}")
2374            ```
2375        """
2376        final_trace_id = trace_id or self.get_current_trace_id()
2377        if not final_trace_id:
2378            return None
2379
2380        project_id = self._get_project_id()
2381
2382        return (
2383            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2384            if project_id and final_trace_id
2385            else None
2386        )
2387
2388    def get_dataset(
2389        self,
2390        name: str,
2391        *,
2392        fetch_items_page_size: Optional[int] = 50,
2393        version: Optional[datetime] = None,
2394    ) -> "DatasetClient":
2395        """Fetch a dataset by its name.
2396
2397        Args:
2398            name: The name of the dataset to fetch.
2399            fetch_items_page_size: All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2400            version: Retrieve dataset items as they existed at this specific point in time (UTC).
2401                If provided, returns the state of items at the specified UTC timestamp.
2402                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
2403
2404        Returns:
2405            DatasetClient: The dataset with the given name.
2406        """
2407        try:
2408            langfuse_logger.debug(f"Getting datasets {name}")
2409            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2410
2411            dataset_items: List[DatasetItem] = []
2412            page = 1
2413
2414            while True:
2415                new_items = self.api.dataset_items.list(
2416                    dataset_name=self._url_encode(name, is_url_param=True),
2417                    page=page,
2418                    limit=fetch_items_page_size,
2419                    version=version,
2420                )
2421                dataset_items.extend(
2422                    self._hydrate_dataset_item_media_references(item)
2423                    for item in new_items.data
2424                )
2425
2426                if new_items.meta.total_pages <= page:
2427                    break
2428
2429                page += 1
2430
2431            return DatasetClient(
2432                dataset=dataset,
2433                items=dataset_items,
2434                version=version,
2435                langfuse_client=self,
2436            )
2437
2438        except Error as e:
2439            handle_fern_exception(e)
2440            raise e
2441
2442    def get_dataset_run(
2443        self, *, dataset_name: str, run_name: str
2444    ) -> DatasetRunWithItems:
2445        """Fetch a dataset run by dataset name and run name.
2446
2447        Args:
2448            dataset_name (str): The name of the dataset.
2449            run_name (str): The name of the run.
2450
2451        Returns:
2452            DatasetRunWithItems: The dataset run with its items.
2453        """
2454        try:
2455            return cast(
2456                DatasetRunWithItems,
2457                self.api.datasets.get_run(
2458                    dataset_name=self._url_encode(dataset_name),
2459                    run_name=self._url_encode(run_name),
2460                    request_options=None,
2461                ),
2462            )
2463        except Error as e:
2464            handle_fern_exception(e)
2465            raise e
2466
2467    def get_dataset_runs(
2468        self,
2469        *,
2470        dataset_name: str,
2471        page: Optional[int] = None,
2472        limit: Optional[int] = None,
2473    ) -> PaginatedDatasetRuns:
2474        """Fetch all runs for a dataset.
2475
2476        Args:
2477            dataset_name (str): The name of the dataset.
2478            page (Optional[int]): Page number, starts at 1.
2479            limit (Optional[int]): Limit of items per page.
2480
2481        Returns:
2482            PaginatedDatasetRuns: Paginated list of dataset runs.
2483        """
2484        try:
2485            return cast(
2486                PaginatedDatasetRuns,
2487                self.api.datasets.get_runs(
2488                    dataset_name=self._url_encode(dataset_name),
2489                    page=page,
2490                    limit=limit,
2491                    request_options=None,
2492                ),
2493            )
2494        except Error as e:
2495            handle_fern_exception(e)
2496            raise e
2497
2498    def delete_dataset_run(
2499        self, *, dataset_name: str, run_name: str
2500    ) -> DeleteDatasetRunResponse:
2501        """Delete a dataset run and all its run items. This action is irreversible.
2502
2503        Args:
2504            dataset_name (str): The name of the dataset.
2505            run_name (str): The name of the run.
2506
2507        Returns:
2508            DeleteDatasetRunResponse: Confirmation of deletion.
2509        """
2510        try:
2511            return cast(
2512                DeleteDatasetRunResponse,
2513                self.api.datasets.delete_run(
2514                    dataset_name=self._url_encode(dataset_name),
2515                    run_name=self._url_encode(run_name),
2516                    request_options=None,
2517                ),
2518            )
2519        except Error as e:
2520            handle_fern_exception(e)
2521            raise e
2522
2523    def run_experiment(
2524        self,
2525        *,
2526        name: str,
2527        run_name: Optional[str] = None,
2528        description: Optional[str] = None,
2529        data: ExperimentData,
2530        task: TaskFunction,
2531        evaluators: List[EvaluatorFunction] = [],
2532        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2533        run_evaluators: List[RunEvaluatorFunction] = [],
2534        max_concurrency: int = 50,
2535        metadata: Optional[Dict[str, str]] = None,
2536        _dataset_version: Optional[datetime] = None,
2537    ) -> ExperimentResult:
2538        """Run an experiment on a dataset with automatic tracing and evaluation.
2539
2540        This method executes a task function on each item in the provided dataset,
2541        automatically traces all executions with Langfuse for observability, runs
2542        item-level and run-level evaluators on the outputs, and returns comprehensive
2543        results with evaluation metrics.
2544
2545        The experiment system provides:
2546        - Automatic tracing of all task executions
2547        - Concurrent processing with configurable limits
2548        - Comprehensive error handling that isolates failures
2549        - Integration with Langfuse datasets for experiment tracking
2550        - Flexible evaluation framework supporting both sync and async evaluators
2551
2552        Args:
2553            name: Human-readable name for the experiment. Used for identification
2554                in the Langfuse UI.
2555            run_name: Optional exact name for the experiment run. If provided, this will be
2556                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2557                If not provided, this will default to the experiment name appended with an ISO timestamp.
2558            description: Optional description explaining the experiment's purpose,
2559                methodology, or expected outcomes.
2560            data: Array of data items to process. Can be either:
2561                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2562                - List of Langfuse DatasetItem objects from dataset.items
2563            task: Function that processes each data item and returns output.
2564                Must accept 'item' as keyword argument and can return sync or async results.
2565                The task function signature should be: task(*, item, **kwargs) -> Any
2566            evaluators: List of functions to evaluate each item's output individually.
2567                Each evaluator receives input, output, expected_output, and metadata.
2568                Can return single Evaluation dict or list of Evaluation dicts.
2569            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2570                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2571                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2572                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2573            run_evaluators: List of functions to evaluate the entire experiment run.
2574                Each run evaluator receives all item_results and can compute aggregate metrics.
2575                Useful for calculating averages, distributions, or cross-item comparisons.
2576            max_concurrency: Maximum number of concurrent task executions (default: 50).
2577                Controls the number of items processed simultaneously. Adjust based on
2578                API rate limits and system resources.
2579            metadata: Optional metadata dictionary to attach to all experiment traces.
2580                This metadata will be included in every trace created during the experiment.
2581                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2582
2583        Returns:
2584            ExperimentResult containing:
2585            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2586            - item_results: List of results for each processed item with outputs and evaluations
2587            - run_evaluations: List of aggregate evaluation results for the entire run
2588            - experiment_id: Stable identifier for the experiment run across all items
2589            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2590            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2591
2592        Raises:
2593            ValueError: If required parameters are missing or invalid
2594            Exception: If experiment setup fails (individual item failures are handled gracefully)
2595
2596        Examples:
2597            Basic experiment with local data:
2598            ```python
2599            def summarize_text(*, item, **kwargs):
2600                return f"Summary: {item['input'][:50]}..."
2601
2602            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2603                return {
2604                    "name": "output_length",
2605                    "value": len(output),
2606                    "comment": f"Output contains {len(output)} characters"
2607                }
2608
2609            result = langfuse.run_experiment(
2610                name="Text Summarization Test",
2611                description="Evaluate summarization quality and length",
2612                data=[
2613                    {"input": "Long article text...", "expected_output": "Expected summary"},
2614                    {"input": "Another article...", "expected_output": "Another summary"}
2615                ],
2616                task=summarize_text,
2617                evaluators=[length_evaluator]
2618            )
2619
2620            print(f"Processed {len(result.item_results)} items")
2621            for item_result in result.item_results:
2622                print(f"Input: {item_result.item['input']}")
2623                print(f"Output: {item_result.output}")
2624                print(f"Evaluations: {item_result.evaluations}")
2625            ```
2626
2627            Advanced experiment with async task and multiple evaluators:
2628            ```python
2629            async def llm_task(*, item, **kwargs):
2630                # Simulate async LLM call
2631                response = await openai_client.chat.completions.create(
2632                    model="gpt-4",
2633                    messages=[{"role": "user", "content": item["input"]}]
2634                )
2635                return response.choices[0].message.content
2636
2637            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2638                if expected_output and expected_output.lower() in output.lower():
2639                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2640                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2641
2642            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2643                # Simulate toxicity check
2644                toxicity_score = check_toxicity(output)  # Your toxicity checker
2645                return {
2646                    "name": "toxicity",
2647                    "value": toxicity_score,
2648                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2649                }
2650
2651            def average_accuracy(*, item_results, **kwargs):
2652                accuracies = [
2653                    eval.value for result in item_results
2654                    for eval in result.evaluations
2655                    if eval.name == "accuracy"
2656                ]
2657                return {
2658                    "name": "average_accuracy",
2659                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2660                    "comment": f"Average accuracy across {len(accuracies)} items"
2661                }
2662
2663            result = langfuse.run_experiment(
2664                name="LLM Safety and Accuracy Test",
2665                description="Evaluate model accuracy and safety across diverse prompts",
2666                data=test_dataset,  # Your dataset items
2667                task=llm_task,
2668                evaluators=[accuracy_evaluator, toxicity_evaluator],
2669                run_evaluators=[average_accuracy],
2670                max_concurrency=5,  # Limit concurrent API calls
2671                metadata={"model": "gpt-4", "temperature": 0.7}
2672            )
2673            ```
2674
2675            Using with Langfuse datasets:
2676            ```python
2677            # Get dataset from Langfuse
2678            dataset = langfuse.get_dataset("my-eval-dataset")
2679
2680            result = dataset.run_experiment(
2681                name="Production Model Evaluation",
2682                description="Monthly evaluation of production model performance",
2683                task=my_production_task,
2684                evaluators=[accuracy_evaluator, latency_evaluator]
2685            )
2686
2687            # Results automatically linked to dataset in Langfuse UI
2688            print(f"View results: {result['dataset_run_url']}")
2689            ```
2690
2691        Note:
2692            - Task and evaluator functions can be either synchronous or asynchronous
2693            - Individual item failures are logged but don't stop the experiment
2694            - All executions are automatically traced and visible in Langfuse UI
2695            - When using Langfuse datasets, results are automatically linked for easy comparison
2696            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2697            - Async execution is handled automatically with smart event loop detection
2698        """
2699        return cast(
2700            ExperimentResult,
2701            run_async_safely(
2702                self._run_experiment_async(
2703                    name=name,
2704                    run_name=self._create_experiment_run_name(
2705                        name=name, run_name=run_name
2706                    ),
2707                    description=description,
2708                    data=data,
2709                    task=task,
2710                    evaluators=evaluators or [],
2711                    composite_evaluator=composite_evaluator,
2712                    run_evaluators=run_evaluators or [],
2713                    max_concurrency=max_concurrency,
2714                    metadata=metadata,
2715                    dataset_version=_dataset_version,
2716                ),
2717            ),
2718        )
2719
2720    async def _run_experiment_async(
2721        self,
2722        *,
2723        name: str,
2724        run_name: str,
2725        description: Optional[str],
2726        data: ExperimentData,
2727        task: TaskFunction,
2728        evaluators: List[EvaluatorFunction],
2729        composite_evaluator: Optional[CompositeEvaluatorFunction],
2730        run_evaluators: List[RunEvaluatorFunction],
2731        max_concurrency: int,
2732        metadata: Optional[Dict[str, Any]] = None,
2733        dataset_version: Optional[datetime] = None,
2734    ) -> ExperimentResult:
2735        langfuse_logger.debug(
2736            f"Starting experiment '{name}' run '{run_name}' with {len(data)} items"
2737        )
2738
2739        shared_fallback_experiment_id = self._create_observation_id()
2740
2741        # Set up concurrency control
2742        semaphore = asyncio.Semaphore(max_concurrency)
2743
2744        # Process all items
2745        async def process_item(item: ExperimentItem) -> ExperimentItemResult:
2746            async with semaphore:
2747                return await self._process_experiment_item(
2748                    item,
2749                    task,
2750                    evaluators,
2751                    composite_evaluator,
2752                    shared_fallback_experiment_id,
2753                    name,
2754                    run_name,
2755                    description,
2756                    metadata,
2757                    dataset_version,
2758                )
2759
2760        # Run all items concurrently
2761        tasks = [process_item(item) for item in data]
2762        item_results = await asyncio.gather(*tasks, return_exceptions=True)
2763
2764        # Filter out any exceptions and log errors
2765        valid_results: List[ExperimentItemResult] = []
2766        for i, result in enumerate(item_results):
2767            if isinstance(result, Exception):
2768                langfuse_logger.error(f"Item {i} failed: {result}")
2769            elif isinstance(result, ExperimentItemResult):
2770                valid_results.append(result)  # type: ignore
2771
2772        # Run experiment-level evaluators
2773        run_evaluations: List[Evaluation] = []
2774        for run_evaluator in run_evaluators:
2775            try:
2776                evaluations = await _run_evaluator(
2777                    run_evaluator, item_results=valid_results
2778                )
2779                run_evaluations.extend(evaluations)
2780            except Exception as e:
2781                langfuse_logger.error(f"Run evaluator failed: {e}")
2782
2783        # Generate dataset run URL if applicable
2784        dataset_run_id = next(
2785            (
2786                result.dataset_run_id
2787                for result in valid_results
2788                if result.dataset_run_id
2789            ),
2790            None,
2791        )
2792        dataset_run_url = None
2793        if dataset_run_id and data:
2794            try:
2795                # Check if the first item has dataset_id (for DatasetItem objects)
2796                first_item = data[0]
2797                dataset_id = None
2798
2799                if hasattr(first_item, "dataset_id"):
2800                    dataset_id = getattr(first_item, "dataset_id", None)
2801
2802                if dataset_id:
2803                    project_id = self._get_project_id()
2804
2805                    if project_id:
2806                        dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}"
2807
2808            except Exception:
2809                pass  # URL generation is optional
2810
2811        # Store run-level evaluations as scores
2812        for evaluation in run_evaluations:
2813            try:
2814                if dataset_run_id:
2815                    self.create_score(
2816                        dataset_run_id=dataset_run_id,
2817                        name=evaluation.name or "<unknown>",
2818                        value=evaluation.value,  # type: ignore
2819                        comment=evaluation.comment,
2820                        metadata=evaluation.metadata,
2821                        data_type=evaluation.data_type,  # type: ignore
2822                        config_id=evaluation.config_id,
2823                    )
2824
2825            except Exception as e:
2826                langfuse_logger.error(f"Failed to store run evaluation: {e}")
2827
2828        # Flush scores and traces
2829        self.flush()
2830
2831        return ExperimentResult(
2832            name=name,
2833            run_name=run_name,
2834            description=description,
2835            item_results=valid_results,
2836            run_evaluations=run_evaluations,
2837            experiment_id=dataset_run_id or shared_fallback_experiment_id,
2838            dataset_run_id=dataset_run_id,
2839            dataset_run_url=dataset_run_url,
2840        )
2841
2842    async def _process_experiment_item(
2843        self,
2844        item: ExperimentItem,
2845        task: Callable,
2846        evaluators: List[Callable],
2847        composite_evaluator: Optional[CompositeEvaluatorFunction],
2848        fallback_experiment_id: str,
2849        experiment_name: str,
2850        experiment_run_name: str,
2851        experiment_description: Optional[str],
2852        experiment_metadata: Optional[Dict[str, Any]] = None,
2853        dataset_version: Optional[datetime] = None,
2854    ) -> ExperimentItemResult:
2855        span_name = "experiment-item-run"
2856
2857        with self.start_as_current_observation(name=span_name) as span:
2858            try:
2859                input_data = (
2860                    item.get("input")
2861                    if isinstance(item, dict)
2862                    else getattr(item, "input", None)
2863                )
2864
2865                if input_data is None:
2866                    raise ValueError("Experiment Item is missing input. Skipping item.")
2867
2868                expected_output = (
2869                    item.get("expected_output")
2870                    if isinstance(item, dict)
2871                    else getattr(item, "expected_output", None)
2872                )
2873
2874                item_metadata = (
2875                    item.get("metadata")
2876                    if isinstance(item, dict)
2877                    else getattr(item, "metadata", None)
2878                )
2879
2880                final_observation_metadata = {
2881                    "experiment_name": experiment_name,
2882                    "experiment_run_name": experiment_run_name,
2883                    **(experiment_metadata or {}),
2884                }
2885
2886                trace_id = span.trace_id
2887                dataset_id = None
2888                dataset_item_id = None
2889                dataset_run_id = None
2890
2891                # Link to dataset run if this is a dataset item
2892                if hasattr(item, "id") and hasattr(item, "dataset_id"):
2893                    try:
2894                        # Use sync API to avoid event loop issues when run_async_safely
2895                        # creates multiple event loops across different threads
2896                        dataset_run_item = await asyncio.to_thread(
2897                            self.api.dataset_run_items.create,
2898                            run_name=experiment_run_name,
2899                            run_description=experiment_description,
2900                            metadata=experiment_metadata,
2901                            dataset_item_id=item.id,  # type: ignore
2902                            trace_id=trace_id,
2903                            observation_id=span.id,
2904                            dataset_version=dataset_version,
2905                        )
2906
2907                        dataset_run_id = dataset_run_item.dataset_run_id
2908
2909                    except Exception as e:
2910                        langfuse_logger.error(f"Failed to create dataset run item: {e}")
2911
2912                if (
2913                    not isinstance(item, dict)
2914                    and hasattr(item, "dataset_id")
2915                    and hasattr(item, "id")
2916                ):
2917                    dataset_id = item.dataset_id
2918                    dataset_item_id = item.id
2919
2920                    final_observation_metadata.update(
2921                        {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id}
2922                    )
2923
2924                if isinstance(item_metadata, dict):
2925                    final_observation_metadata.update(item_metadata)
2926
2927                experiment_id = dataset_run_id or fallback_experiment_id
2928                experiment_item_id = (
2929                    dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16]
2930                )
2931                span._otel_span.set_attributes(
2932                    {
2933                        k: v
2934                        for k, v in {
2935                            LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT,
2936                            LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description,
2937                            LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize(
2938                                expected_output
2939                            ),
2940                        }.items()
2941                        if v is not None
2942                    }
2943                )
2944
2945                propagated_experiment_attributes = PropagatedExperimentAttributes(
2946                    experiment_id=experiment_id,
2947                    experiment_name=experiment_run_name,
2948                    experiment_metadata=_flatten_and_serialize_metadata_values(
2949                        experiment_metadata
2950                    ),
2951                    experiment_dataset_id=dataset_id,
2952                    experiment_item_id=experiment_item_id,
2953                    experiment_item_metadata=_flatten_and_serialize_metadata_values(
2954                        item_metadata if isinstance(item_metadata, dict) else None
2955                    ),
2956                    experiment_item_root_observation_id=span.id,
2957                )
2958
2959                with _propagate_attributes(experiment=propagated_experiment_attributes):
2960                    output = await _run_task(task, item)
2961
2962                span.update(
2963                    input=input_data,
2964                    output=output,
2965                    metadata=final_observation_metadata,
2966                )
2967
2968            except Exception as e:
2969                span.update(
2970                    output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
2971                )
2972                raise e
2973
2974            # Run evaluators
2975            evaluations = []
2976
2977            for evaluator in evaluators:
2978                try:
2979                    eval_metadata: Optional[Dict[str, Any]] = None
2980
2981                    if isinstance(item, dict):
2982                        eval_metadata = item.get("metadata")
2983                    elif hasattr(item, "metadata"):
2984                        eval_metadata = item.metadata
2985
2986                    with _propagate_attributes(
2987                        experiment=propagated_experiment_attributes
2988                    ):
2989                        eval_results = await _run_evaluator(
2990                            evaluator,
2991                            input=input_data,
2992                            output=output,
2993                            expected_output=expected_output,
2994                            metadata=eval_metadata,
2995                        )
2996                        evaluations.extend(eval_results)
2997
2998                        # Store evaluations as scores
2999                        for evaluation in eval_results:
3000                            self.create_score(
3001                                trace_id=trace_id,
3002                                observation_id=span.id,
3003                                name=evaluation.name,
3004                                value=evaluation.value,  # type: ignore
3005                                comment=evaluation.comment,
3006                                metadata=evaluation.metadata,
3007                                config_id=evaluation.config_id,
3008                                data_type=evaluation.data_type,  # type: ignore
3009                            )
3010
3011                except Exception as e:
3012                    langfuse_logger.error(f"Evaluator failed: {e}")
3013
3014            # Run composite evaluator if provided and we have evaluations
3015            if composite_evaluator and evaluations:
3016                try:
3017                    composite_eval_metadata: Optional[Dict[str, Any]] = None
3018                    if isinstance(item, dict):
3019                        composite_eval_metadata = item.get("metadata")
3020                    elif hasattr(item, "metadata"):
3021                        composite_eval_metadata = item.metadata
3022
3023                    with _propagate_attributes(
3024                        experiment=propagated_experiment_attributes
3025                    ):
3026                        result = composite_evaluator(
3027                            input=input_data,
3028                            output=output,
3029                            expected_output=expected_output,
3030                            metadata=composite_eval_metadata,
3031                            evaluations=evaluations,
3032                        )
3033
3034                        # Handle async composite evaluators
3035                        if asyncio.iscoroutine(result):
3036                            result = await result
3037
3038                        # Normalize to list
3039                        composite_evals: List[Evaluation] = []
3040                        if isinstance(result, (dict, Evaluation)):
3041                            composite_evals = [result]  # type: ignore
3042                        elif isinstance(result, list):
3043                            composite_evals = result  # type: ignore
3044
3045                        # Store composite evaluations as scores and add to evaluations list
3046                        for composite_evaluation in composite_evals:
3047                            self.create_score(
3048                                trace_id=trace_id,
3049                                observation_id=span.id,
3050                                name=composite_evaluation.name,
3051                                value=composite_evaluation.value,  # type: ignore
3052                                comment=composite_evaluation.comment,
3053                                metadata=composite_evaluation.metadata,
3054                                config_id=composite_evaluation.config_id,
3055                                data_type=composite_evaluation.data_type,  # type: ignore
3056                            )
3057                            evaluations.append(composite_evaluation)
3058
3059                except Exception as e:
3060                    langfuse_logger.error(f"Composite evaluator failed: {e}")
3061
3062            return ExperimentItemResult(
3063                item=item,
3064                output=output,
3065                evaluations=evaluations,
3066                trace_id=trace_id,
3067                dataset_run_id=dataset_run_id,
3068            )
3069
3070    def _create_experiment_run_name(
3071        self, *, name: Optional[str] = None, run_name: Optional[str] = None
3072    ) -> str:
3073        if run_name:
3074            return run_name
3075
3076        iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z")
3077
3078        return f"{name} - {iso_timestamp}"
3079
3080    def run_batched_evaluation(
3081        self,
3082        *,
3083        scope: Literal["traces", "observations"],
3084        mapper: MapperFunction,
3085        filter: Optional[str] = None,
3086        fetch_batch_size: int = 50,
3087        fetch_trace_fields: Optional[str] = None,
3088        max_items: Optional[int] = None,
3089        max_retries: int = 3,
3090        evaluators: List[EvaluatorFunction],
3091        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
3092        max_concurrency: int = 5,
3093        metadata: Optional[Dict[str, Any]] = None,
3094        _add_observation_scores_to_trace: bool = False,
3095        _additional_trace_tags: Optional[List[str]] = None,
3096        resume_from: Optional[BatchEvaluationResumeToken] = None,
3097        verbose: bool = False,
3098    ) -> BatchEvaluationResult:
3099        """Fetch traces or observations and run evaluations on each item.
3100
3101        This method provides a powerful way to evaluate existing data in Langfuse at scale.
3102        It fetches items based on filters, transforms them using a mapper function, runs
3103        evaluators on each item, and creates scores that are linked back to the original
3104        entities. This is ideal for:
3105
3106        - Running evaluations on production traces after deployment
3107        - Backtesting new evaluation metrics on historical data
3108        - Batch scoring of observations for quality monitoring
3109        - Periodic evaluation runs on recent data
3110
3111        The method uses a streaming/pipeline approach to process items in batches, making
3112        it memory-efficient for large datasets. It includes comprehensive error handling,
3113        retry logic, and resume capability for long-running evaluations.
3114
3115        Args:
3116            scope: The type of items to evaluate. Must be one of:
3117                - "traces": Evaluate complete traces with all their observations
3118                - "observations": Evaluate individual observations (spans, generations, events)
3119            mapper: Function that transforms API response objects into evaluator inputs.
3120                Receives a trace/observation object and returns an EvaluatorInputs
3121                instance with input, output, expected_output, and metadata fields.
3122                Can be sync or async.
3123            evaluators: List of evaluation functions to run on each item. Each evaluator
3124                receives the mapped inputs and returns Evaluation object(s). Evaluator
3125                failures are logged but don't stop the batch evaluation.
3126            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
3127                - '{"tags": ["production"]}'
3128                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
3129                Default: None (fetches all items).
3130            fetch_batch_size: Number of items to fetch per API call and hold in memory.
3131                Larger values may be faster but use more memory. Default: 50.
3132            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
3133            max_items: Maximum total number of items to process. If None, processes all
3134                items matching the filter. Useful for testing or limiting evaluation runs.
3135                Default: None (process all).
3136            max_concurrency: Maximum number of items to evaluate concurrently. Controls
3137                parallelism and resource usage. Default: 5.
3138            composite_evaluator: Optional function that creates a composite score from
3139                item-level evaluations. Receives the original item and its evaluations,
3140                returns a single Evaluation. Useful for weighted averages or combined metrics.
3141                Default: None.
3142            metadata: Optional metadata dict to add to all created scores. Useful for
3143                tracking evaluation runs, versions, or other context. Default: None.
3144            max_retries: Maximum number of retry attempts for failed batch fetches.
3145                Uses exponential backoff (1s, 2s, 4s). Default: 3.
3146            verbose: If True, logs progress information to console. Useful for monitoring
3147                long-running evaluations. Default: False.
3148            resume_from: Optional resume token from a previous incomplete run. Allows
3149                continuing evaluation after interruption or failure. Default: None.
3150
3151
3152        Returns:
3153            BatchEvaluationResult containing:
3154                - total_items_fetched: Number of items fetched from API
3155                - total_items_processed: Number of items successfully evaluated
3156                - total_items_failed: Number of items that failed evaluation
3157                - total_scores_created: Scores created by item-level evaluators
3158                - total_composite_scores_created: Scores created by composite evaluator
3159                - total_evaluations_failed: Individual evaluator failures
3160                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3161                - resume_token: Token for resuming if incomplete (None if completed)
3162                - completed: True if all items processed
3163                - duration_seconds: Total execution time
3164                - failed_item_ids: IDs of items that failed
3165                - error_summary: Error types and counts
3166                - has_more_items: True if max_items reached but more exist
3167
3168        Raises:
3169            ValueError: If invalid scope is provided.
3170
3171        Examples:
3172            Basic trace evaluation:
3173            ```python
3174            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3175
3176            client = Langfuse()
3177
3178            # Define mapper to extract fields from traces
3179            def trace_mapper(trace):
3180                return EvaluatorInputs(
3181                    input=trace.input,
3182                    output=trace.output,
3183                    expected_output=None,
3184                    metadata={"trace_id": trace.id}
3185                )
3186
3187            # Define evaluator
3188            def length_evaluator(*, input, output, expected_output, metadata):
3189                return Evaluation(
3190                    name="output_length",
3191                    value=len(output) if output else 0
3192                )
3193
3194            # Run batch evaluation
3195            result = client.run_batched_evaluation(
3196                scope="traces",
3197                mapper=trace_mapper,
3198                evaluators=[length_evaluator],
3199                filter='{"tags": ["production"]}',
3200                max_items=1000,
3201                verbose=True
3202            )
3203
3204            print(f"Processed {result.total_items_processed} traces")
3205            print(f"Created {result.total_scores_created} scores")
3206            ```
3207
3208            Evaluation with composite scorer:
3209            ```python
3210            def accuracy_evaluator(*, input, output, expected_output, metadata):
3211                # ... evaluation logic
3212                return Evaluation(name="accuracy", value=0.85)
3213
3214            def relevance_evaluator(*, input, output, expected_output, metadata):
3215                # ... evaluation logic
3216                return Evaluation(name="relevance", value=0.92)
3217
3218            def composite_evaluator(*, item, evaluations):
3219                # Weighted average of evaluations
3220                weights = {"accuracy": 0.6, "relevance": 0.4}
3221                total = sum(
3222                    e.value * weights.get(e.name, 0)
3223                    for e in evaluations
3224                    if isinstance(e.value, (int, float))
3225                )
3226                return Evaluation(
3227                    name="composite_score",
3228                    value=total,
3229                    comment=f"Weighted average of {len(evaluations)} metrics"
3230                )
3231
3232            result = client.run_batched_evaluation(
3233                scope="traces",
3234                mapper=trace_mapper,
3235                evaluators=[accuracy_evaluator, relevance_evaluator],
3236                composite_evaluator=composite_evaluator,
3237                filter='{"user_id": "important_user"}',
3238                verbose=True
3239            )
3240            ```
3241
3242            Handling incomplete runs with resume:
3243            ```python
3244            # Initial run that may fail or timeout
3245            result = client.run_batched_evaluation(
3246                scope="observations",
3247                mapper=obs_mapper,
3248                evaluators=[my_evaluator],
3249                max_items=10000,
3250                verbose=True
3251            )
3252
3253            # Check if incomplete
3254            if not result.completed and result.resume_token:
3255                print(f"Processed {result.resume_token.items_processed} items before interruption")
3256
3257                # Resume from where it left off
3258                result = client.run_batched_evaluation(
3259                    scope="observations",
3260                    mapper=obs_mapper,
3261                    evaluators=[my_evaluator],
3262                    resume_from=result.resume_token,
3263                    verbose=True
3264                )
3265
3266            print(f"Total items processed: {result.total_items_processed}")
3267            ```
3268
3269            Monitoring evaluator performance:
3270            ```python
3271            result = client.run_batched_evaluation(...)
3272
3273            for stats in result.evaluator_stats:
3274                success_rate = stats.successful_runs / stats.total_runs
3275                print(f"{stats.name}:")
3276                print(f"  Success rate: {success_rate:.1%}")
3277                print(f"  Scores created: {stats.total_scores_created}")
3278
3279                if stats.failed_runs > 0:
3280                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3281            ```
3282
3283        Note:
3284            - Evaluator failures are logged but don't stop the batch evaluation
3285            - Individual item failures are tracked but don't stop processing
3286            - Fetch failures are retried with exponential backoff
3287            - All scores are automatically flushed to Langfuse at the end
3288            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3289        """
3290        runner = BatchEvaluationRunner(self)
3291
3292        return cast(
3293            BatchEvaluationResult,
3294            run_async_safely(
3295                runner.run_async(
3296                    scope=scope,
3297                    mapper=mapper,
3298                    evaluators=evaluators,
3299                    filter=filter,
3300                    fetch_batch_size=fetch_batch_size,
3301                    fetch_trace_fields=fetch_trace_fields,
3302                    max_items=max_items,
3303                    max_concurrency=max_concurrency,
3304                    composite_evaluator=composite_evaluator,
3305                    metadata=metadata,
3306                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
3307                    _additional_trace_tags=_additional_trace_tags,
3308                    max_retries=max_retries,
3309                    verbose=verbose,
3310                    resume_from=resume_from,
3311                )
3312            ),
3313        )
3314
3315    def auth_check(self) -> bool:
3316        """Check if the provided credentials (public and secret key) are valid.
3317
3318        Raises:
3319            Exception: If no projects were found for the provided credentials.
3320
3321        Note:
3322            This method is blocking. It is discouraged to use it in production code.
3323        """
3324        try:
3325            projects = self.api.projects.get()
3326            langfuse_logger.debug(
3327                f"Auth check successful, found {len(projects.data)} projects"
3328            )
3329            if len(projects.data) == 0:
3330                raise Exception(
3331                    "Auth check failed, no project found for the keys provided."
3332                )
3333            return True
3334
3335        except AttributeError as e:
3336            langfuse_logger.warning(
3337                f"Auth check failed: Client not properly initialized. Error: {e}"
3338            )
3339            return False
3340
3341        except Error as e:
3342            handle_fern_exception(e)
3343            raise e
3344
3345    def create_dataset(
3346        self,
3347        *,
3348        name: str,
3349        description: Optional[str] = None,
3350        metadata: Optional[Any] = None,
3351        input_schema: Optional[Any] = None,
3352        expected_output_schema: Optional[Any] = None,
3353    ) -> Dataset:
3354        """Create a dataset with the given name on Langfuse.
3355
3356        Args:
3357            name: Name of the dataset to create.
3358            description: Description of the dataset. Defaults to None.
3359            metadata: Additional metadata. Defaults to None.
3360            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3361            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3362
3363        Returns:
3364            Dataset: The created dataset as returned by the Langfuse API.
3365        """
3366        try:
3367            langfuse_logger.debug(f"Creating datasets {name}")
3368
3369            result = self.api.datasets.create(
3370                name=name,
3371                description=description,
3372                metadata=metadata,
3373                input_schema=input_schema,
3374                expected_output_schema=expected_output_schema,
3375            )
3376
3377            return cast(Dataset, result)
3378
3379        except Error as e:
3380            handle_fern_exception(e)
3381            raise e
3382
3383    def create_dataset_item(
3384        self,
3385        *,
3386        dataset_name: str,
3387        input: Optional[Any] = None,
3388        expected_output: Optional[Any] = None,
3389        metadata: Optional[Any] = None,
3390        source_trace_id: Optional[str] = None,
3391        source_observation_id: Optional[str] = None,
3392        status: Optional[DatasetStatus] = None,
3393        id: Optional[str] = None,
3394    ) -> DatasetItem:
3395        """Create a dataset item.
3396
3397        Upserts if an item with id already exists.
3398
3399        Args:
3400            dataset_name: Name of the dataset in which the dataset item should be created.
3401            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3402            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3403            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3404            source_trace_id: Id of the source trace. Defaults to None.
3405            source_observation_id: Id of the source observation. Defaults to None.
3406            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3407            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3408
3409        Returns:
3410            DatasetItem: The created dataset item as returned by the Langfuse API.
3411
3412        Example:
3413            ```python
3414            from langfuse import Langfuse
3415
3416            langfuse = Langfuse()
3417
3418            # Uploading items to the Langfuse dataset named "capital_cities"
3419            langfuse.create_dataset_item(
3420                dataset_name="capital_cities",
3421                input={"input": {"country": "Italy"}},
3422                expected_output={"expected_output": "Rome"},
3423                metadata={"foo": "bar"}
3424            )
3425            ```
3426        """
3427        try:
3428            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
3429
3430            # Media uploads must reference the (dataset, item) they belong to, and
3431            # the item need not exist yet — so settle on the item id up front and
3432            # reuse it for the create call below.
3433            item_id = id if id is not None else str(uuid.uuid4())
3434
3435            # Single pass per field: swap each LangfuseMedia for its reference
3436            # string (derived from content, not the upload) and collect the media
3437            # still to upload, deduped by media id and tagged with its field.
3438            pending_media: Dict[str, Tuple[LangfuseMedia, str]] = {}
3439            input = self._process_dataset_item_media(
3440                data=input,
3441                pending_media=pending_media,
3442                field=DatasetItemMediaReferenceField.INPUT.value,
3443            )
3444            expected_output = self._process_dataset_item_media(
3445                data=expected_output,
3446                pending_media=pending_media,
3447                field=DatasetItemMediaReferenceField.EXPECTED_OUTPUT.value,
3448            )
3449            metadata = self._process_dataset_item_media(
3450                data=metadata,
3451                pending_media=pending_media,
3452                field=DatasetItemMediaReferenceField.METADATA.value,
3453            )
3454
3455            # The upload needs the dataset id, but the create API only takes the
3456            # name. Resolve it once, and only when there is actually media to
3457            # upload — a plain item pays no extra datasets.get round-trip.
3458            if pending_media:
3459                assert self._resources is not None
3460                dataset_id = self.api.datasets.get(self._url_encode(dataset_name)).id
3461                for media, field in pending_media.values():
3462                    self._resources._media_manager._upload_media_sync(
3463                        media=media,
3464                        dataset_id=dataset_id,
3465                        dataset_item_id=item_id,
3466                        field=field,
3467                    )
3468
3469            result = self.api.dataset_items.create(
3470                dataset_name=dataset_name,
3471                input=input,
3472                expected_output=expected_output,
3473                metadata=metadata,
3474                source_trace_id=source_trace_id,
3475                source_observation_id=source_observation_id,
3476                status=status,
3477                id=item_id,
3478            )
3479
3480            return cast(DatasetItem, result)
3481        except Error as e:
3482            handle_fern_exception(e)
3483            raise e
3484
3485    def _process_dataset_item_media(
3486        self,
3487        *,
3488        data: Any,
3489        pending_media: Dict[str, Tuple[LangfuseMedia, str]],
3490        field: str,
3491    ) -> Any:
3492        """Swap each ``LangfuseMedia`` for its reference string in ``data``.
3493
3494        Each replaced media is recorded in ``pending_media`` (keyed by media id,
3495        so the same media across fields uploads once) for the caller to upload
3496        after the dataset id has been resolved.
3497        """
3498        if self._resources is None:
3499            return data
3500
3501        max_levels = 10
3502
3503        def _process_data_recursively(
3504            data: Any, level: int, ancestor_container_ids: set[int]
3505        ) -> Any:
3506            if isinstance(data, LangfuseMedia):
3507                reference_string = data._reference_string
3508                media_id = data._media_id
3509                if reference_string is None or media_id is None:
3510                    raise ValueError(
3511                        "Cannot create dataset item with invalid LangfuseMedia."
3512                    )
3513                # First field a media appears in wins; later duplicates dedupe.
3514                pending_media.setdefault(media_id, (data, field))
3515                return reference_string
3516
3517            if isinstance(data, LangfuseMediaReference):
3518                return data.reference_string if data.reference_string else data
3519
3520            # Tuples are intentionally excluded: namedtuple subclasses can't be
3521            # rebuilt from an iterable, so media inside them is left untouched.
3522            if not isinstance(data, (list, set, frozenset, dict)):
3523                return data
3524
3525            # Container ids only protect against recursive cycles.
3526            data_id = id(data)
3527            if data_id in ancestor_container_ids or level > max_levels:
3528                return data
3529
3530            next_ancestor_container_ids = ancestor_container_ids | {data_id}
3531
3532            if isinstance(data, (list, set, frozenset)):
3533                processed = (
3534                    _process_data_recursively(
3535                        item, level + 1, next_ancestor_container_ids
3536                    )
3537                    for item in data
3538                )
3539                return type(data)(processed)
3540
3541            return {
3542                key: _process_data_recursively(
3543                    value, level + 1, next_ancestor_container_ids
3544                )
3545                for key, value in data.items()
3546            }
3547
3548        return _process_data_recursively(data, 1, set())
3549
3550    def _hydrate_dataset_item_media_references(self, item: DatasetItem) -> DatasetItem:
3551        media_references = item.media_references or []
3552        if not media_references:
3553            return item
3554
3555        # Map the API enum member to the snake_case model attribute so this keeps
3556        # working regardless of the enum's wire value (e.g. "expectedOutput").
3557        attr_by_field = {
3558            DatasetItemMediaReferenceField.INPUT: "input",
3559            DatasetItemMediaReferenceField.EXPECTED_OUTPUT: "expected_output",
3560            DatasetItemMediaReferenceField.METADATA: "metadata",
3561        }
3562        hydrated_fields = {
3563            "input": item.input,
3564            "expected_output": item.expected_output,
3565            "metadata": item.metadata,
3566        }
3567
3568        for media_reference in media_references:
3569            media = media_reference.media
3570            field = attr_by_field.get(media_reference.field)
3571            if field is None:
3572                continue
3573
3574            replacement = LangfuseMediaReference(
3575                media_id=media.media_id,
3576                content_type=media.content_type,
3577                url=media.url,
3578                url_expiry=media.url_expiry,
3579                content_length=media.content_length,
3580                reference_string=media_reference.reference_string,
3581            )
3582            hydrated_fields[field] = self._replace_json_path_value(
3583                value=hydrated_fields[field],
3584                path=media_reference.json_path,
3585                replacement=replacement,
3586            )
3587
3588        return item.model_copy(
3589            update={
3590                "input": hydrated_fields["input"],
3591                "expected_output": hydrated_fields["expected_output"],
3592                "metadata": hydrated_fields["metadata"],
3593            }
3594        )
3595
3596    def _replace_json_path_value(
3597        self, *, value: Any, path: str, replacement: LangfuseMediaReference
3598    ) -> Any:
3599        try:
3600            return json_path.set_value_at_path(value, path, replacement)
3601        except Exception as e:
3602            langfuse_logger.warning(
3603                f"Failed to hydrate dataset media reference at JSONPath {path}",
3604                exc_info=e,
3605            )
3606
3607            return value
3608
    def resolve_media_references(
        self,
        *,
        obj: Any,
        resolve_with: Literal["base64_data_uri"],
        max_depth: int = 10,
        content_fetch_timeout_seconds: int = 5,
    ) -> Any:
        """Replace media reference strings in an object with base64 data URIs.

        This method recursively traverses an object (up to max_depth) looking for media reference strings
        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
        this Langfuse client and replaces the reference string with a base64 data URI.

        If fetching media content fails for a reference string, a warning is logged and the reference
        string is left unchanged.

        This is a thin wrapper: all arguments are forwarded unchanged to
        ``LangfuseMedia.resolve_media_references`` with ``langfuse_client=self``.

        Args:
            obj: The object to process. Can be a primitive value, array, or nested object.
                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
            resolve_with: The representation of the media content to replace the media reference string with.
                Currently only "base64_data_uri" is supported.
            max_depth: The maximum depth to traverse the object. Default is 10.
            content_fetch_timeout_seconds: The timeout in seconds for fetching media content. Default is 5.

        Returns:
            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

        Example:
            obj = {
                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
                "nested": {
                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
                }
            }

            # All arguments are keyword-only and the call is synchronous.
            result = langfuse.resolve_media_references(
                obj=obj, resolve_with="base64_data_uri"
            )

            # Result:
            # {
            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
            #     "nested": {
            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
            #     }
            # }
        """
        return LangfuseMedia.resolve_media_references(
            langfuse_client=self,
            obj=obj,
            resolve_with=resolve_with,
            max_depth=max_depth,
            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
        )
3663
3664    @overload
3665    def get_prompt(
3666        self,
3667        name: str,
3668        *,
3669        version: Optional[int] = None,
3670        label: Optional[str] = None,
3671        type: Literal["chat"],
3672        cache_ttl_seconds: Optional[int] = None,
3673        fallback: Optional[List[ChatMessageDict]] = None,
3674        max_retries: Optional[int] = None,
3675        fetch_timeout_seconds: Optional[int] = None,
3676    ) -> ChatPromptClient: ...
3677
3678    @overload
3679    def get_prompt(
3680        self,
3681        name: str,
3682        *,
3683        version: Optional[int] = None,
3684        label: Optional[str] = None,
3685        type: Literal["text"] = "text",
3686        cache_ttl_seconds: Optional[int] = None,
3687        fallback: Optional[str] = None,
3688        max_retries: Optional[int] = None,
3689        fetch_timeout_seconds: Optional[int] = None,
3690    ) -> TextPromptClient: ...
3691
3692    def get_prompt(
3693        self,
3694        name: str,
3695        *,
3696        version: Optional[int] = None,
3697        label: Optional[str] = None,
3698        type: Literal["chat", "text"] = "text",
3699        cache_ttl_seconds: Optional[int] = None,
3700        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3701        max_retries: Optional[int] = None,
3702        fetch_timeout_seconds: Optional[int] = None,
3703    ) -> PromptClient:
3704        """Get a prompt.
3705
3706        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3707        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3708        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3709        return the expired prompt as a fallback.
3710
3711        Args:
3712            name (str): The name of the prompt to retrieve.
3713
3714        Keyword Args:
3715            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3716            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3717            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3718            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3719            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3720            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3721            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3722            fetch_timeout_seconds: Optional[int]: The timeout in milliseconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
3723
3724        Returns:
3725            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
3726            - TextPromptClient, if type argument is 'text'.
3727            - ChatPromptClient, if type argument is 'chat'.
3728
3729        Raises:
3730            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3731            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3732        """
3733        if self._resources is None:
3734            raise Error(
3735                "SDK is not correctly initialized. Check the init logs for more details."
3736            )
3737        if version is not None and label is not None:
3738            raise ValueError("Cannot specify both version and label at the same time.")
3739
3740        if not name:
3741            raise ValueError("Prompt name cannot be empty.")
3742
3743        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3744        bounded_max_retries = self._get_bounded_max_retries(
3745            max_retries, default_max_retries=2, max_retries_upper_bound=4
3746        )
3747
3748        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3749        cached_prompt = self._resources.prompt_cache.get(cache_key)
3750
3751        if cached_prompt is None or cache_ttl_seconds == 0:
3752            langfuse_logger.debug(
3753                f"Prompt '{cache_key}' not found in cache or caching disabled."
3754            )
3755            try:
3756                return self._fetch_prompt_and_update_cache(
3757                    name,
3758                    version=version,
3759                    label=label,
3760                    ttl_seconds=cache_ttl_seconds,
3761                    max_retries=bounded_max_retries,
3762                    fetch_timeout_seconds=fetch_timeout_seconds,
3763                )
3764            except Exception as e:
3765                if fallback:
3766                    langfuse_logger.warning(
3767                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3768                    )
3769
3770                    fallback_client_args: Dict[str, Any] = {
3771                        "name": name,
3772                        "prompt": fallback,
3773                        "type": type,
3774                        "version": version or 0,
3775                        "config": {},
3776                        "labels": [label] if label else [],
3777                        "tags": [],
3778                    }
3779
3780                    if type == "text":
3781                        return TextPromptClient(
3782                            prompt=Prompt_Text(**fallback_client_args),
3783                            is_fallback=True,
3784                        )
3785
3786                    if type == "chat":
3787                        return ChatPromptClient(
3788                            prompt=Prompt_Chat(**fallback_client_args),
3789                            is_fallback=True,
3790                        )
3791
3792                raise e
3793
3794        if cached_prompt.is_expired():
3795            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3796            try:
3797                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3798                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3799
3800                def refresh_task() -> None:
3801                    self._fetch_prompt_and_update_cache(
3802                        name,
3803                        version=version,
3804                        label=label,
3805                        ttl_seconds=cache_ttl_seconds,
3806                        max_retries=bounded_max_retries,
3807                        fetch_timeout_seconds=fetch_timeout_seconds,
3808                    )
3809
3810                self._resources.prompt_cache.add_refresh_prompt_task_if_current(
3811                    cache_key,
3812                    cached_prompt,
3813                    refresh_task,
3814                )
3815                langfuse_logger.debug(
3816                    f"Returning stale prompt '{cache_key}' from cache."
3817                )
3818                # return stale prompt
3819                return cached_prompt.value
3820
3821            except Exception as e:
3822                langfuse_logger.warning(
3823                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3824                )
3825                # creation of refresh prompt task failed, return stale prompt
3826                return cached_prompt.value
3827
3828        return cached_prompt.value
3829
3830    def _fetch_prompt_and_update_cache(
3831        self,
3832        name: str,
3833        *,
3834        version: Optional[int] = None,
3835        label: Optional[str] = None,
3836        ttl_seconds: Optional[int] = None,
3837        max_retries: int,
3838        fetch_timeout_seconds: Optional[int],
3839    ) -> PromptClient:
3840        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3841        langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...")
3842
3843        try:
3844
3845            @backoff.on_exception(
3846                backoff.constant, Exception, max_tries=max_retries + 1, logger=None
3847            )
3848            def fetch_prompts() -> Any:
3849                return self.api.prompts.get(
3850                    self._url_encode(name),
3851                    version=version,
3852                    label=label,
3853                    request_options={
3854                        "timeout_in_seconds": fetch_timeout_seconds,
3855                    }
3856                    if fetch_timeout_seconds is not None
3857                    else None,
3858                )
3859
3860            prompt_response = fetch_prompts()
3861
3862            prompt: PromptClient
3863            if prompt_response.type == "chat":
3864                prompt = ChatPromptClient(prompt_response)
3865            else:
3866                prompt = TextPromptClient(prompt_response)
3867
3868            if self._resources is not None:
3869                self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds)
3870
3871            return prompt
3872
3873        except NotFoundError as not_found_error:
3874            langfuse_logger.warning(
3875                f"Prompt '{cache_key}' not found during refresh, evicting from cache."
3876            )
3877            if self._resources is not None:
3878                self._resources.prompt_cache.delete(cache_key)
3879            raise not_found_error
3880
3881        except Exception as e:
3882            langfuse_logger.error(
3883                f"Error while fetching prompt '{cache_key}': {str(e)}"
3884            )
3885            raise e
3886
3887    def _get_bounded_max_retries(
3888        self,
3889        max_retries: Optional[int],
3890        *,
3891        default_max_retries: int = 2,
3892        max_retries_upper_bound: int = 4,
3893    ) -> int:
3894        if max_retries is None:
3895            return default_max_retries
3896
3897        bounded_max_retries = min(
3898            max(max_retries, 0),
3899            max_retries_upper_bound,
3900        )
3901
3902        return bounded_max_retries
3903
3904    @overload
3905    def create_prompt(
3906        self,
3907        *,
3908        name: str,
3909        prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]],
3910        labels: List[str] = [],
3911        tags: Optional[List[str]] = None,
3912        type: Optional[Literal["chat"]],
3913        config: Optional[Any] = None,
3914        commit_message: Optional[str] = None,
3915    ) -> ChatPromptClient: ...
3916
3917    @overload
3918    def create_prompt(
3919        self,
3920        *,
3921        name: str,
3922        prompt: str,
3923        labels: List[str] = [],
3924        tags: Optional[List[str]] = None,
3925        type: Optional[Literal["text"]] = "text",
3926        config: Optional[Any] = None,
3927        commit_message: Optional[str] = None,
3928    ) -> TextPromptClient: ...
3929
3930    def create_prompt(
3931        self,
3932        *,
3933        name: str,
3934        prompt: Union[
3935            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3936        ],
3937        labels: List[str] = [],
3938        tags: Optional[List[str]] = None,
3939        type: Optional[Literal["chat", "text"]] = "text",
3940        config: Optional[Any] = None,
3941        commit_message: Optional[str] = None,
3942    ) -> PromptClient:
3943        """Create a new prompt in Langfuse.
3944
3945        Keyword Args:
3946            name : The name of the prompt to be created.
3947            prompt : The content of the prompt to be created.
3948            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3949            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3950            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3951            config: Additional structured data to be saved with the prompt. Defaults to None.
3952            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3953            commit_message: Optional string describing the change.
3954
3955        Returns:
3956            TextPromptClient: The prompt if type argument is 'text'.
3957            ChatPromptClient: The prompt if type argument is 'chat'.
3958        """
3959        try:
3960            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3961
3962            if type == "chat":
3963                if not isinstance(prompt, list):
3964                    raise ValueError(
3965                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3966                    )
3967                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
3968                    CreateChatPromptRequest(
3969                        name=name,
3970                        prompt=cast(Any, prompt),
3971                        labels=labels,
3972                        tags=tags,
3973                        config=config or {},
3974                        commit_message=commit_message,
3975                        type=CreateChatPromptType.CHAT,
3976                    )
3977                )
3978                server_prompt = self.api.prompts.create(request=request)
3979
3980                if self._resources is not None:
3981                    self._resources.prompt_cache.invalidate(name)
3982
3983                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3984
3985            if not isinstance(prompt, str):
3986                raise ValueError("For 'text' type, 'prompt' must be a string.")
3987
3988            request = CreateTextPromptRequest(
3989                name=name,
3990                prompt=prompt,
3991                labels=labels,
3992                tags=tags,
3993                config=config or {},
3994                commit_message=commit_message,
3995            )
3996
3997            server_prompt = self.api.prompts.create(request=request)
3998
3999            if self._resources is not None:
4000                self._resources.prompt_cache.invalidate(name)
4001
4002            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
4003
4004        except Error as e:
4005            handle_fern_exception(e)
4006            raise e
4007
    def update_prompt(
        self,
        *,
        name: str,
        version: int,
        new_labels: List[str] = [],
    ) -> Any:
        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.

        Args:
            name (str): The name of the prompt to update.
            version (int): The version number of the prompt to update.
            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].

        Returns:
            Prompt: The updated prompt from the Langfuse API.

        """
        # The prompt name is URL-encoded before it is handed to the API client.
        updated_prompt = self.api.prompt_version.update(
            name=self._url_encode(name),
            version=version,
            new_labels=new_labels,
        )

        # Invalidate only after the API call returned, using the raw
        # (unencoded) name as the cache key.
        if self._resources is not None:
            self._resources.prompt_cache.invalidate(name)

        return updated_prompt
4036
4037    def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str:
4038        # httpx ≥ 0.28 does its own WHATWG-compliant quoting (eg. encodes bare
4039        # “%”, “?”, “#”, “|”, … in query/path parts).  Re-quoting here would
4040        # double-encode, so we skip when the value is about to be sent straight
4041        # to httpx (`is_url_param=True`) and the installed version is ≥ 0.28.
4042        if is_url_param and Version(httpx.__version__) >= Version("0.28.0"):
4043            return url
4044
4045        # urllib.parse.quote does not escape slashes "/" by default; we need to add safe="" to force escaping
4046        # we need add safe="" to force escaping of slashes
4047        # This is necessary for prompts in prompt folders
4048        return urllib.parse.quote(url, safe="")
4049
4050    def clear_prompt_cache(self) -> None:
4051        """Clear the entire prompt cache, removing all cached prompts.
4052
4053        This method is useful when you want to force a complete refresh of all
4054        cached prompts, for example after major updates or when you need to
4055        ensure the latest versions are fetched from the server.
4056        """
4057        if self._resources is not None:
4058            self._resources.prompt_cache.clear()

Main client for Langfuse tracing and platform features.

This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.

The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.

Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.

Attributes:
  • api: Synchronous API client for Langfuse backend communication
  • async_api: Asynchronous API client for Langfuse backend communication
  • _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
  • public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
  • secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
  • base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
  • host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
  • timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
  • httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created. Fork safety: httpx.Client is thread-safe but not process-safe. When using fork()-based servers (e.g. Gunicorn with --preload), the SDK automatically recreates its internally-managed HTTP client in child processes after fork. A custom httpx_client is intentionally left as-is (the fork-inherited copy is reused), so you retain the opportunity to handle process-safety yourself — for example by registering your own os.register_at_fork(after_in_child=...) handler to close and reopen connections on the custom client.
  • debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
  • tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
  • flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
  • flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
  • environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
  • release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
  • media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
  • sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
  • mask (Optional[MaskFunction]): Function to mask sensitive data synchronously when Langfuse SDK attributes are created. This applies only to data set through Langfuse SDK APIs such as start_observation(), update(), and set_trace_io().
  • mask_otel_spans (Optional[MaskOtelSpansFunction]): Synchronous export-stage hook for masking raw OpenTelemetry span attributes before this Langfuse client sends them to Langfuse. Use this for spans created by third-party OpenTelemetry instrumentations, or when you need to inspect final span attributes after export filtering and Langfuse media handling. It does not modify spans already exported through other OpenTelemetry exporters.

    The hook receives one OpenTelemetry export batch. A batch is not guaranteed to contain a complete trace, request, or Langfuse observation tree. The hook usually runs on the OpenTelemetry batch span processor worker thread; during flush() and shutdown it may run on the caller thread. Keep it synchronous, deterministic, and fast.

    Return None to leave the batch unchanged. Return MaskOtelSpansResult with OtelSpanPatch values to delete or replace attributes on selected spans. If the hook raises or returns an invalid batch result, Langfuse drops the whole export batch. If one returned span patch is invalid, Langfuse drops only that span from the Langfuse export.

    Example:

    from typing import Optional
    
    from langfuse import Langfuse
    from langfuse.types import (
        MaskOtelSpansParams,
        MaskOtelSpansResult,
        OtelSpanPatch,
    )
    
    def mask_otel_spans(
        *, params: MaskOtelSpansParams
    ) -> Optional[MaskOtelSpansResult]:
        patches = {}
    
        for identifier, span in params.spans.items():
            if "gen_ai.prompt.0.content" in span.attributes:
                patches[identifier] = OtelSpanPatch(
                    delete_attributes=("gen_ai.prompt.0.content",),
                    set_attributes={"masking.applied": True},
                )
    
        return MaskOtelSpansResult(span_patches=patches)
    
    langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
    
  • blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use should_export_span instead. Equivalent behavior:

    from langfuse.span_filter import is_default_export_span
    blocked = {"sqlite", "requests"}
    
    should_export_span = lambda span: (
        is_default_export_span(span)
        and (
            span.instrumentation_scope is None
            or span.instrumentation_scope.name not in blocked
        )
    )
    
  • should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with gen_ai.* attributes, and known LLM instrumentation scopes).

  • additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If span_exporter is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
  • tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. Setting this is useful when Langfuse tracing should be kept separate from other libraries that emit OpenTelemetry spans. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
  • id_generator (Optional[IdGenerator]): OpenTelemetry ID generator to use when Langfuse creates its own TracerProvider. If omitted, the OpenTelemetry SDK default is used. If tracer_provider is provided, or an OpenTelemetry TracerProvider is already registered globally, configure the ID generator on that provider instead.
  • span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire base_url, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include x-langfuse-ingestion-version=4 on the exporter to enable real time processing of exported spans.
Example:
from langfuse import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    host="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_observation(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_observation(
        name="generate-response",
        as_type="generation",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
Langfuse( *, public_key: Optional[str] = None, secret_key: Optional[str] = None, base_url: Optional[str] = None, host: Optional[str] = None, timeout: Optional[int] = None, httpx_client: Optional[httpx.Client] = None, debug: bool = False, tracing_enabled: Optional[bool] = True, flush_at: Optional[int] = None, flush_interval: Optional[float] = None, environment: Optional[str] = None, release: Optional[str] = None, media_upload_thread_count: Optional[int] = None, sample_rate: Optional[float] = None, mask: Optional[langfuse.types.MaskFunction] = None, mask_otel_spans: Optional[MaskOtelSpansFunction] = None, blocked_instrumentation_scopes: Optional[List[str]] = None, should_export_span: Optional[Callable[[opentelemetry.sdk.trace.ReadableSpan], bool]] = None, additional_headers: Optional[Dict[str, str]] = None, tracer_provider: Optional[opentelemetry.sdk.trace.TracerProvider] = None, id_generator: Optional[opentelemetry.sdk.trace.id_generator.IdGenerator] = None, span_exporter: Optional[opentelemetry.sdk.trace.export.SpanExporter] = None)
290    def __init__(
291        self,
292        *,
293        public_key: Optional[str] = None,
294        secret_key: Optional[str] = None,
295        base_url: Optional[str] = None,
296        host: Optional[str] = None,
297        timeout: Optional[int] = None,
298        httpx_client: Optional[httpx.Client] = None,
299        debug: bool = False,
300        tracing_enabled: Optional[bool] = True,
301        flush_at: Optional[int] = None,
302        flush_interval: Optional[float] = None,
303        environment: Optional[str] = None,
304        release: Optional[str] = None,
305        media_upload_thread_count: Optional[int] = None,
306        sample_rate: Optional[float] = None,
307        mask: Optional[MaskFunction] = None,
308        mask_otel_spans: Optional[MaskOtelSpansFunction] = None,
309        blocked_instrumentation_scopes: Optional[List[str]] = None,
310        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
311        additional_headers: Optional[Dict[str, str]] = None,
312        tracer_provider: Optional[TracerProvider] = None,
313        id_generator: Optional[IdGenerator] = None,
314        span_exporter: Optional[SpanExporter] = None,
315    ):
316        self._base_url = (
317            base_url
318            or os.environ.get(LANGFUSE_BASE_URL)
319            or host
320            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
321        )
322        self._environment = environment or cast(
323            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
324        )
325        self._release = (
326            release
327            or os.environ.get(LANGFUSE_RELEASE, None)
328            or get_common_release_envs()
329        )
330        self._project_id: Optional[str] = None
331        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
332        if not 0.0 <= sample_rate <= 1.0:
333            raise ValueError(
334                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
335            )
336
337        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
338
339        self._tracing_enabled = (
340            tracing_enabled
341            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
342        )
343        if not self._tracing_enabled:
344            langfuse_logger.info(
345                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
346            )
347
348        debug = (
349            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
350        )
351        if debug:
352            logging.basicConfig(
353                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
354            )
355            langfuse_logger.setLevel(logging.DEBUG)
356
357        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
358        if public_key is None:
359            langfuse_logger.warning(
360                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
361                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
362            )
363            self._otel_tracer = otel_trace_api.NoOpTracer()
364            return
365
366        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
367        if secret_key is None:
368            langfuse_logger.warning(
369                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
370                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
371            )
372            self._otel_tracer = otel_trace_api.NoOpTracer()
373            return
374
375        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
376            langfuse_logger.warning(
377                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
378            )
379
380        if blocked_instrumentation_scopes is not None:
381            warnings.warn(
382                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
383                "Use `should_export_span` instead. Example: "
384                "from langfuse.span_filter import is_default_export_span; "
385                'blocked={"scope"}; should_export_span=lambda span: '
386                "is_default_export_span(span) and (span.instrumentation_scope is None or "
387                "span.instrumentation_scope.name not in blocked).",
388                DeprecationWarning,
389                stacklevel=2,
390            )
391
392        # Initialize api and tracer if requirements are met
393        self._resources = LangfuseResourceManager(
394            public_key=public_key,
395            secret_key=secret_key,
396            base_url=self._base_url,
397            timeout=timeout,
398            environment=self._environment,
399            release=release,
400            flush_at=flush_at,
401            flush_interval=flush_interval,
402            httpx_client=httpx_client,
403            media_upload_thread_count=media_upload_thread_count,
404            sample_rate=sample_rate,
405            mask=mask,
406            mask_otel_spans=mask_otel_spans,
407            tracing_enabled=self._tracing_enabled,
408            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
409            should_export_span=should_export_span,
410            additional_headers=additional_headers,
411            tracer_provider=tracer_provider,
412            id_generator=id_generator,
413            span_exporter=span_exporter,
414        )
415        self._mask = self._resources.mask
416
417        self._otel_tracer = (
418            self._resources.tracer
419            if self._tracing_enabled and self._resources.tracer is not None
420            else otel_trace_api.NoOpTracer()
421        )
api: langfuse.api.LangfuseAPI
423    @property
424    def api(self) -> LangfuseAPI:
425        if self._resources is None:
426            raise AttributeError("Langfuse client is not initialized")
427
428        return self._resources.api
async_api: langfuse.api.AsyncLangfuseAPI
437    @property
438    def async_api(self) -> AsyncLangfuseAPI:
439        if self._resources is None:
440            raise AttributeError("Langfuse client is not initialized")
441
442        return self._resources.async_api
def start_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> Union[LangfuseSpan, LangfuseGeneration, LangfuseAgent, LangfuseTool, LangfuseChain, LangfuseRetriever, LangfuseEvaluator, LangfuseEmbedding, LangfuseGuardrail]:
598    def start_observation(
599        self,
600        *,
601        trace_context: Optional[TraceContext] = None,
602        name: str,
603        as_type: ObservationTypeLiteralNoEvent = "span",
604        input: Optional[Any] = None,
605        output: Optional[Any] = None,
606        metadata: Optional[Any] = None,
607        version: Optional[str] = None,
608        level: Optional[SpanLevel] = None,
609        status_message: Optional[str] = None,
610        completion_start_time: Optional[datetime] = None,
611        model: Optional[str] = None,
612        model_parameters: Optional[Dict[str, MapValue]] = None,
613        usage_details: Optional[Dict[str, int]] = None,
614        cost_details: Optional[Dict[str, float]] = None,
615        prompt: Optional[PromptClient] = None,
616    ) -> Union[
617        LangfuseSpan,
618        LangfuseGeneration,
619        LangfuseAgent,
620        LangfuseTool,
621        LangfuseChain,
622        LangfuseRetriever,
623        LangfuseEvaluator,
624        LangfuseEmbedding,
625        LangfuseGuardrail,
626    ]:
627        """Create a new observation of the specified type.
628
629        This method creates a new observation but does not set it as the current span in the
630        context. To create and use an observation within a context, use start_as_current_observation().
631
632        Args:
633            trace_context: Optional context for connecting to an existing trace
634            name: Name of the observation
635            as_type: Type of observation to create (defaults to "span")
636            input: Input data for the operation
637            output: Output data from the operation
638            metadata: Additional metadata to associate with the observation
639            version: Version identifier for the code or component
640            level: Importance level of the observation
641            status_message: Optional status message for the observation
642            completion_start_time: When the model started generating (for generation types)
643            model: Name/identifier of the AI model used (for generation types)
644            model_parameters: Parameters used for the model (for generation types)
645            usage_details: Token usage information (for generation types)
646            cost_details: Cost information (for generation types)
647            prompt: Associated prompt template (for generation types)
648
649        Returns:
650            An observation object of the appropriate type that must be ended with .end()
651        """
652        if trace_context:
653            trace_id = trace_context.get("trace_id", None)
654            parent_span_id = trace_context.get("parent_span_id", None)
655
656            if trace_id:
657                remote_parent_span = self._create_remote_parent_span(
658                    trace_id=trace_id, parent_span_id=parent_span_id
659                )
660
661                with otel_trace_api.use_span(
662                    cast(otel_trace_api.Span, remote_parent_span)
663                ):
664                    otel_span = self._otel_tracer.start_span(name=name)
665                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
666
667                    return self._create_observation_from_otel_span(
668                        otel_span=otel_span,
669                        as_type=as_type,
670                        input=input,
671                        output=output,
672                        metadata=metadata,
673                        version=version,
674                        level=level,
675                        status_message=status_message,
676                        completion_start_time=completion_start_time,
677                        model=model,
678                        model_parameters=model_parameters,
679                        usage_details=usage_details,
680                        cost_details=cost_details,
681                        prompt=prompt,
682                    )
683
684        otel_span = self._otel_tracer.start_span(name=name)
685
686        return self._create_observation_from_otel_span(
687            otel_span=otel_span,
688            as_type=as_type,
689            input=input,
690            output=output,
691            metadata=metadata,
692            version=version,
693            level=level,
694            status_message=status_message,
695            completion_start_time=completion_start_time,
696            model=model,
697            model_parameters=model_parameters,
698            usage_details=usage_details,
699            cost_details=cost_details,
700            prompt=prompt,
701        )

Create a new observation of the specified type.

This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation
  • status_message: Optional status message for the observation
  • completion_start_time: When the model started generating (for generation types)
  • model: Name/identifier of the AI model used (for generation types)
  • model_parameters: Parameters used for the model (for generation types)
  • usage_details: Token usage information (for generation types)
  • cost_details: Cost information (for generation types)
  • prompt: Associated prompt template (for generation types)
Returns:

An observation object of the appropriate type that must be ended with .end()

def start_as_current_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, end_on_exit: Optional[bool] = None) -> Union[opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration], opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan], opentelemetry.util._decorator._AgnosticContextManager[LangfuseAgent], opentelemetry.util._decorator._AgnosticContextManager[LangfuseTool], opentelemetry.util._decorator._AgnosticContextManager[LangfuseChain], opentelemetry.util._decorator._AgnosticContextManager[LangfuseRetriever], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEvaluator], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEmbedding], opentelemetry.util._decorator._AgnosticContextManager[LangfuseGuardrail]]:
 931    def start_as_current_observation(
 932        self,
 933        *,
 934        trace_context: Optional[TraceContext] = None,
 935        name: str,
 936        as_type: ObservationTypeLiteralNoEvent = "span",
 937        input: Optional[Any] = None,
 938        output: Optional[Any] = None,
 939        metadata: Optional[Any] = None,
 940        version: Optional[str] = None,
 941        level: Optional[SpanLevel] = None,
 942        status_message: Optional[str] = None,
 943        completion_start_time: Optional[datetime] = None,
 944        model: Optional[str] = None,
 945        model_parameters: Optional[Dict[str, MapValue]] = None,
 946        usage_details: Optional[Dict[str, int]] = None,
 947        cost_details: Optional[Dict[str, float]] = None,
 948        prompt: Optional[PromptClient] = None,
 949        end_on_exit: Optional[bool] = None,
 950    ) -> Union[
 951        _AgnosticContextManager[LangfuseGeneration],
 952        _AgnosticContextManager[LangfuseSpan],
 953        _AgnosticContextManager[LangfuseAgent],
 954        _AgnosticContextManager[LangfuseTool],
 955        _AgnosticContextManager[LangfuseChain],
 956        _AgnosticContextManager[LangfuseRetriever],
 957        _AgnosticContextManager[LangfuseEvaluator],
 958        _AgnosticContextManager[LangfuseEmbedding],
 959        _AgnosticContextManager[LangfuseGuardrail],
 960    ]:
 961        """Create a new observation and set it as the current span in a context manager.
 962
 963        This method creates a new observation of the specified type and sets it as the
 964        current span within a context manager. Use this method with a 'with' statement to
 965        automatically handle the observation lifecycle within a code block.
 966
 967        The created observation will be the child of the current span in the context.
 968
 969        Args:
 970            trace_context: Optional context for connecting to an existing trace
 971            name: Name of the observation (e.g., function or operation name)
 972            as_type: Type of observation to create (defaults to "span")
 973            input: Input data for the operation (can be any JSON-serializable object)
 974            output: Output data from the operation (can be any JSON-serializable object)
 975            metadata: Additional metadata to associate with the observation
 976            version: Version identifier for the code or component
 977            level: Importance level of the observation (info, warning, error)
 978            status_message: Optional status message for the observation
 979            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 980
 981            The following parameters are available when as_type is: "generation" or "embedding".
 982            completion_start_time: When the model started generating the response
 983            model: Name/identifier of the AI model used (e.g., "gpt-4")
 984            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 985            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 986            cost_details: Cost information for the model call
 987            prompt: Associated prompt template from Langfuse prompt management
 988
 989        Returns:
 990            A context manager that yields the appropriate observation type based on as_type
 991
 992        Example:
 993            ```python
 994            # Create a span
 995            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
 996                # Do work
 997                result = process_data()
 998                span.update(output=result)
 999
1000                # Create a child span automatically
1001                with span.start_as_current_observation(name="sub-operation") as child_span:
1002                    # Do sub-operation work
1003                    child_span.update(output="sub-result")
1004
1005            # Create a tool observation
1006            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
1007                # Do tool work
1008                results = search_web(query)
1009                tool.update(output=results)
1010
1011            # Create a generation observation
1012            with langfuse.start_as_current_observation(
1013                name="answer-generation",
1014                as_type="generation",
1015                model="gpt-4"
1016            ) as generation:
1017                # Generate answer
1018                response = llm.generate(...)
1019                generation.update(output=response)
1020            ```
1021        """
1022        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
1023            if trace_context:
1024                trace_id = trace_context.get("trace_id", None)
1025                parent_span_id = trace_context.get("parent_span_id", None)
1026
1027                if trace_id:
1028                    remote_parent_span = self._create_remote_parent_span(
1029                        trace_id=trace_id, parent_span_id=parent_span_id
1030                    )
1031
1032                    return cast(
1033                        Union[
1034                            _AgnosticContextManager[LangfuseGeneration],
1035                            _AgnosticContextManager[LangfuseEmbedding],
1036                        ],
1037                        self._create_span_with_parent_context(
1038                            as_type=as_type,
1039                            name=name,
1040                            remote_parent_span=remote_parent_span,
1041                            parent=None,
1042                            end_on_exit=end_on_exit,
1043                            input=input,
1044                            output=output,
1045                            metadata=metadata,
1046                            version=version,
1047                            level=level,
1048                            status_message=status_message,
1049                            completion_start_time=completion_start_time,
1050                            model=model,
1051                            model_parameters=model_parameters,
1052                            usage_details=usage_details,
1053                            cost_details=cost_details,
1054                            prompt=prompt,
1055                        ),
1056                    )
1057
1058            return cast(
1059                Union[
1060                    _AgnosticContextManager[LangfuseGeneration],
1061                    _AgnosticContextManager[LangfuseEmbedding],
1062                ],
1063                self._start_as_current_otel_span_with_processed_media(
1064                    as_type=as_type,
1065                    name=name,
1066                    end_on_exit=end_on_exit,
1067                    input=input,
1068                    output=output,
1069                    metadata=metadata,
1070                    version=version,
1071                    level=level,
1072                    status_message=status_message,
1073                    completion_start_time=completion_start_time,
1074                    model=model,
1075                    model_parameters=model_parameters,
1076                    usage_details=usage_details,
1077                    cost_details=cost_details,
1078                    prompt=prompt,
1079                ),
1080            )
1081
1082        if as_type in get_observation_types_list(ObservationTypeSpanLike):
1083            if trace_context:
1084                trace_id = trace_context.get("trace_id", None)
1085                parent_span_id = trace_context.get("parent_span_id", None)
1086
1087                if trace_id:
1088                    remote_parent_span = self._create_remote_parent_span(
1089                        trace_id=trace_id, parent_span_id=parent_span_id
1090                    )
1091
1092                    return cast(
1093                        Union[
1094                            _AgnosticContextManager[LangfuseSpan],
1095                            _AgnosticContextManager[LangfuseAgent],
1096                            _AgnosticContextManager[LangfuseTool],
1097                            _AgnosticContextManager[LangfuseChain],
1098                            _AgnosticContextManager[LangfuseRetriever],
1099                            _AgnosticContextManager[LangfuseEvaluator],
1100                            _AgnosticContextManager[LangfuseGuardrail],
1101                        ],
1102                        self._create_span_with_parent_context(
1103                            as_type=as_type,
1104                            name=name,
1105                            remote_parent_span=remote_parent_span,
1106                            parent=None,
1107                            end_on_exit=end_on_exit,
1108                            input=input,
1109                            output=output,
1110                            metadata=metadata,
1111                            version=version,
1112                            level=level,
1113                            status_message=status_message,
1114                        ),
1115                    )
1116
1117            return cast(
1118                Union[
1119                    _AgnosticContextManager[LangfuseSpan],
1120                    _AgnosticContextManager[LangfuseAgent],
1121                    _AgnosticContextManager[LangfuseTool],
1122                    _AgnosticContextManager[LangfuseChain],
1123                    _AgnosticContextManager[LangfuseRetriever],
1124                    _AgnosticContextManager[LangfuseEvaluator],
1125                    _AgnosticContextManager[LangfuseGuardrail],
1126                ],
1127                self._start_as_current_otel_span_with_processed_media(
1128                    as_type=as_type,
1129                    name=name,
1130                    end_on_exit=end_on_exit,
1131                    input=input,
1132                    output=output,
1133                    metadata=metadata,
1134                    version=version,
1135                    level=level,
1136                    status_message=status_message,
1137                ),
1138            )
1139
1140        # This should never be reached since all valid types are handled above
1141        langfuse_logger.warning(
1142            f"Unknown observation type: {as_type}, falling back to span"
1143        )
1144        return self._start_as_current_otel_span_with_processed_media(
1145            as_type="span",
1146            name=name,
1147            end_on_exit=end_on_exit,
1148            input=input,
1149            output=output,
1150            metadata=metadata,
1151            version=version,
1152            level=level,
1153            status_message=status_message,
1154        )

Create a new observation and set it as the current span in a context manager.

This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.

The created observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation (e.g., function or operation name)
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation ("DEBUG", "DEFAULT", "WARNING" or "ERROR")
  • status_message: Optional status message for the observation
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
  The following parameters are available when as_type is "generation" or "embedding":
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A context manager that yields the appropriate observation type based on as_type

Example:
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_observation(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
def update_current_generation( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> None:
1346    def update_current_generation(
1347        self,
1348        *,
1349        name: Optional[str] = None,
1350        input: Optional[Any] = None,
1351        output: Optional[Any] = None,
1352        metadata: Optional[Any] = None,
1353        version: Optional[str] = None,
1354        level: Optional[SpanLevel] = None,
1355        status_message: Optional[str] = None,
1356        completion_start_time: Optional[datetime] = None,
1357        model: Optional[str] = None,
1358        model_parameters: Optional[Dict[str, MapValue]] = None,
1359        usage_details: Optional[Dict[str, int]] = None,
1360        cost_details: Optional[Dict[str, float]] = None,
1361        prompt: Optional[PromptClient] = None,
1362    ) -> None:
1363        """Update the current active generation span with new information.
1364
1365        This method updates the current generation span in the active context with
1366        additional information. It's useful for adding output, usage stats, or other
1367        details that become available during or after model generation.
1368
1369        Args:
1370            name: The generation name
1371            input: Updated input data for the model
1372            output: Output from the model (e.g., completions)
1373            metadata: Additional metadata to associate with the generation
1374            version: Version identifier for the model or component
1375            level: Importance level of the generation (info, warning, error)
1376            status_message: Optional status message for the generation
1377            completion_start_time: When the model started generating the response
1378            model: Name/identifier of the AI model used (e.g., "gpt-4")
1379            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1380            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1381            cost_details: Cost information for the model call
1382            prompt: Associated prompt template from Langfuse prompt management
1383
1384        Example:
1385            ```python
1386            with langfuse.start_as_current_generation(name="answer-query") as generation:
1387                # Initial setup and API call
1388                response = llm.generate(...)
1389
1390                # Update with results that weren't available at creation time
1391                langfuse.update_current_generation(
1392                    output=response.text,
1393                    usage_details={
1394                        "prompt_tokens": response.usage.prompt_tokens,
1395                        "completion_tokens": response.usage.completion_tokens
1396                    }
1397                )
1398            ```
1399        """
1400        if not self._tracing_enabled:
1401            langfuse_logger.debug(
1402                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1403            )
1404            return
1405
1406        current_otel_span = self._get_current_otel_span()
1407
1408        if current_otel_span is not None:
1409            generation = LangfuseGeneration(
1410                otel_span=current_otel_span, langfuse_client=self
1411            )
1412
1413            if name:
1414                current_otel_span.update_name(name)
1415
1416            generation.update(
1417                input=input,
1418                output=output,
1419                metadata=metadata,
1420                version=version,
1421                level=level,
1422                status_message=status_message,
1423                completion_start_time=completion_start_time,
1424                model=model,
1425                model_parameters=model_parameters,
1426                usage_details=usage_details,
1427                cost_details=cost_details,
1428                prompt=prompt,
1429            )

Update the current active generation span with new information.

This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.

Arguments:
  • name: The generation name
  • input: Updated input data for the model
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Severity level of the generation (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
def update_current_span( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> None:
1431    def update_current_span(
1432        self,
1433        *,
1434        name: Optional[str] = None,
1435        input: Optional[Any] = None,
1436        output: Optional[Any] = None,
1437        metadata: Optional[Any] = None,
1438        version: Optional[str] = None,
1439        level: Optional[SpanLevel] = None,
1440        status_message: Optional[str] = None,
1441    ) -> None:
1442        """Update the current active span with new information.
1443
1444        This method updates the current span in the active context with
1445        additional information. It's useful for adding outputs or metadata
1446        that become available during execution.
1447
1448        Args:
1449            name: The span name
1450            input: Updated input data for the operation
1451            output: Output data from the operation
1452            metadata: Additional metadata to associate with the span
1453            version: Version identifier for the code or component
1454            level: Importance level of the span (info, warning, error)
1455            status_message: Optional status message for the span
1456
1457        Example:
1458            ```python
1459            with langfuse.start_as_current_observation(name="process-data") as span:
1460                # Initial processing
1461                result = process_first_part()
1462
1463                # Update with intermediate results
1464                langfuse.update_current_span(metadata={"intermediate_result": result})
1465
1466                # Continue processing
1467                final_result = process_second_part(result)
1468
1469                # Final update
1470                langfuse.update_current_span(output=final_result)
1471            ```
1472        """
1473        if not self._tracing_enabled:
1474            langfuse_logger.debug(
1475                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1476            )
1477            return
1478
1479        current_otel_span = self._get_current_otel_span()
1480
1481        if current_otel_span is not None:
1482            span_class = self._get_span_class(
1483                self._get_observation_type_from_otel_span(current_otel_span)
1484            )
1485            span = span_class(
1486                otel_span=current_otel_span,
1487                langfuse_client=self,
1488                environment=self._environment,
1489                release=self._release,
1490            )
1491
1492            if name:
1493                current_otel_span.update_name(name)
1494
1495            span.update(
1496                input=input,
1497                output=output,
1498                metadata=metadata,
1499                version=version,
1500                level=level,
1501                status_message=status_message,
1502            )

Update the current active span with new information.

This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.

Arguments:
  • name: The span name
  • input: Updated input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Severity level of the span (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the span
Example:
with langfuse.start_as_current_observation(name="process-data") as span:
    # Initial processing
    result = process_first_part()

    # Update with intermediate results
    langfuse.update_current_span(metadata={"intermediate_result": result})

    # Continue processing
    final_result = process_second_part(result)

    # Final update
    langfuse.update_current_span(output=final_result)
@deprecated('Trace-level input/output is deprecated. For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. This method will be removed in a future major version.')
def set_current_trace_io( self, *, input: Optional[Any] = None, output: Optional[Any] = None) -> None:
1504    @deprecated(
1505        "Trace-level input/output is deprecated. "
1506        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
1507        "This method will be removed in a future major version."
1508    )
1509    def set_current_trace_io(
1510        self,
1511        *,
1512        input: Optional[Any] = None,
1513        output: Optional[Any] = None,
1514    ) -> None:
1515        """Set trace-level input and output for the current span's trace.
1516
1517        .. deprecated::
1518            This is a legacy method for backward compatibility with Langfuse platform
1519            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
1520            evaluators). It will be removed in a future major version.
1521
1522            For setting other trace attributes (user_id, session_id, metadata, tags, version),
1523            use :meth:`propagate_attributes` instead.
1524
1525        Args:
1526            input: Input data to associate with the trace.
1527            output: Output data to associate with the trace.
1528        """
1529        if not self._tracing_enabled:
1530            langfuse_logger.debug(
1531                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
1532            )
1533            return
1534
1535        current_otel_span = self._get_current_otel_span()
1536
1537        if current_otel_span is not None and current_otel_span.is_recording():
1538            span_class = self._get_span_class(
1539                self._get_observation_type_from_otel_span(current_otel_span)
1540            )
1541            span = span_class(
1542                otel_span=current_otel_span,
1543                langfuse_client=self,
1544                environment=self._environment,
1545                release=self._release,
1546            )
1547
1548            span.set_trace_io(
1549                input=input,
1550                output=output,
1551            )

Set trace-level input and output for the current span's trace.

Deprecated: This is a legacy method for backward compatibility with Langfuse platform features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future major version.

For setting other trace attributes (user_id, session_id, metadata, tags, version), use propagate_attributes() instead.

Arguments:
  • input: Input data to associate with the trace.
  • output: Output data to associate with the trace.
def set_current_trace_as_public(self) -> None:
1553    def set_current_trace_as_public(self) -> None:
1554        """Make the current trace publicly accessible via its URL.
1555
1556        When a trace is published, anyone with the trace link can view the full trace
1557        without needing to be logged in to Langfuse. This action cannot be undone
1558        programmatically - once published, the entire trace becomes public.
1559
1560        This is a convenience method that publishes the trace from the currently
1561        active span context. Use this when you want to make a trace public from
1562        within a traced function without needing direct access to the span object.
1563        """
1564        if not self._tracing_enabled:
1565            langfuse_logger.debug(
1566                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
1567            )
1568            return
1569
1570        current_otel_span = self._get_current_otel_span()
1571
1572        if current_otel_span is not None and current_otel_span.is_recording():
1573            span_class = self._get_span_class(
1574                self._get_observation_type_from_otel_span(current_otel_span)
1575            )
1576            span = span_class(
1577                otel_span=current_otel_span,
1578                langfuse_client=self,
1579                environment=self._environment,
1580            )
1581
1582            span.set_trace_as_public()

Make the current trace publicly accessible via its URL.

When a trace is published, anyone with the trace link can view the full trace without needing to be logged in to Langfuse. This action cannot be undone programmatically - once published, the entire trace becomes public.

This is a convenience method that publishes the trace from the currently active span context. Use this when you want to make a trace public from within a traced function without needing direct access to the span object.

def create_event( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseEvent:
1584    def create_event(
1585        self,
1586        *,
1587        trace_context: Optional[TraceContext] = None,
1588        name: str,
1589        input: Optional[Any] = None,
1590        output: Optional[Any] = None,
1591        metadata: Optional[Any] = None,
1592        version: Optional[str] = None,
1593        level: Optional[SpanLevel] = None,
1594        status_message: Optional[str] = None,
1595    ) -> LangfuseEvent:
1596        """Create a new Langfuse observation of type 'EVENT'.
1597
1598        The created Langfuse Event observation will be the child of the current span in the context.
1599
1600        Args:
1601            trace_context: Optional context for connecting to an existing trace
1602            name: Name of the span (e.g., function or operation name)
1603            input: Input data for the operation (can be any JSON-serializable object)
1604            output: Output data from the operation (can be any JSON-serializable object)
1605            metadata: Additional metadata to associate with the span
1606            version: Version identifier for the code or component
1607            level: Importance level of the span (info, warning, error)
1608            status_message: Optional status message for the span
1609
1610        Returns:
1611            The Langfuse Event object
1612
1613        Example:
1614            ```python
1615            event = langfuse.create_event(name="process-event")
1616            ```
1617        """
1618        timestamp = time_ns()
1619
1620        if trace_context:
1621            trace_id = trace_context.get("trace_id", None)
1622            parent_span_id = trace_context.get("parent_span_id", None)
1623
1624            if trace_id:
1625                remote_parent_span = self._create_remote_parent_span(
1626                    trace_id=trace_id, parent_span_id=parent_span_id
1627                )
1628
1629                with otel_trace_api.use_span(
1630                    cast(otel_trace_api.Span, remote_parent_span)
1631                ):
1632                    otel_span = self._otel_tracer.start_span(
1633                        name=name, start_time=timestamp
1634                    )
1635                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1636
1637                    return cast(
1638                        LangfuseEvent,
1639                        LangfuseEvent(
1640                            otel_span=otel_span,
1641                            langfuse_client=self,
1642                            environment=self._environment,
1643                            release=self._release,
1644                            input=input,
1645                            output=output,
1646                            metadata=metadata,
1647                            version=version,
1648                            level=level,
1649                            status_message=status_message,
1650                        ).end(end_time=timestamp),
1651                    )
1652
1653        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1654
1655        return cast(
1656            LangfuseEvent,
1657            LangfuseEvent(
1658                otel_span=otel_span,
1659                langfuse_client=self,
1660                environment=self._environment,
1661                release=self._release,
1662                input=input,
1663                output=output,
1664                metadata=metadata,
1665                version=version,
1666                level=level,
1667                status_message=status_message,
1668            ).end(end_time=timestamp),
1669        )

Create a new Langfuse observation of type 'EVENT'.

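The created Langfuse Event observation will be the child of the current span in the context, unless trace_context provides a trace_id, in which case the event is attached to that trace instead.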

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the event (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the event
  • version: Version identifier for the code or component
  • level: Severity level of the event (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the event
Returns:

The Langfuse Event object

Example:
event = langfuse.create_event(name="process-event")
@staticmethod
def create_trace_id(*, seed: Optional[str] = None) -> str:
1758    @staticmethod
1759    def create_trace_id(*, seed: Optional[str] = None) -> str:
1760        """Create a unique trace ID for use with Langfuse.
1761
1762        This method generates a unique trace ID for use with various Langfuse APIs.
1763        It can either generate a random ID or create a deterministic ID based on
1764        a seed string.
1765
1766        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1767        This method ensures the generated ID meets this requirement. If you need to
1768        correlate an external ID with a Langfuse trace ID, use the external ID as the
1769        seed to get a valid, deterministic Langfuse trace ID.
1770
1771        Args:
1772            seed: Optional string to use as a seed for deterministic ID generation.
1773                 If provided, the same seed will always produce the same ID.
1774                 If not provided, a random ID will be generated.
1775
1776        Returns:
1777            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1778
1779        Example:
1780            ```python
1781            # Generate a random trace ID
1782            trace_id = langfuse.create_trace_id()
1783
1784            # Generate a deterministic ID based on a seed
1785            session_trace_id = langfuse.create_trace_id(seed="session-456")
1786
1787            # Correlate an external ID with a Langfuse trace ID
1788            external_id = "external-system-123456"
1789            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1790
1791            # Use the ID with trace context
1792            with langfuse.start_as_current_observation(
1793                name="process-request",
1794                trace_context={"trace_id": trace_id}
1795            ) as span:
1796                # Operation will be part of the specific trace
1797                pass
1798            ```
1799        """
1800        if not seed:
1801            trace_id_int = RandomIdGenerator().generate_trace_id()
1802
1803            return Langfuse._format_otel_trace_id(trace_id_int)
1804
1805        return sha256(seed.encode("utf-8")).digest()[:16].hex()

Create a unique trace ID for use with Langfuse.

This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.

Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.

Arguments:
  • seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:

A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

Example:
# Generate a random trace ID
trace_id = langfuse.create_trace_id()

# Generate a deterministic ID based on a seed
session_trace_id = langfuse.create_trace_id(seed="session-456")

# Correlate an external ID with a Langfuse trace ID
external_id = "external-system-123456"
correlated_trace_id = langfuse.create_trace_id(seed=external_id)

# Use the ID with trace context
with langfuse.start_as_current_observation(
    name="process-request",
    trace_context={"trace_id": trace_id}
) as span:
    # Operation will be part of the specific trace
    pass
def create_score( self, *, name: str, value: Union[float, str], session_id: Optional[str] = None, dataset_run_id: Optional[str] = None, trace_id: Optional[str] = None, observation_id: Optional[str] = None, score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT', 'CORRECTION']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None, timestamp: Optional[datetime.datetime] = None, environment: Optional[str] = None) -> None:
1887    def create_score(
1888        self,
1889        *,
1890        name: str,
1891        value: Union[float, str],
1892        session_id: Optional[str] = None,
1893        dataset_run_id: Optional[str] = None,
1894        trace_id: Optional[str] = None,
1895        observation_id: Optional[str] = None,
1896        score_id: Optional[str] = None,
1897        data_type: Optional[ScoreDataType] = None,
1898        comment: Optional[str] = None,
1899        config_id: Optional[str] = None,
1900        metadata: Optional[Any] = None,
1901        timestamp: Optional[datetime] = None,
1902        environment: Optional[str] = None,
1903    ) -> None:
1904        """Create a score for a specific trace or observation.
1905
1906        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
1907        used to track quality metrics, user feedback, or automated evaluations.
1908
1909        Args:
1910            name: Name of the score (e.g., "relevance", "accuracy")
1911            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
1912            session_id: ID of the Langfuse session to associate the score with
1913            dataset_run_id: ID of the Langfuse dataset run to associate the score with
1914            trace_id: ID of the Langfuse trace to associate the score with
1915            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
1916            score_id: Optional custom ID for the score (auto-generated if not provided)
1917            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
1918            comment: Optional comment or explanation for the score
1919            config_id: Optional ID of a score config defined in Langfuse
1920            metadata: Optional metadata to be attached to the score
1921            timestamp: Optional timestamp for the score (defaults to current UTC time)
1922            environment: Optional environment override for this score. If omitted,
1923                the score uses the client-level environment from
1924                `Langfuse(environment=...)` or `LANGFUSE_TRACING_ENVIRONMENT`.
1925                Langfuse observation wrapper methods pass their resolved span
1926                environment here so scores created via `span.score()` or
1927                `span.score_trace()` stay grouped with the scored observation or
1928                trace, including request-scoped environments propagated with
1929                `propagate_attributes(environment=...)`.
1930
1931        Example:
1932            ```python
1933            # Create a numeric score for accuracy
1934            langfuse.create_score(
1935                name="accuracy",
1936                value=0.92,
1937                trace_id="abcdef1234567890abcdef1234567890",
1938                data_type="NUMERIC",
1939                comment="High accuracy with minor irrelevant details"
1940            )
1941
1942            # Create a categorical score for sentiment
1943            langfuse.create_score(
1944                name="sentiment",
1945                value="positive",
1946                trace_id="abcdef1234567890abcdef1234567890",
1947                observation_id="abcdef1234567890",
1948                data_type="CATEGORICAL"
1949            )
1950            ```
1951        """
1952        if not self._tracing_enabled:
1953            return
1954
1955        score_id = score_id or self._create_observation_id()
1956
1957        try:
1958            new_body = ScoreBody(
1959                id=score_id,
1960                sessionId=session_id,
1961                datasetRunId=dataset_run_id,
1962                traceId=trace_id,
1963                observationId=observation_id,
1964                name=name,
1965                value=value,
1966                dataType=data_type,  # type: ignore
1967                comment=comment,
1968                configId=config_id,
1969                environment=environment or self._environment,
1970                metadata=metadata,
1971            )
1972
1973            event = {
1974                "id": self.create_trace_id(),
1975                "type": "score-create",
1976                "timestamp": timestamp or _get_timestamp(),
1977                "body": new_body,
1978            }
1979
1980            if self._resources is not None:
1981                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
1982                force_sample = (
1983                    not self._is_valid_trace_id(trace_id) if trace_id else True
1984                )
1985
1986                self._resources.add_score_task(
1987                    event,
1988                    force_sample=force_sample,
1989                )
1990
1991        except Exception as e:
1992            langfuse_logger.exception(
1993                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
1994            )

Create a score for a specific trace or observation.

This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
  • session_id: ID of the Langfuse session to associate the score with
  • dataset_run_id: ID of the Langfuse dataset run to associate the score with
  • trace_id: ID of the Langfuse trace to associate the score with
  • observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
  • timestamp: Optional timestamp for the score (defaults to current UTC time)
  • environment: Optional environment override for this score. If omitted, the score uses the client-level environment from Langfuse(environment=...) or LANGFUSE_TRACING_ENVIRONMENT. Langfuse observation wrapper methods pass their resolved span environment here so scores created via span.score() or span.score_trace() stay grouped with the scored observation or trace, including request-scoped environments propagated with propagate_attributes(environment=...).
Example:
# Create a numeric score for accuracy
langfuse.create_score(
    name="accuracy",
    value=0.92,
    trace_id="abcdef1234567890abcdef1234567890",
    data_type="NUMERIC",
    comment="High accuracy with minor irrelevant details"
)

# Create a categorical score for sentiment
langfuse.create_score(
    name="sentiment",
    value="positive",
    trace_id="abcdef1234567890abcdef1234567890",
    observation_id="abcdef1234567890",
    data_type="CATEGORICAL"
)
def score_current_span( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT', 'CORRECTION']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
2057    def score_current_span(
2058        self,
2059        *,
2060        name: str,
2061        value: Union[float, str],
2062        score_id: Optional[str] = None,
2063        data_type: Optional[ScoreDataType] = None,
2064        comment: Optional[str] = None,
2065        config_id: Optional[str] = None,
2066        metadata: Optional[Any] = None,
2067    ) -> None:
2068        """Create a score for the current active span.
2069
2070        This method scores the currently active span in the context. It's a convenient
2071        way to score the current operation without needing to know its trace and span IDs.
2072        If the active span has a `langfuse.environment` attribute, including one
2073        set by `propagate_attributes(environment=...)`, the score uses that
2074        environment. Otherwise it uses the client-level environment.
2075
2076        Args:
2077            name: Name of the score (e.g., "relevance", "accuracy")
2078            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
2079            score_id: Optional custom ID for the score (auto-generated if not provided)
2080            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
2081            comment: Optional comment or explanation for the score
2082            config_id: Optional ID of a score config defined in Langfuse
2083            metadata: Optional metadata to be attached to the score
2084
2085        Example:
2086            ```python
2087            with langfuse.start_as_current_generation(name="answer-query") as generation:
2088                # Generate answer
2089                response = generate_answer(...)
2090                generation.update(output=response)
2091
2092                # Score the generation
2093                langfuse.score_current_span(
2094                    name="relevance",
2095                    value=0.85,
2096                    data_type="NUMERIC",
2097                    comment="Mostly relevant but contains some tangential information",
2098                    metadata={"model": "gpt-4", "prompt_version": "v2"}
2099                )
2100            ```
2101        """
2102        current_span = self._get_current_otel_span()
2103
2104        if current_span is not None:
2105            trace_id = self._get_otel_trace_id(current_span)
2106            observation_id = self._get_otel_span_id(current_span)
2107
2108            langfuse_logger.info(
2109                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
2110            )
2111
2112            self.create_score(
2113                trace_id=trace_id,
2114                observation_id=observation_id,
2115                name=name,
2116                value=cast(str, value),
2117                score_id=score_id,
2118                data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
2119                comment=comment,
2120                config_id=config_id,
2121                metadata=metadata,
2122                environment=get_string_span_attribute(
2123                    current_span, LangfuseOtelSpanAttributes.ENVIRONMENT
2124                ),
2125            )

Create a score for the current active span.

This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs. If the active span has a langfuse.environment attribute, including one set by propagate_attributes(environment=...), the score uses that environment. Otherwise it uses the client-level environment.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Generate answer
    response = generate_answer(...)
    generation.update(output=response)

    # Score the generation
    langfuse.score_current_span(
        name="relevance",
        value=0.85,
        data_type="NUMERIC",
        comment="Mostly relevant but contains some tangential information",
        metadata={"model": "gpt-4", "prompt_version": "v2"}
    )
def score_current_trace( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT', 'CORRECTION']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
2155    def score_current_trace(
2156        self,
2157        *,
2158        name: str,
2159        value: Union[float, str],
2160        score_id: Optional[str] = None,
2161        data_type: Optional[ScoreDataType] = None,
2162        comment: Optional[str] = None,
2163        config_id: Optional[str] = None,
2164        metadata: Optional[Any] = None,
2165    ) -> None:
2166        """Create a score for the current trace.
2167
2168        This method scores the trace of the currently active span. Unlike score_current_span,
2169        this method associates the score with the entire trace rather than a specific span.
2170        It's useful for scoring overall performance or quality of the entire operation.
2171        If the active span has a `langfuse.environment` attribute, including one
2172        set by `propagate_attributes(environment=...)`, the score uses that
2173        environment. Otherwise it uses the client-level environment.
2174
2175        Args:
2176            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2177            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
2178            score_id: Optional custom ID for the score (auto-generated if not provided)
2179            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
2180            comment: Optional comment or explanation for the score
2181            config_id: Optional ID of a score config defined in Langfuse
2182            metadata: Optional metadata to be attached to the score
2183
2184        Example:
2185            ```python
2186            with langfuse.start_as_current_observation(name="process-user-request") as span:
2187                # Process request
2188                result = process_complete_request()
2189                span.update(output=result)
2190
2191                # Score the overall trace
2192                langfuse.score_current_trace(
2193                    name="overall_quality",
2194                    value=0.95,
2195                    data_type="NUMERIC",
2196                    comment="High quality end-to-end response",
2197                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2198                )
2199            ```
2200        """
2201        current_span = self._get_current_otel_span()
2202
2203        if current_span is not None:
2204            trace_id = self._get_otel_trace_id(current_span)
2205
2206            langfuse_logger.info(
2207                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2208            )
2209
2210            self.create_score(
2211                trace_id=trace_id,
2212                name=name,
2213                value=cast(str, value),
2214                score_id=score_id,
2215                data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
2216                comment=comment,
2217                config_id=config_id,
2218                metadata=metadata,
2219                environment=get_string_span_attribute(
2220                    current_span, LangfuseOtelSpanAttributes.ENVIRONMENT
2221                ),
2222            )

Create a score for the current trace.

This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation. If the active span has a langfuse.environment attribute, including one set by propagate_attributes(environment=...), the score uses that environment. Otherwise it uses the client-level environment.

Arguments:
  • name: Name of the score (e.g., "user_satisfaction", "overall_quality")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_observation(name="process-user-request") as span:
    # Process request
    result = process_complete_request()
    span.update(output=result)

    # Score the overall trace
    langfuse.score_current_trace(
        name="overall_quality",
        value=0.95,
        data_type="NUMERIC",
        comment="High quality end-to-end response",
        metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
    )
def flush(self) -> None:
2224    def flush(self) -> None:
2225        """Force flush all pending spans and events to the Langfuse API.
2226
2227        This method manually flushes any pending spans, scores, and other events to the
2228        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2229        before proceeding, without waiting for the automatic flush interval.
2230
2231        Example:
2232            ```python
2233            # Record some spans and scores
2234            with langfuse.start_as_current_observation(name="operation") as span:
2235                # Do work...
2236                pass
2237
2238            # Ensure all data is sent to Langfuse before proceeding
2239            langfuse.flush()
2240
2241            # Continue with other work
2242            ```
2243        """
2244        if self._resources is not None:
2245            self._resources.flush()

Force flush all pending spans and events to the Langfuse API.

This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.

Example:
# Record some spans and scores
with langfuse.start_as_current_observation(name="operation") as span:
    # Do work...
    pass

# Ensure all data is sent to Langfuse before proceeding
langfuse.flush()

# Continue with other work
def shutdown(self) -> None:
2247    def shutdown(self) -> None:
2248        """Shut down the Langfuse client and flush all pending data.
2249
2250        This method cleanly shuts down the Langfuse client, ensuring all pending data
2251        is flushed to the API and all background threads are properly terminated.
2252
2253        It's important to call this method when your application is shutting down to
2254        prevent data loss and resource leaks. For most applications, using the client
2255        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2256
2257        Example:
2258            ```python
2259            # Initialize Langfuse
2260            langfuse = Langfuse(public_key="...", secret_key="...")
2261
2262            # Use Langfuse throughout your application
2263            # ...
2264
2265            # When application is shutting down
2266            langfuse.shutdown()
2267            ```
2268        """
2269        if self._resources is not None:
2270            self._resources.shutdown()

Shut down the Langfuse client and flush all pending data.

This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.

It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.

Example:
# Initialize Langfuse
langfuse = Langfuse(public_key="...", secret_key="...")

# Use Langfuse throughout your application
# ...

# When application is shutting down
langfuse.shutdown()
def get_current_trace_id(self) -> Optional[str]:
2272    def get_current_trace_id(self) -> Optional[str]:
2273        """Get the trace ID of the current active span.
2274
2275        This method retrieves the trace ID from the currently active span in the context.
2276        It can be used to get the trace ID for referencing in logs, external systems,
2277        or for creating related operations.
2278
2279        Returns:
2280            The current trace ID as a 32-character lowercase hexadecimal string,
2281            or None if there is no active span.
2282
2283        Example:
2284            ```python
2285            with langfuse.start_as_current_observation(name="process-request") as span:
2286                # Get the current trace ID for reference
2287                trace_id = langfuse.get_current_trace_id()
2288
2289                # Use it for external correlation
2290                log.info(f"Processing request with trace_id: {trace_id}")
2291
2292                # Or pass to another system
2293                external_system.process(data, trace_id=trace_id)
2294            ```
2295        """
2296        if not self._tracing_enabled:
2297            langfuse_logger.debug(
2298                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2299            )
2300            return None
2301
2302        current_otel_span = self._get_current_otel_span()
2303
2304        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None

Get the trace ID of the current active span.

This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.

Returns:

The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_observation(name="process-request") as span:
    # Get the current trace ID for reference
    trace_id = langfuse.get_current_trace_id()

    # Use it for external correlation
    log.info(f"Processing request with trace_id: {trace_id}")

    # Or pass to another system
    external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
2306    def get_current_observation_id(self) -> Optional[str]:
2307        """Get the observation ID (span ID) of the current active span.
2308
2309        This method retrieves the observation ID from the currently active span in the context.
2310        It can be used to get the observation ID for referencing in logs, external systems,
2311        or for creating scores or other related operations.
2312
2313        Returns:
2314            The current observation ID as a 16-character lowercase hexadecimal string,
2315            or None if there is no active span.
2316
2317        Example:
2318            ```python
2319            with langfuse.start_as_current_observation(name="process-user-query") as span:
2320                # Get the current observation ID
2321                observation_id = langfuse.get_current_observation_id()
2322
2323                # Store it for later reference
2324                cache.set(f"query_{query_id}_observation", observation_id)
2325
2326                # Process the query...
2327            ```
2328        """
2329        if not self._tracing_enabled:
2330            langfuse_logger.debug(
2331                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2332            )
2333            return None
2334
2335        current_otel_span = self._get_current_otel_span()
2336
2337        return self._get_otel_span_id(current_otel_span) if current_otel_span else None

Get the observation ID (span ID) of the current active span.

This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.

Returns:

The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_observation(name="process-user-query") as span:
    # Get the current observation ID
    observation_id = langfuse.get_current_observation_id()

    # Store it for later reference
    cache.set(f"query_{query_id}_observation", observation_id)

    # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2350    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2351        """Get the URL to view a trace in the Langfuse UI.
2352
2353        This method generates a URL that links directly to a trace in the Langfuse UI.
2354        It's useful for providing links in logs, notifications, or debugging tools.
2355
2356        Args:
2357            trace_id: Optional trace ID to generate a URL for. If not provided,
2358                     the trace ID of the current active span will be used.
2359
2360        Returns:
2361            A URL string pointing to the trace in the Langfuse UI,
2362            or None if the project ID couldn't be retrieved or no trace ID is available.
2363
2364        Example:
2365            ```python
2366            # Get URL for the current trace
2367            with langfuse.start_as_current_observation(name="process-request") as span:
2368                trace_url = langfuse.get_trace_url()
2369                log.info(f"Processing trace: {trace_url}")
2370
2371            # Get URL for a specific trace
2372            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2373            send_notification(f"Review needed for trace: {specific_trace_url}")
2374            ```
2375        """
2376        final_trace_id = trace_id or self.get_current_trace_id()
2377        if not final_trace_id:
2378            return None
2379
2380        project_id = self._get_project_id()
2381
2382        return (
2383            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2384            if project_id and final_trace_id
2385            else None
2386        )

Get the URL to view a trace in the Langfuse UI.

This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.

Arguments:
  • trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:

A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.

Example:
# Get URL for the current trace
with langfuse.start_as_current_observation(name="process-request") as span:
    trace_url = langfuse.get_trace_url()
    log.info(f"Processing trace: {trace_url}")

# Get URL for a specific trace
specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
send_notification(f"Review needed for trace: {specific_trace_url}")
def get_dataset( self, name: str, *, fetch_items_page_size: Optional[int] = 50, version: Optional[datetime.datetime] = None) -> langfuse._client.datasets.DatasetClient:
2388    def get_dataset(
2389        self,
2390        name: str,
2391        *,
2392        fetch_items_page_size: Optional[int] = 50,
2393        version: Optional[datetime] = None,
2394    ) -> "DatasetClient":
2395        """Fetch a dataset by its name.
2396
2397        Args:
2398            name: The name of the dataset to fetch.
2399            fetch_items_page_size: All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2400            version: Retrieve dataset items as they existed at this specific point in time (UTC).
2401                If provided, returns the state of items at the specified UTC timestamp.
2402                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
2403
2404        Returns:
2405            DatasetClient: The dataset with the given name.
2406        """
2407        try:
2408            langfuse_logger.debug(f"Getting datasets {name}")
2409            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2410
2411            dataset_items: List[DatasetItem] = []
2412            page = 1
2413
2414            while True:
2415                new_items = self.api.dataset_items.list(
2416                    dataset_name=self._url_encode(name, is_url_param=True),
2417                    page=page,
2418                    limit=fetch_items_page_size,
2419                    version=version,
2420                )
2421                dataset_items.extend(
2422                    self._hydrate_dataset_item_media_references(item)
2423                    for item in new_items.data
2424                )
2425
2426                if new_items.meta.total_pages <= page:
2427                    break
2428
2429                page += 1
2430
2431            return DatasetClient(
2432                dataset=dataset,
2433                items=dataset_items,
2434                version=version,
2435                langfuse_client=self,
2436            )
2437
2438        except Error as e:
2439            handle_fern_exception(e)
2440            raise e

Fetch a dataset by its name.

Arguments:
  • name: The name of the dataset to fetch.
  • fetch_items_page_size: All items of the dataset will be fetched in chunks of this size. Defaults to 50.
  • version: Retrieve dataset items as they existed at this specific point in time (UTC). If provided, returns the state of items at the specified UTC timestamp. If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
Returns:

DatasetClient: The dataset with the given name.

def get_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DatasetRunWithItems:
2442    def get_dataset_run(
2443        self, *, dataset_name: str, run_name: str
2444    ) -> DatasetRunWithItems:
2445        """Fetch a dataset run by dataset name and run name.
2446
2447        Args:
2448            dataset_name (str): The name of the dataset.
2449            run_name (str): The name of the run.
2450
2451        Returns:
2452            DatasetRunWithItems: The dataset run with its items.
2453        """
2454        try:
2455            return cast(
2456                DatasetRunWithItems,
2457                self.api.datasets.get_run(
2458                    dataset_name=self._url_encode(dataset_name),
2459                    run_name=self._url_encode(run_name),
2460                    request_options=None,
2461                ),
2462            )
2463        except Error as e:
2464            handle_fern_exception(e)
2465            raise e

Fetch a dataset run by dataset name and run name.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DatasetRunWithItems: The dataset run with its items.

def get_dataset_runs( self, *, dataset_name: str, page: Optional[int] = None, limit: Optional[int] = None) -> langfuse.api.PaginatedDatasetRuns:
2467    def get_dataset_runs(
2468        self,
2469        *,
2470        dataset_name: str,
2471        page: Optional[int] = None,
2472        limit: Optional[int] = None,
2473    ) -> PaginatedDatasetRuns:
2474        """Fetch all runs for a dataset.
2475
2476        Args:
2477            dataset_name (str): The name of the dataset.
2478            page (Optional[int]): Page number, starts at 1.
2479            limit (Optional[int]): Limit of items per page.
2480
2481        Returns:
2482            PaginatedDatasetRuns: Paginated list of dataset runs.
2483        """
2484        try:
2485            return cast(
2486                PaginatedDatasetRuns,
2487                self.api.datasets.get_runs(
2488                    dataset_name=self._url_encode(dataset_name),
2489                    page=page,
2490                    limit=limit,
2491                    request_options=None,
2492                ),
2493            )
2494        except Error as e:
2495            handle_fern_exception(e)
2496            raise e

Fetch all runs for a dataset.

Arguments:
  • dataset_name (str): The name of the dataset.
  • page (Optional[int]): Page number, starts at 1.
  • limit (Optional[int]): Limit of items per page.
Returns:

PaginatedDatasetRuns: Paginated list of dataset runs.

def delete_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DeleteDatasetRunResponse:
2498    def delete_dataset_run(
2499        self, *, dataset_name: str, run_name: str
2500    ) -> DeleteDatasetRunResponse:
2501        """Delete a dataset run and all its run items. This action is irreversible.
2502
2503        Args:
2504            dataset_name (str): The name of the dataset.
2505            run_name (str): The name of the run.
2506
2507        Returns:
2508            DeleteDatasetRunResponse: Confirmation of deletion.
2509        """
2510        try:
2511            return cast(
2512                DeleteDatasetRunResponse,
2513                self.api.datasets.delete_run(
2514                    dataset_name=self._url_encode(dataset_name),
2515                    run_name=self._url_encode(run_name),
2516                    request_options=None,
2517                ),
2518            )
2519        except Error as e:
2520            handle_fern_exception(e)
2521            raise e

Delete a dataset run and all its run items. This action is irreversible.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DeleteDatasetRunResponse: Confirmation of deletion.

def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem]], task: langfuse.experiment.TaskFunction, evaluators: List[langfuse.experiment.EvaluatorFunction] = [], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, run_evaluators: List[langfuse.experiment.RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None, _dataset_version: Optional[datetime.datetime] = None) -> langfuse.experiment.ExperimentResult:
2523    def run_experiment(
2524        self,
2525        *,
2526        name: str,
2527        run_name: Optional[str] = None,
2528        description: Optional[str] = None,
2529        data: ExperimentData,
2530        task: TaskFunction,
2531        evaluators: List[EvaluatorFunction] = [],
2532        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2533        run_evaluators: List[RunEvaluatorFunction] = [],
2534        max_concurrency: int = 50,
2535        metadata: Optional[Dict[str, str]] = None,
2536        _dataset_version: Optional[datetime] = None,
2537    ) -> ExperimentResult:
2538        """Run an experiment on a dataset with automatic tracing and evaluation.
2539
2540        This method executes a task function on each item in the provided dataset,
2541        automatically traces all executions with Langfuse for observability, runs
2542        item-level and run-level evaluators on the outputs, and returns comprehensive
2543        results with evaluation metrics.
2544
2545        The experiment system provides:
2546        - Automatic tracing of all task executions
2547        - Concurrent processing with configurable limits
2548        - Comprehensive error handling that isolates failures
2549        - Integration with Langfuse datasets for experiment tracking
2550        - Flexible evaluation framework supporting both sync and async evaluators
2551
2552        Args:
2553            name: Human-readable name for the experiment. Used for identification
2554                in the Langfuse UI.
2555            run_name: Optional exact name for the experiment run. If provided, this will be
2556                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2557                If not provided, this will default to the experiment name appended with an ISO timestamp.
2558            description: Optional description explaining the experiment's purpose,
2559                methodology, or expected outcomes.
2560            data: Array of data items to process. Can be either:
2561                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2562                - List of Langfuse DatasetItem objects from dataset.items
2563            task: Function that processes each data item and returns output.
2564                Must accept 'item' as keyword argument and can return sync or async results.
2565                The task function signature should be: task(*, item, **kwargs) -> Any
2566            evaluators: List of functions to evaluate each item's output individually.
2567                Each evaluator receives input, output, expected_output, and metadata.
2568                Can return single Evaluation dict or list of Evaluation dicts.
2569            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2570                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2571                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2572                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2573            run_evaluators: List of functions to evaluate the entire experiment run.
2574                Each run evaluator receives all item_results and can compute aggregate metrics.
2575                Useful for calculating averages, distributions, or cross-item comparisons.
2576            max_concurrency: Maximum number of concurrent task executions (default: 50).
2577                Controls the number of items processed simultaneously. Adjust based on
2578                API rate limits and system resources.
2579            metadata: Optional metadata dictionary to attach to all experiment traces.
2580                This metadata will be included in every trace created during the experiment.
2581                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2582
2583        Returns:
2584            ExperimentResult containing:
2585            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2586            - item_results: List of results for each processed item with outputs and evaluations
2587            - run_evaluations: List of aggregate evaluation results for the entire run
2588            - experiment_id: Stable identifier for the experiment run across all items
2589            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2590            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2591
2592        Raises:
2593            ValueError: If required parameters are missing or invalid
2594            Exception: If experiment setup fails (individual item failures are handled gracefully)
2595
2596        Examples:
2597            Basic experiment with local data:
2598            ```python
2599            def summarize_text(*, item, **kwargs):
2600                return f"Summary: {item['input'][:50]}..."
2601
2602            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2603                return {
2604                    "name": "output_length",
2605                    "value": len(output),
2606                    "comment": f"Output contains {len(output)} characters"
2607                }
2608
2609            result = langfuse.run_experiment(
2610                name="Text Summarization Test",
2611                description="Evaluate summarization quality and length",
2612                data=[
2613                    {"input": "Long article text...", "expected_output": "Expected summary"},
2614                    {"input": "Another article...", "expected_output": "Another summary"}
2615                ],
2616                task=summarize_text,
2617                evaluators=[length_evaluator]
2618            )
2619
2620            print(f"Processed {len(result.item_results)} items")
2621            for item_result in result.item_results:
2622                print(f"Input: {item_result.item['input']}")
2623                print(f"Output: {item_result.output}")
2624                print(f"Evaluations: {item_result.evaluations}")
2625            ```
2626
2627            Advanced experiment with async task and multiple evaluators:
2628            ```python
2629            async def llm_task(*, item, **kwargs):
2630                # Simulate async LLM call
2631                response = await openai_client.chat.completions.create(
2632                    model="gpt-4",
2633                    messages=[{"role": "user", "content": item["input"]}]
2634                )
2635                return response.choices[0].message.content
2636
2637            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2638                if expected_output and expected_output.lower() in output.lower():
2639                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2640                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2641
2642            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2643                # Simulate toxicity check
2644                toxicity_score = check_toxicity(output)  # Your toxicity checker
2645                return {
2646                    "name": "toxicity",
2647                    "value": toxicity_score,
2648                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2649                }
2650
2651            def average_accuracy(*, item_results, **kwargs):
2652                accuracies = [
2653                    eval.value for result in item_results
2654                    for eval in result.evaluations
2655                    if eval.name == "accuracy"
2656                ]
2657                return {
2658                    "name": "average_accuracy",
2659                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2660                    "comment": f"Average accuracy across {len(accuracies)} items"
2661                }
2662
2663            result = langfuse.run_experiment(
2664                name="LLM Safety and Accuracy Test",
2665                description="Evaluate model accuracy and safety across diverse prompts",
2666                data=test_dataset,  # Your dataset items
2667                task=llm_task,
2668                evaluators=[accuracy_evaluator, toxicity_evaluator],
2669                run_evaluators=[average_accuracy],
2670                max_concurrency=5,  # Limit concurrent API calls
2671                metadata={"model": "gpt-4", "temperature": 0.7}
2672            )
2673            ```
2674
2675            Using with Langfuse datasets:
2676            ```python
2677            # Get dataset from Langfuse
2678            dataset = langfuse.get_dataset("my-eval-dataset")
2679
2680            result = dataset.run_experiment(
2681                name="Production Model Evaluation",
2682                description="Monthly evaluation of production model performance",
2683                task=my_production_task,
2684                evaluators=[accuracy_evaluator, latency_evaluator]
2685            )
2686
2687            # Results automatically linked to dataset in Langfuse UI
2688            print(f"View results: {result['dataset_run_url']}")
2689            ```
2690
2691        Note:
2692            - Task and evaluator functions can be either synchronous or asynchronous
2693            - Individual item failures are logged but don't stop the experiment
2694            - All executions are automatically traced and visible in Langfuse UI
2695            - When using Langfuse datasets, results are automatically linked for easy comparison
2696            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2697            - Async execution is handled automatically with smart event loop detection
2698        """
2699        return cast(
2700            ExperimentResult,
2701            run_async_safely(
2702                self._run_experiment_async(
2703                    name=name,
2704                    run_name=self._create_experiment_run_name(
2705                        name=name, run_name=run_name
2706                    ),
2707                    description=description,
2708                    data=data,
2709                    task=task,
2710                    evaluators=evaluators or [],
2711                    composite_evaluator=composite_evaluator,
2712                    run_evaluators=run_evaluators or [],
2713                    max_concurrency=max_concurrency,
2714                    metadata=metadata,
2715                    dataset_version=_dataset_version,
2716                ),
2717            ),
2718        )

Run an experiment on a dataset with automatic tracing and evaluation.

This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.

The experiment system provides:

  • Automatic tracing of all task executions
  • Concurrent processing with configurable limits
  • Comprehensive error handling that isolates failures
  • Integration with Langfuse datasets for experiment tracking
  • Flexible evaluation framework supporting both sync and async evaluators
Arguments:
  • name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
  • run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the data contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
  • description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
  • data: Array of data items to process. Can be either:
    • List of dict-like items with 'input', 'expected_output', 'metadata' keys
    • List of Langfuse DatasetItem objects from dataset.items
  • task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
  • evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
  • composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
  • run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
  • max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
  • metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If data are Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:

ExperimentResult containing:

  • run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
  • item_results: List of results for each processed item with outputs and evaluations
  • run_evaluations: List of aggregate evaluation results for the entire run
  • experiment_id: Stable identifier for the experiment run across all items
  • dataset_run_id: ID of the dataset run (if using Langfuse datasets)
  • dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
  • ValueError: If required parameters are missing or invalid
  • Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:

Basic experiment with local data:

def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")

Advanced experiment with async task and multiple evaluators:

async def llm_task(*, item, **kwargs):
    # Simulate async LLM call
    response = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": item["input"]}]
    )
    return response.choices[0].message.content

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output and expected_output.lower() in output.lower():
        return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
    return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
    # Simulate toxicity check
    toxicity_score = check_toxicity(output)  # Your toxicity checker
    return {
        "name": "toxicity",
        "value": toxicity_score,
        "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
    }

def average_accuracy(*, item_results, **kwargs):
    accuracies = [
        eval.value for result in item_results
        for eval in result.evaluations
        if eval.name == "accuracy"
    ]
    return {
        "name": "average_accuracy",
        "value": sum(accuracies) / len(accuracies) if accuracies else 0,
        "comment": f"Average accuracy across {len(accuracies)} items"
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    description="Evaluate model accuracy and safety across diverse prompts",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    run_evaluators=[average_accuracy],
    max_concurrency=5,  # Limit concurrent API calls
    metadata={"model": "gpt-4", "temperature": 0.7}
)

Using with Langfuse datasets:

# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result.dataset_run_url}")
Note:
  • Task and evaluator functions can be either synchronous or asynchronous
  • Individual item failures are logged but don't stop the experiment
  • All executions are automatically traced and visible in Langfuse UI
  • When using Langfuse datasets, results are automatically linked for easy comparison
  • This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
  • Async execution is handled automatically with smart event loop detection
def run_batched_evaluation( self, *, scope: Literal['traces', 'observations'], mapper: MapperFunction, filter: Optional[str] = None, fetch_batch_size: int = 50, fetch_trace_fields: Optional[str] = None, max_items: Optional[int] = None, max_retries: int = 3, evaluators: List[langfuse.experiment.EvaluatorFunction], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, max_concurrency: int = 5, metadata: Optional[Dict[str, Any]] = None, _add_observation_scores_to_trace: bool = False, _additional_trace_tags: Optional[List[str]] = None, resume_from: Optional[BatchEvaluationResumeToken] = None, verbose: bool = False) -> BatchEvaluationResult:
3080    def run_batched_evaluation(
3081        self,
3082        *,
3083        scope: Literal["traces", "observations"],
3084        mapper: MapperFunction,
3085        filter: Optional[str] = None,
3086        fetch_batch_size: int = 50,
3087        fetch_trace_fields: Optional[str] = None,
3088        max_items: Optional[int] = None,
3089        max_retries: int = 3,
3090        evaluators: List[EvaluatorFunction],
3091        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
3092        max_concurrency: int = 5,
3093        metadata: Optional[Dict[str, Any]] = None,
3094        _add_observation_scores_to_trace: bool = False,
3095        _additional_trace_tags: Optional[List[str]] = None,
3096        resume_from: Optional[BatchEvaluationResumeToken] = None,
3097        verbose: bool = False,
3098    ) -> BatchEvaluationResult:
3099        """Fetch traces or observations and run evaluations on each item.
3100
3101        This method provides a powerful way to evaluate existing data in Langfuse at scale.
3102        It fetches items based on filters, transforms them using a mapper function, runs
3103        evaluators on each item, and creates scores that are linked back to the original
3104        entities. This is ideal for:
3105
3106        - Running evaluations on production traces after deployment
3107        - Backtesting new evaluation metrics on historical data
3108        - Batch scoring of observations for quality monitoring
3109        - Periodic evaluation runs on recent data
3110
3111        The method uses a streaming/pipeline approach to process items in batches, making
3112        it memory-efficient for large datasets. It includes comprehensive error handling,
3113        retry logic, and resume capability for long-running evaluations.
3114
3115        Args:
3116            scope: The type of items to evaluate. Must be one of:
3117                - "traces": Evaluate complete traces with all their observations
3118                - "observations": Evaluate individual observations (spans, generations, events)
3119            mapper: Function that transforms API response objects into evaluator inputs.
3120                Receives a trace/observation object and returns an EvaluatorInputs
3121                instance with input, output, expected_output, and metadata fields.
3122                Can be sync or async.
3123            evaluators: List of evaluation functions to run on each item. Each evaluator
3124                receives the mapped inputs and returns Evaluation object(s). Evaluator
3125                failures are logged but don't stop the batch evaluation.
3126            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
3127                - '{"tags": ["production"]}'
3128                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
3129                Default: None (fetches all items).
3130            fetch_batch_size: Number of items to fetch per API call and hold in memory.
3131                Larger values may be faster but use more memory. Default: 50.
3132            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
3133            max_items: Maximum total number of items to process. If None, processes all
3134                items matching the filter. Useful for testing or limiting evaluation runs.
3135                Default: None (process all).
3136            max_concurrency: Maximum number of items to evaluate concurrently. Controls
3137                parallelism and resource usage. Default: 5.
3138            composite_evaluator: Optional function that creates a composite score from
3139                item-level evaluations. Receives the original item and its evaluations,
3140                returns a single Evaluation. Useful for weighted averages or combined metrics.
3141                Default: None.
3142            metadata: Optional metadata dict to add to all created scores. Useful for
3143                tracking evaluation runs, versions, or other context. Default: None.
3144            max_retries: Maximum number of retry attempts for failed batch fetches.
3145                Uses exponential backoff (1s, 2s, 4s). Default: 3.
3146            verbose: If True, logs progress information to console. Useful for monitoring
3147                long-running evaluations. Default: False.
3148            resume_from: Optional resume token from a previous incomplete run. Allows
3149                continuing evaluation after interruption or failure. Default: None.
3150
3151
3152        Returns:
3153            BatchEvaluationResult containing:
3154                - total_items_fetched: Number of items fetched from API
3155                - total_items_processed: Number of items successfully evaluated
3156                - total_items_failed: Number of items that failed evaluation
3157                - total_scores_created: Scores created by item-level evaluators
3158                - total_composite_scores_created: Scores created by composite evaluator
3159                - total_evaluations_failed: Individual evaluator failures
3160                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3161                - resume_token: Token for resuming if incomplete (None if completed)
3162                - completed: True if all items processed
3163                - duration_seconds: Total execution time
3164                - failed_item_ids: IDs of items that failed
3165                - error_summary: Error types and counts
3166                - has_more_items: True if max_items reached but more exist
3167
3168        Raises:
3169            ValueError: If invalid scope is provided.
3170
3171        Examples:
3172            Basic trace evaluation:
3173            ```python
3174            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3175
3176            client = Langfuse()
3177
3178            # Define mapper to extract fields from traces
3179            def trace_mapper(trace):
3180                return EvaluatorInputs(
3181                    input=trace.input,
3182                    output=trace.output,
3183                    expected_output=None,
3184                    metadata={"trace_id": trace.id}
3185                )
3186
3187            # Define evaluator
3188            def length_evaluator(*, input, output, expected_output, metadata):
3189                return Evaluation(
3190                    name="output_length",
3191                    value=len(output) if output else 0
3192                )
3193
3194            # Run batch evaluation
3195            result = client.run_batched_evaluation(
3196                scope="traces",
3197                mapper=trace_mapper,
3198                evaluators=[length_evaluator],
3199                filter='{"tags": ["production"]}',
3200                max_items=1000,
3201                verbose=True
3202            )
3203
3204            print(f"Processed {result.total_items_processed} traces")
3205            print(f"Created {result.total_scores_created} scores")
3206            ```
3207
3208            Evaluation with composite scorer:
3209            ```python
3210            def accuracy_evaluator(*, input, output, expected_output, metadata):
3211                # ... evaluation logic
3212                return Evaluation(name="accuracy", value=0.85)
3213
3214            def relevance_evaluator(*, input, output, expected_output, metadata):
3215                # ... evaluation logic
3216                return Evaluation(name="relevance", value=0.92)
3217
3218            def composite_evaluator(*, item, evaluations):
3219                # Weighted average of evaluations
3220                weights = {"accuracy": 0.6, "relevance": 0.4}
3221                total = sum(
3222                    e.value * weights.get(e.name, 0)
3223                    for e in evaluations
3224                    if isinstance(e.value, (int, float))
3225                )
3226                return Evaluation(
3227                    name="composite_score",
3228                    value=total,
3229                    comment=f"Weighted average of {len(evaluations)} metrics"
3230                )
3231
3232            result = client.run_batched_evaluation(
3233                scope="traces",
3234                mapper=trace_mapper,
3235                evaluators=[accuracy_evaluator, relevance_evaluator],
3236                composite_evaluator=composite_evaluator,
3237                filter='{"user_id": "important_user"}',
3238                verbose=True
3239            )
3240            ```
3241
3242            Handling incomplete runs with resume:
3243            ```python
3244            # Initial run that may fail or timeout
3245            result = client.run_batched_evaluation(
3246                scope="observations",
3247                mapper=obs_mapper,
3248                evaluators=[my_evaluator],
3249                max_items=10000,
3250                verbose=True
3251            )
3252
3253            # Check if incomplete
3254            if not result.completed and result.resume_token:
3255                print(f"Processed {result.resume_token.items_processed} items before interruption")
3256
3257                # Resume from where it left off
3258                result = client.run_batched_evaluation(
3259                    scope="observations",
3260                    mapper=obs_mapper,
3261                    evaluators=[my_evaluator],
3262                    resume_from=result.resume_token,
3263                    verbose=True
3264                )
3265
3266            print(f"Total items processed: {result.total_items_processed}")
3267            ```
3268
3269            Monitoring evaluator performance:
3270            ```python
3271            result = client.run_batched_evaluation(...)
3272
3273            for stats in result.evaluator_stats:
3274                success_rate = stats.successful_runs / stats.total_runs
3275                print(f"{stats.name}:")
3276                print(f"  Success rate: {success_rate:.1%}")
3277                print(f"  Scores created: {stats.total_scores_created}")
3278
3279                if stats.failed_runs > 0:
3280                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3281            ```
3282
3283        Note:
3284            - Evaluator failures are logged but don't stop the batch evaluation
3285            - Individual item failures are tracked but don't stop processing
3286            - Fetch failures are retried with exponential backoff
3287            - All scores are automatically flushed to Langfuse at the end
3288            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3289        """
3290        runner = BatchEvaluationRunner(self)
3291
3292        return cast(
3293            BatchEvaluationResult,
3294            run_async_safely(
3295                runner.run_async(
3296                    scope=scope,
3297                    mapper=mapper,
3298                    evaluators=evaluators,
3299                    filter=filter,
3300                    fetch_batch_size=fetch_batch_size,
3301                    fetch_trace_fields=fetch_trace_fields,
3302                    max_items=max_items,
3303                    max_concurrency=max_concurrency,
3304                    composite_evaluator=composite_evaluator,
3305                    metadata=metadata,
3306                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
3307                    _additional_trace_tags=_additional_trace_tags,
3308                    max_retries=max_retries,
3309                    verbose=verbose,
3310                    resume_from=resume_from,
3311                )
3312            ),
3313        )

Fetch traces or observations and run evaluations on each item.

This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:

  • Running evaluations on production traces after deployment
  • Backtesting new evaluation metrics on historical data
  • Batch scoring of observations for quality monitoring
  • Periodic evaluation runs on recent data

The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.

Arguments:
  • scope: The type of items to evaluate. Must be one of:
    • "traces": Evaluate complete traces with all their observations
    • "observations": Evaluate individual observations (spans, generations, events)
  • mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
  • evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
  • filter: Optional JSON filter string for querying items (same format as Langfuse API). Default: None (fetches all items). Examples:
    • '{"tags": ["production"]}'
    • '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
  • fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
  • fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
  • max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
  • max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 5.
  • composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
  • metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
  • max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
  • verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
  • resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:

BatchEvaluationResult containing:

  • total_items_fetched: Number of items fetched from API
  • total_items_processed: Number of items successfully evaluated
  • total_items_failed: Number of items that failed evaluation
  • total_scores_created: Scores created by item-level evaluators
  • total_composite_scores_created: Scores created by composite evaluator
  • total_evaluations_failed: Individual evaluator failures
  • evaluator_stats: Per-evaluator statistics (success rate, scores created)
  • resume_token: Token for resuming if incomplete (None if completed)
  • completed: True if all items processed
  • duration_seconds: Total execution time
  • failed_item_ids: IDs of items that failed
  • error_summary: Error types and counts
  • has_more_items: True if max_items reached but more exist

Raises:
  • ValueError: If invalid scope is provided.
Examples:

Basic trace evaluation:

from langfuse import Langfuse, EvaluatorInputs, Evaluation

client = Langfuse()

# Define mapper to extract fields from traces
def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,
        metadata={"trace_id": trace.id}
    )

# Define evaluator
def length_evaluator(*, input, output, expected_output, metadata):
    return Evaluation(
        name="output_length",
        value=len(output) if output else 0
    )

# Run batch evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    filter='{"tags": ["production"]}',
    max_items=1000,
    verbose=True
)

print(f"Processed {result.total_items_processed} traces")
print(f"Created {result.total_scores_created} scores")

Evaluation with composite scorer:

def accuracy_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="accuracy", value=0.85)

def relevance_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="relevance", value=0.92)

def composite_evaluator(*, item, evaluations):
    # Weighted average of evaluations
    weights = {"accuracy": 0.6, "relevance": 0.4}
    total = sum(
        e.value * weights.get(e.name, 0)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(
        name="composite_score",
        value=total,
        comment=f"Weighted average of {len(evaluations)} metrics"
    )

result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[accuracy_evaluator, relevance_evaluator],
    composite_evaluator=composite_evaluator,
    filter='{"user_id": "important_user"}',
    verbose=True
)

Handling incomplete runs with resume:

# Initial run that may fail or timeout
result = client.run_batched_evaluation(
    scope="observations",
    mapper=obs_mapper,
    evaluators=[my_evaluator],
    max_items=10000,
    verbose=True
)

# Check if incomplete
if not result.completed and result.resume_token:
    print(f"Processed {result.resume_token.items_processed} items before interruption")

    # Resume from where it left off
    result = client.run_batched_evaluation(
        scope="observations",
        mapper=obs_mapper,
        evaluators=[my_evaluator],
        resume_from=result.resume_token,
        verbose=True
    )

print(f"Total items processed: {result.total_items_processed}")

Monitoring evaluator performance:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs
    print(f"{stats.name}:")
    print(f"  Success rate: {success_rate:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️  Failed {stats.failed_runs} times")
Note:
  • Evaluator failures are logged but don't stop the batch evaluation
  • Individual item failures are tracked but don't stop processing
  • Fetch failures are retried with exponential backoff
  • All scores are automatically flushed to Langfuse at the end
  • The resume mechanism uses timestamp-based filtering to avoid duplicates
def auth_check(self) -> bool:
3315    def auth_check(self) -> bool:
3316        """Check if the provided credentials (public and secret key) are valid.
3317
3318        Raises:
3319            Exception: If no projects were found for the provided credentials.
3320
3321        Note:
3322            This method is blocking. It is discouraged to use it in production code.
3323        """
3324        try:
3325            projects = self.api.projects.get()  # synchronous, un-awaited API call
3326            langfuse_logger.debug(
3327                f"Auth check successful, found {len(projects.data)} projects"
3328            )
3329            if len(projects.data) == 0:  # NOTE(review): the "successful" debug line above is logged before this check
3330                raise Exception(
3331                    "Auth check failed, no project found for the keys provided."
3332                )
3333            return True  # at least one project is visible with these keys
3334
3335        except AttributeError as e:  # logged and reported as False instead of being raised
3336            langfuse_logger.warning(
3337                f"Auth check failed: Client not properly initialized. Error: {e}"
3338            )
3339            return False
3340
3341        except Error as e:  # API error: hand it to handle_fern_exception, then re-raise to the caller
3342            handle_fern_exception(e)
3343            raise e

Check if the provided credentials (public and secret key) are valid.

Raises:
  • Exception: If no projects were found for the provided credentials.
Note:

This method is blocking. It is discouraged to use it in production code.

def create_dataset( self, *, name: str, description: Optional[str] = None, metadata: Optional[Any] = None, input_schema: Optional[Any] = None, expected_output_schema: Optional[Any] = None) -> langfuse.api.Dataset:
3345    def create_dataset(
3346        self,
3347        *,
3348        name: str,
3349        description: Optional[str] = None,
3350        metadata: Optional[Any] = None,
3351        input_schema: Optional[Any] = None,
3352        expected_output_schema: Optional[Any] = None,
3353    ) -> Dataset:
3354        """Create a dataset with the given name on Langfuse.
3355
3356        Args:
3357            name: Name of the dataset to create.
3358            description: Description of the dataset. Defaults to None.
3359            metadata: Additional metadata. Defaults to None.
3360            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3361            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3362
3363        Returns:
3364            Dataset: The created dataset as returned by the Langfuse API.
3365        """
3366        try:
3367            langfuse_logger.debug(f"Creating datasets {name}")
3368
3369            result = self.api.datasets.create(  # all arguments are forwarded unchanged
3370                name=name,
3371                description=description,
3372                metadata=metadata,
3373                input_schema=input_schema,
3374                expected_output_schema=expected_output_schema,
3375            )
3376
3377            return cast(Dataset, result)  # presumably typing.cast: narrows the static type only, no runtime conversion
3378
3379        except Error as e:  # API error: hand it to handle_fern_exception, then re-raise to the caller
3380            handle_fern_exception(e)
3381            raise e

Create a dataset with the given name on Langfuse.

Arguments:
  • name: Name of the dataset to create.
  • description: Description of the dataset. Defaults to None.
  • metadata: Additional metadata. Defaults to None.
  • input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
  • expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:

Dataset: The created dataset as returned by the Langfuse API.

def create_dataset_item( self, *, dataset_name: str, input: Optional[Any] = None, expected_output: Optional[Any] = None, metadata: Optional[Any] = None, source_trace_id: Optional[str] = None, source_observation_id: Optional[str] = None, status: Optional[langfuse.api.DatasetStatus] = None, id: Optional[str] = None) -> langfuse.api.DatasetItem:
3383    def create_dataset_item(
3384        self,
3385        *,
3386        dataset_name: str,
3387        input: Optional[Any] = None,
3388        expected_output: Optional[Any] = None,
3389        metadata: Optional[Any] = None,
3390        source_trace_id: Optional[str] = None,
3391        source_observation_id: Optional[str] = None,
3392        status: Optional[DatasetStatus] = None,
3393        id: Optional[str] = None,
3394    ) -> DatasetItem:
3395        """Create a dataset item.
3396
3397        Upserts if an item with id already exists. LangfuseMedia objects found in input, expected_output or metadata are uploaded and replaced by their reference strings.
3398
3399        Args:
3400            dataset_name: Name of the dataset in which the dataset item should be created.
3401            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3402            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3403            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3404            source_trace_id: Id of the source trace. Defaults to None.
3405            source_observation_id: Id of the source observation. Defaults to None.
3406            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3407            id: Id of the dataset item. Defaults to None, in which case a random UUID4 is generated client-side. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3408
3409        Returns:
3410            DatasetItem: The created dataset item as returned by the Langfuse API.
3411
3412        Example:
3413            ```python
3414            from langfuse import Langfuse
3415
3416            langfuse = Langfuse()
3417
3418            # Uploading items to the Langfuse dataset named "capital_cities"
3419            langfuse.create_dataset_item(
3420                dataset_name="capital_cities",
3421                input={"input": {"country": "Italy"}},
3422                expected_output={"expected_output": "Rome"},
3423                metadata={"foo": "bar"}
3424            )
3425            ```
3426        """
3427        try:
3428            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
3429
3430            # Media uploads must reference the (dataset, item) they belong to, and
3431            # the item need not exist yet — so settle on the item id up front and
3432            # reuse it for the create call below.
3433            item_id = id if id is not None else str(uuid.uuid4())
3434
3435            # Single pass per field: swap each LangfuseMedia for its reference
3436            # string (derived from content, not the upload) and collect the media
3437            # still to upload, deduped by media id and tagged with its field.
3438            pending_media: Dict[str, Tuple[LangfuseMedia, str]] = {}
3439            input = self._process_dataset_item_media(
3440                data=input,
3441                pending_media=pending_media,
3442                field=DatasetItemMediaReferenceField.INPUT.value,
3443            )
3444            expected_output = self._process_dataset_item_media(
3445                data=expected_output,
3446                pending_media=pending_media,
3447                field=DatasetItemMediaReferenceField.EXPECTED_OUTPUT.value,
3448            )
3449            metadata = self._process_dataset_item_media(
3450                data=metadata,
3451                pending_media=pending_media,
3452                field=DatasetItemMediaReferenceField.METADATA.value,
3453            )
3454
3455            # The upload needs the dataset id, but the create API only takes the
3456            # name. Resolve it once, and only when there is actually media to
3457            # upload — a plain item pays no extra datasets.get round-trip.
3458            if pending_media:
3459                assert self._resources is not None  # the media manager below lives on _resources; note asserts are skipped under -O
3460                dataset_id = self.api.datasets.get(self._url_encode(dataset_name)).id
3461                for media, field in pending_media.values():  # uploads run before the item itself is created
3462                    self._resources._media_manager._upload_media_sync(
3463                        media=media,
3464                        dataset_id=dataset_id,
3465                        dataset_item_id=item_id,
3466                        field=field,
3467                    )
3468
3469            result = self.api.dataset_items.create(
3470                dataset_name=dataset_name,
3471                input=input,
3472                expected_output=expected_output,
3473                metadata=metadata,
3474                source_trace_id=source_trace_id,
3475                source_observation_id=source_observation_id,
3476                status=status,
3477                id=item_id,
3478            )
3479
3480            return cast(DatasetItem, result)
3481        except Error as e:  # API error: hand it to handle_fern_exception, then re-raise to the caller
3482            handle_fern_exception(e)
3483            raise e

Create a dataset item.

Upserts if an item with id already exists.

Arguments:
  • dataset_name: Name of the dataset in which the dataset item should be created.
  • input: Input data. Defaults to None. Can contain any dict, list or scalar.
  • expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
  • metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
  • source_trace_id: Id of the source trace. Defaults to None.
  • source_observation_id: Id of the source observation. Defaults to None.
  • status: Status of the dataset item. Defaults to ACTIVE for newly created items.
  • id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:

DatasetItem: The created dataset item as returned by the Langfuse API.

Example:
from langfuse import Langfuse

langfuse = Langfuse()

# Uploading items to the Langfuse dataset named "capital_cities"
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"input": {"country": "Italy"}},
    expected_output={"expected_output": "Rome"},
    metadata={"foo": "bar"}
)
def resolve_media_references( self, *, obj: Any, resolve_with: Literal['base64_data_uri'], max_depth: int = 10, content_fetch_timeout_seconds: int = 5) -> Any:
3609    def resolve_media_references(
3610        self,
3611        *,
3612        obj: Any,
3613        resolve_with: Literal["base64_data_uri"],
3614        max_depth: int = 10,
3615        content_fetch_timeout_seconds: int = 5,
3616    ) -> Any:
3617        """Replace media reference strings in an object with base64 data URIs.
3618
3619        This method recursively traverses an object (up to max_depth) looking for media reference strings
3620        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3621        this Langfuse client and replaces the reference string with a base64 data URI.
3622
3623        If fetching media content fails for a reference string, a warning is logged and the reference
3624        string is left unchanged.
3625
3626        Args:
3627            obj: The object to process. Can be a primitive value, array, or nested object.
3628                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3629            resolve_with: The representation of the media content to replace the media reference string with.
3630                Currently only "base64_data_uri" is supported.
3631            max_depth: int: The maximum depth to traverse the object. Default is 10.
3632            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3633
3634        Returns:
3635            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3636            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3637
3638        Example:
3639            ```python
3640            obj = {
3641                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3642                "nested": {
3643                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3644                }
3645            }
3646            result = langfuse_client.resolve_media_references(obj=obj, resolve_with="base64_data_uri")
3647            # Result:
3648            # {
3649            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3650            #     "nested": {
3651            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3652            #     }
3653            # }
3654            ```
3655        """
3656        return LangfuseMedia.resolve_media_references(  # thin wrapper: delegates with this client as langfuse_client
3657            langfuse_client=self,
3658            obj=obj,
3659            resolve_with=resolve_with,
3660            max_depth=max_depth,
3661            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3662        )

Replace media reference strings in an object with base64 data URIs.

This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.

If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.

Arguments:
  • obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
  • resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
  • max_depth: int: The maximum depth to traverse the object. Default is 10.
  • content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:

A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

Example:

obj = {
    "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
    "nested": {
        "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
    }
}

result = langfuse_client.resolve_media_references(obj=obj, resolve_with="base64_data_uri")

# Result:
# {
#     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
#     "nested": {
#         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
#     }
# }

def get_prompt( self, name: str, *, version: Optional[int] = None, label: Optional[str] = None, type: Literal['chat', 'text'] = 'text', cache_ttl_seconds: Optional[int] = None, fallback: Union[List[langfuse.model.ChatMessageDict], NoneType, str] = None, max_retries: Optional[int] = None, fetch_timeout_seconds: Optional[int] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3692    def get_prompt(
3693        self,
3694        name: str,
3695        *,
3696        version: Optional[int] = None,
3697        label: Optional[str] = None,
3698        type: Literal["chat", "text"] = "text",
3699        cache_ttl_seconds: Optional[int] = None,
3700        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3701        max_retries: Optional[int] = None,
3702        fetch_timeout_seconds: Optional[int] = None,
3703    ) -> PromptClient:
3704        """Get a prompt.
3705
3706        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3707        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3708        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3709        return the expired prompt as a fallback.
3710
3711        Args:
3712            name (str): The name of the prompt to retrieve.
3713
3714        Keyword Args:
3715            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3716            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3717            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3718                keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3719            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3720            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt to return if fetching the prompt fails (a string for type "text", a list of chat messages for type "chat"). Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3721            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3722            fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
3723
3724        Returns:
3725            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
3726            - TextPromptClient, if type argument is 'text'.
3727            - ChatPromptClient, if type argument is 'chat'.
3728
3729        Raises:
3730            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3731                expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3732        """
3733        if self._resources is None:
3734            raise Error(
3735                "SDK is not correctly initialized. Check the init logs for more details."
3736            )
3737        if version is not None and label is not None:
3738            raise ValueError("Cannot specify both version and label at the same time.")
3739
3740        if not name:
3741            raise ValueError("Prompt name cannot be empty.")
3742
3743        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3744        bounded_max_retries = self._get_bounded_max_retries(
3745            max_retries, default_max_retries=2, max_retries_upper_bound=4
3746        )
3747
3748        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3749        cached_prompt = self._resources.prompt_cache.get(cache_key)
3750
3751        if cached_prompt is None or cache_ttl_seconds == 0:
3752            langfuse_logger.debug(
3753                f"Prompt '{cache_key}' not found in cache or caching disabled."
3754            )
3755            try:
3756                return self._fetch_prompt_and_update_cache(
3757                    name,
3758                    version=version,
3759                    label=label,
3760                    ttl_seconds=cache_ttl_seconds,
3761                    max_retries=bounded_max_retries,
3762                    fetch_timeout_seconds=fetch_timeout_seconds,
3763                )
3764            except Exception as e:
3765                if fallback:  # an empty string / empty list fallback is falsy and is treated as "no fallback"
3766                    langfuse_logger.warning(
3767                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3768                    )
3769
3770                    fallback_client_args: Dict[str, Any] = {
3771                        "name": name,
3772                        "prompt": fallback,
3773                        "type": type,
3774                        "version": version or 0,
3775                        "config": {},
3776                        "labels": [label] if label else [],
3777                        "tags": [],
3778                    }
3779
3780                    if type == "text":
3781                        return TextPromptClient(
3782                            prompt=Prompt_Text(**fallback_client_args),
3783                            is_fallback=True,
3784                        )
3785
3786                    if type == "chat":
3787                        return ChatPromptClient(
3788                            prompt=Prompt_Chat(**fallback_client_args),
3789                            is_fallback=True,
3790                        )
3791
3792                raise e  # no usable fallback (or unrecognised type): propagate the fetch error
3793
3794        if cached_prompt.is_expired():
3795            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3796            try:
3797                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3798                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3799
3800                def refresh_task() -> None:
3801                    self._fetch_prompt_and_update_cache(
3802                        name,
3803                        version=version,
3804                        label=label,
3805                        ttl_seconds=cache_ttl_seconds,
3806                        max_retries=bounded_max_retries,
3807                        fetch_timeout_seconds=fetch_timeout_seconds,
3808                    )
3809
3810                self._resources.prompt_cache.add_refresh_prompt_task_if_current(
3811                    cache_key,
3812                    cached_prompt,
3813                    refresh_task,
3814                )
3815                langfuse_logger.debug(
3816                    f"Returning stale prompt '{cache_key}' from cache."
3817                )
3818                # return stale prompt
3819                return cached_prompt.value
3820
3821            except Exception as e:
3822                langfuse_logger.warning(
3823                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3824                )
3825                # creation of refresh prompt task failed, return stale prompt
3826                return cached_prompt.value
3827
3828        return cached_prompt.value  # cache hit that is not expired

Get a prompt.

This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.

Arguments:
  • name (str): The name of the prompt to retrieve.
Keyword Args:
  • version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
  • type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
  • fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
  • max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
  • fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
Returns:

The prompt object retrieved from the cache or directly fetched if not cached or expired of type

  • TextPromptClient, if type argument is 'text'.
  • ChatPromptClient, if type argument is 'chat'.
Raises:
  • Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
def create_prompt( self, *, name: str, prompt: Union[str, List[Union[langfuse.model.ChatMessageDict, langfuse.model.ChatMessageWithPlaceholdersDict_Message, langfuse.model.ChatMessageWithPlaceholdersDict_Placeholder]]], labels: List[str] = [], tags: Optional[List[str]] = None, type: Optional[Literal['chat', 'text']] = 'text', config: Optional[Any] = None, commit_message: Optional[str] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3930    def create_prompt(
3931        self,
3932        *,
3933        name: str,
3934        prompt: Union[
3935            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3936        ],
3937        labels: List[str] = [],  # NOTE(review): mutable default; it is only read (logged/forwarded) in this method
3938        tags: Optional[List[str]] = None,
3939        type: Optional[Literal["chat", "text"]] = "text",
3940        config: Optional[Any] = None,
3941        commit_message: Optional[str] = None,
3942    ) -> PromptClient:
3943        """Create a new prompt in Langfuse.
3944
3945        Keyword Args:
3946            name : The name of the prompt to be created.
3947            prompt : The content of the prompt to be created. Must be a list of chat messages for type "chat" and a string otherwise.
3948            is_active [REMOVED] : Not a parameter of this method (it is absent from the signature). Use the 'production' label via `labels` instead.
3949            labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
3950            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3951            config: Additional structured data to be saved with the prompt. Defaults to None, in which case an empty dict is sent.
3952            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3953            commit_message: Optional string describing the change.
3954
3955        Returns:
3956            TextPromptClient: The prompt if type argument is 'text'.
3957            ChatPromptClient: The prompt if type argument is 'chat'.
3958        """
3959        try:
3960            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3961
3962            if type == "chat":
3963                if not isinstance(prompt, list):
3964                    raise ValueError(
3965                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3966                    )
3967                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
3968                    CreateChatPromptRequest(
3969                        name=name,
3970                        prompt=cast(Any, prompt),
3971                        labels=labels,
3972                        tags=tags,
3973                        config=config or {},
3974                        commit_message=commit_message,
3975                        type=CreateChatPromptType.CHAT,
3976                    )
3977                )
3978                server_prompt = self.api.prompts.create(request=request)
3979
3980                if self._resources is not None:  # drop cached entries for this name so the new version can be fetched
3981                    self._resources.prompt_cache.invalidate(name)
3982
3983                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3984
3985            if not isinstance(prompt, str):  # every type other than "chat" (including None) takes the text path
3986                raise ValueError("For 'text' type, 'prompt' must be a string.")
3987
3988            request = CreateTextPromptRequest(
3989                name=name,
3990                prompt=prompt,
3991                labels=labels,
3992                tags=tags,
3993                config=config or {},
3994                commit_message=commit_message,
3995            )
3996
3997            server_prompt = self.api.prompts.create(request=request)
3998
3999            if self._resources is not None:
4000                self._resources.prompt_cache.invalidate(name)
4001
4002            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
4003
4004        except Error as e:  # API error: hand it to handle_fern_exception, then re-raise; ValueError above passes through untouched
4005            handle_fern_exception(e)
4006            raise e

Create a new prompt in Langfuse.

Keyword Args:
  • name : The name of the prompt to be created.
  • prompt : The content of the prompt to be created.
  • is_active [REMOVED] : Not a parameter of this method (it is absent from the signature above). Use the 'production' label via labels instead.
  • labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
  • tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
  • config: Additional structured data to be saved with the prompt. Defaults to None.
  • type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
  • commit_message: Optional string describing the change.
Returns:

TextPromptClient: The prompt if type argument is 'text'. ChatPromptClient: The prompt if type argument is 'chat'.

def update_prompt(self, *, name: str, version: int, new_labels: List[str] = []) -> Any:
4008    def update_prompt(
4009        self,
4010        *,
4011        name: str,
4012        version: int,
4013        new_labels: List[str] = [],
4014    ) -> Any:
4015        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name.
4016
4017        Args:
4018            name (str): The name of the prompt to update.
4019            version (int): The version number of the prompt to update.
4020            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
4021
4022        Returns:
4023            Prompt: The updated prompt from the Langfuse API.
4024
4025        """
4026        updated_prompt = self.api.prompt_version.update(
4027            name=self._url_encode(name),
4028            version=version,
4029            new_labels=new_labels,
4030        )
4031
4032        if self._resources is not None:
4033            self._resources.prompt_cache.invalidate(name)
4034
4035        return updated_prompt

Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.

Arguments:
  • name (str): The name of the prompt to update.
  • version (int): The version number of the prompt to update.
  • new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:

Prompt: The updated prompt from the Langfuse API.

def clear_prompt_cache(self) -> None:
4050    def clear_prompt_cache(self) -> None:
4051        """Clear the entire prompt cache, removing all cached prompts.
4052
4053        This method is useful when you want to force a complete refresh of all
4054        cached prompts, for example after major updates or when you need to
4055        ensure the latest versions are fetched from the server.
4056        """
4057        if self._resources is not None:
4058            self._resources.prompt_cache.clear()

Clear the entire prompt cache, removing all cached prompts.

This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.

class LangfuseMedia:
 99class LangfuseMedia:
100    """A class for wrapping media objects for upload to Langfuse.
101
102    This class handles the preparation and formatting of media content for Langfuse,
103    supporting both base64 data URIs and raw content bytes.
104
105    Args:
106        obj (Optional[object]): The source object to be wrapped. Can be accessed via the `obj` attribute.
107        base64_data_uri (Optional[str]): A base64-encoded data URI containing the media content
108            and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
109        content_type (Optional[str]): The MIME type of the media content when providing raw bytes.
110        content_bytes (Optional[bytes]): Raw bytes of the media content.
111        file_path (Optional[str]): The path to the file containing the media content. For relative paths,
112            the current working directory is used.
113
114    Raises:
115        ValueError: If neither base64_data_uri or the combination of content_bytes
116            and content_type is provided.
117    """
118
119    obj: object
120
121    _content_bytes: Optional[bytes]
122    _content_type: Optional[MediaContentType]
123    _source: Optional[str]
124    _media_id: Optional[str]
125
126    def __init__(
127        self,
128        *,
129        obj: Optional[object] = None,
130        base64_data_uri: Optional[str] = None,
131        content_type: Optional[MediaContentType] = None,
132        content_bytes: Optional[bytes] = None,
133        file_path: Optional[str] = None,
134    ):
135        """Initialize a LangfuseMedia object.
136
137        Args:
138            obj: The object to wrap.
139
140            base64_data_uri: A base64-encoded data URI containing the media content
141                and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
142            content_type: The MIME type of the media content when providing raw bytes or reading from a file.
143            content_bytes: Raw bytes of the media content.
144            file_path: The path to the file containing the media content. For relative paths,
145                the current working directory is used.
146        """
147        self.obj = obj
148
149        if base64_data_uri is not None:
150            parsed_data = self._parse_base64_data_uri(base64_data_uri)
151            self._content_bytes, self._content_type = parsed_data
152            self._source = "base64_data_uri"
153
154        elif content_bytes is not None and content_type is not None:
155            self._content_type = content_type
156            self._content_bytes = content_bytes
157            self._source = "bytes"
158        elif (
159            file_path is not None
160            and content_type is not None
161            and os.path.exists(file_path)
162        ):
163            self._content_bytes = self._read_file(file_path)
164            self._content_type = content_type if self._content_bytes else None
165            self._source = "file" if self._content_bytes else None
166        else:
167            logger.error(
168                "base64_data_uri, or content_bytes and content_type, or file_path must be provided to LangfuseMedia"
169            )
170
171            self._content_bytes = None
172            self._content_type = None
173            self._source = None
174
175        self._media_id = self._get_media_id()
176
177    def _read_file(self, file_path: str) -> Optional[bytes]:
178        try:
179            with open(file_path, "rb") as file:
180                return file.read()
181        except Exception as e:
182            logger.error(f"Error reading file at path {file_path}", exc_info=e)
183
184            return None
185
186    def _get_media_id(self) -> Optional[str]:
187        content_hash = self._content_sha256_hash
188
189        if content_hash is None:
190            return None
191
192        # Convert hash to base64Url
193        url_safe_content_hash = content_hash.replace("+", "-").replace("/", "_")
194
195        return url_safe_content_hash[:22]
196
197    @property
198    def _content_length(self) -> Optional[int]:
199        return len(self._content_bytes) if self._content_bytes else None
200
201    @property
202    def _content_sha256_hash(self) -> Optional[str]:
203        if self._content_bytes is None:
204            return None
205
206        sha256_hash_bytes = hashlib.sha256(self._content_bytes).digest()
207
208        return base64.b64encode(sha256_hash_bytes).decode("utf-8")
209
210    @property
211    def _reference_string(self) -> Optional[str]:
212        if self._content_type is None or self._source is None or self._media_id is None:
213            return None
214
215        return f"@@@langfuseMedia:type={self._content_type}|id={self._media_id}|source={self._source}@@@"
216
217    @staticmethod
218    def parse_reference_string(reference_string: str) -> ParsedMediaReference:
219        """Parse a media reference string into a ParsedMediaReference.
220
221        Example reference string:
222            "@@@langfuseMedia:type=image/jpeg|id=some-uuid|source=base64_data_uri@@@"
223
224        Args:
225            reference_string: The reference string to parse.
226
227        Returns:
228            A TypedDict with the media_id, source, and content_type.
229
230        Raises:
231            ValueError: If the reference string is empty or not a string.
232            ValueError: If the reference string does not start with "@@@langfuseMedia:type=".
233            ValueError: If the reference string does not end with "@@@".
234            ValueError: If the reference string is missing required fields.
235        """
236        if not reference_string:
237            raise ValueError("Reference string is empty")
238
239        if not isinstance(reference_string, str):
240            raise ValueError("Reference string is not a string")
241
242        if not reference_string.startswith("@@@langfuseMedia:type="):
243            raise ValueError(
244                "Reference string does not start with '@@@langfuseMedia:type='"
245            )
246
247        if not reference_string.endswith("@@@"):
248            raise ValueError("Reference string does not end with '@@@'")
249
250        content = reference_string[len("@@@langfuseMedia:") :].rstrip("@@@")
251
252        # Split into key-value pairs
253        pairs = content.split("|")
254        parsed_data = {}
255
256        for pair in pairs:
257            key, value = pair.split("=", 1)
258            parsed_data[key] = value
259
260        # Verify all required fields are present
261        if not all(key in parsed_data for key in ["type", "id", "source"]):
262            raise ValueError("Missing required fields in reference string")
263
264        return ParsedMediaReference(
265            media_id=parsed_data["id"],
266            source=parsed_data["source"],
267            content_type=cast(MediaContentType, parsed_data["type"]),
268        )
269
270    def _parse_base64_data_uri(
271        self, data: str
272    ) -> Tuple[Optional[bytes], Optional[MediaContentType]]:
273        # Example data URI: data:image/jpeg;base64,/9j/4AAQ...
274        try:
275            if not data or not isinstance(data, str):
276                raise ValueError("Data URI is not a string")
277
278            if not data.startswith("data:"):
279                raise ValueError("Data URI does not start with 'data:'")
280
281            header, actual_data = data[5:].split(",", 1)
282            if not header or not actual_data:
283                raise ValueError("Invalid URI")
284
285            # Split header into parts and check for base64
286            header_parts = header.split(";")
287            if "base64" not in header_parts:
288                raise ValueError("Data is not base64 encoded")
289
290            # Content type is the first part
291            content_type = header_parts[0]
292            if not content_type:
293                raise ValueError("Content type is empty")
294
295            return base64.b64decode(actual_data), cast(MediaContentType, content_type)
296
297        except Exception as e:
298            logger.error("Error parsing base64 data URI", exc_info=e)
299
300            return None, None
301
302    @staticmethod
303    def resolve_media_references(
304        *,
305        obj: T,
306        langfuse_client: "Langfuse",
307        resolve_with: Literal["base64_data_uri"],
308        max_depth: int = 10,
309        content_fetch_timeout_seconds: int = 10,
310    ) -> T:
311        """Replace media reference strings in an object with base64 data URIs.
312
313        This method recursively traverses an object (up to max_depth) looking for media reference strings
314        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
315        the provided Langfuse client and replaces the reference string with a base64 data URI.
316
317        If fetching media content fails for a reference string, a warning is logged and the reference
318        string is left unchanged.
319
320        Args:
321            obj: The object to process. Can be a primitive value, array, or nested object.
322                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
323            langfuse_client: Langfuse client instance used to fetch media content.
324            resolve_with: The representation of the media content to replace the media reference string with.
325                Currently only "base64_data_uri" is supported.
326            max_depth: Optional. Default is 10. The maximum depth to traverse the object.
327
328        Returns:
329            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
330            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
331
332        Example:
333            obj = {
334                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
335                "nested": {
336                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
337                }
338            }
339
340            result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
341
342            # Result:
343            # {
344            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
345            #     "nested": {
346            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
347            #     }
348            # }
349        """
350
351        def traverse(obj: Any, depth: int) -> Any:
352            if depth > max_depth:
353                return obj
354
355            # Handle string
356            if isinstance(obj, str):
357                regex = r"@@@langfuseMedia:.+?@@@"
358                reference_string_matches = re.findall(regex, obj)
359                if len(reference_string_matches) == 0:
360                    return obj
361
362                result = obj
363                reference_string_to_media_content = {}
364                httpx_client = (
365                    langfuse_client._resources.httpx_client
366                    if langfuse_client._resources is not None
367                    else None
368                )
369
370                for reference_string in reference_string_matches:
371                    try:
372                        parsed_media_reference = LangfuseMedia.parse_reference_string(
373                            reference_string
374                        )
375                        media_data = langfuse_client.api.media.get(
376                            parsed_media_reference["media_id"]
377                        )
378                        media_content = (
379                            httpx_client.get(
380                                media_data.url,
381                                timeout=content_fetch_timeout_seconds,
382                            )
383                            if httpx_client is not None
384                            else httpx.get(
385                                media_data.url, timeout=content_fetch_timeout_seconds
386                            )
387                        )
388                        media_content.raise_for_status()
389
390                        base64_media_content = base64.b64encode(
391                            media_content.content
392                        ).decode()
393                        base64_data_uri = f"data:{media_data.content_type};base64,{base64_media_content}"
394
395                        reference_string_to_media_content[reference_string] = (
396                            base64_data_uri
397                        )
398                    except Exception as e:
399                        logger.warning(
400                            f"Error fetching media content for reference string {reference_string}: {e}"
401                        )
402                        # Do not replace the reference string if there's an error
403                        continue
404
405                for (
406                    ref_str,
407                    media_content_str,
408                ) in reference_string_to_media_content.items():
409                    result = result.replace(ref_str, media_content_str)
410
411                return result
412
413            # Handle arrays
414            if isinstance(obj, list):
415                return [traverse(item, depth + 1) for item in obj]
416
417            # Handle dictionaries
418            if isinstance(obj, dict):
419                return {key: traverse(value, depth + 1) for key, value in obj.items()}
420
421            # Handle objects:
422            if hasattr(obj, "__dict__"):
423                return {
424                    key: traverse(value, depth + 1)
425                    for key, value in obj.__dict__.items()
426                }
427
428            return obj
429
430        return cast(T, traverse(obj, 0))

A class for wrapping media objects for upload to Langfuse.

This class handles the preparation and formatting of media content for Langfuse, supporting both base64 data URIs and raw content bytes.

Arguments:
  • obj (Optional[object]): The source object to be wrapped. Can be accessed via the obj attribute.
  • base64_data_uri (Optional[str]): A base64-encoded data URI containing the media content and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
  • content_type (Optional[str]): The MIME type of the media content when providing raw bytes.
  • content_bytes (Optional[bytes]): Raw bytes of the media content.
  • file_path (Optional[str]): The path to the file containing the media content. For relative paths, the current working directory is used.
Raises:
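  • ValueError: If neither base64_data_uri nor the combination of content_bytes and content_type is provided. Note that the constructor shown above logs an error and leaves the media content empty in this case rather than raising.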
LangfuseMedia( *, obj: Optional[object] = None, base64_data_uri: Optional[str] = None, content_type: Optional[langfuse.api.MediaContentType] = None, content_bytes: Optional[bytes] = None, file_path: Optional[str] = None)
126    def __init__(
127        self,
128        *,
129        obj: Optional[object] = None,
130        base64_data_uri: Optional[str] = None,
131        content_type: Optional[MediaContentType] = None,
132        content_bytes: Optional[bytes] = None,
133        file_path: Optional[str] = None,
134    ):
135        """Initialize a LangfuseMedia object.
136
137        Args:
138            obj: The object to wrap.
139
140            base64_data_uri: A base64-encoded data URI containing the media content
141                and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
142            content_type: The MIME type of the media content when providing raw bytes or reading from a file.
143            content_bytes: Raw bytes of the media content.
144            file_path: The path to the file containing the media content. For relative paths,
145                the current working directory is used.
146        """
147        self.obj = obj
148
149        if base64_data_uri is not None:
150            parsed_data = self._parse_base64_data_uri(base64_data_uri)
151            self._content_bytes, self._content_type = parsed_data
152            self._source = "base64_data_uri"
153
154        elif content_bytes is not None and content_type is not None:
155            self._content_type = content_type
156            self._content_bytes = content_bytes
157            self._source = "bytes"
158        elif (
159            file_path is not None
160            and content_type is not None
161            and os.path.exists(file_path)
162        ):
163            self._content_bytes = self._read_file(file_path)
164            self._content_type = content_type if self._content_bytes else None
165            self._source = "file" if self._content_bytes else None
166        else:
167            logger.error(
168                "base64_data_uri, or content_bytes and content_type, or file_path must be provided to LangfuseMedia"
169            )
170
171            self._content_bytes = None
172            self._content_type = None
173            self._source = None
174
175        self._media_id = self._get_media_id()

Initialize a LangfuseMedia object.

Arguments:
  • obj: The object to wrap.
  • base64_data_uri: A base64-encoded data URI containing the media content and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
  • content_type: The MIME type of the media content when providing raw bytes or reading from a file.
  • content_bytes: Raw bytes of the media content.
  • file_path: The path to the file containing the media content. For relative paths, the current working directory is used.
obj: object
@staticmethod
def parse_reference_string(reference_string: str) -> langfuse.types.ParsedMediaReference:
217    @staticmethod
218    def parse_reference_string(reference_string: str) -> ParsedMediaReference:
219        """Parse a media reference string into a ParsedMediaReference.
220
221        Example reference string:
222            "@@@langfuseMedia:type=image/jpeg|id=some-uuid|source=base64_data_uri@@@"
223
224        Args:
225            reference_string: The reference string to parse.
226
227        Returns:
228            A TypedDict with the media_id, source, and content_type.
229
230        Raises:
231            ValueError: If the reference string is empty or not a string.
232            ValueError: If the reference string does not start with "@@@langfuseMedia:type=".
233            ValueError: If the reference string does not end with "@@@".
234            ValueError: If the reference string is missing required fields.
235        """
236        if not reference_string:
237            raise ValueError("Reference string is empty")
238
239        if not isinstance(reference_string, str):
240            raise ValueError("Reference string is not a string")
241
242        if not reference_string.startswith("@@@langfuseMedia:type="):
243            raise ValueError(
244                "Reference string does not start with '@@@langfuseMedia:type='"
245            )
246
247        if not reference_string.endswith("@@@"):
248            raise ValueError("Reference string does not end with '@@@'")
249
250        content = reference_string[len("@@@langfuseMedia:") :].rstrip("@@@")
251
252        # Split into key-value pairs
253        pairs = content.split("|")
254        parsed_data = {}
255
256        for pair in pairs:
257            key, value = pair.split("=", 1)
258            parsed_data[key] = value
259
260        # Verify all required fields are present
261        if not all(key in parsed_data for key in ["type", "id", "source"]):
262            raise ValueError("Missing required fields in reference string")
263
264        return ParsedMediaReference(
265            media_id=parsed_data["id"],
266            source=parsed_data["source"],
267            content_type=cast(MediaContentType, parsed_data["type"]),
268        )

Parse a media reference string into a ParsedMediaReference.

Example reference string:

"@@@langfuseMedia:type=image/jpeg|id=some-uuid|source=base64_data_uri@@@"

Arguments:
  • reference_string: The reference string to parse.
Returns:

A TypedDict with the media_id, source, and content_type.

Raises:
  • ValueError: If the reference string is empty or not a string.
  • ValueError: If the reference string does not start with "@@@langfuseMedia:type=".
  • ValueError: If the reference string does not end with "@@@".
  • ValueError: If the reference string is missing required fields.
@staticmethod
def resolve_media_references( *, obj: ~T, langfuse_client: Langfuse, resolve_with: Literal['base64_data_uri'], max_depth: int = 10, content_fetch_timeout_seconds: int = 10) -> ~T:
302    @staticmethod
303    def resolve_media_references(
304        *,
305        obj: T,
306        langfuse_client: "Langfuse",
307        resolve_with: Literal["base64_data_uri"],
308        max_depth: int = 10,
309        content_fetch_timeout_seconds: int = 10,
310    ) -> T:
311        """Replace media reference strings in an object with base64 data URIs.
312
313        This method recursively traverses an object (up to max_depth) looking for media reference strings
314        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
315        the provided Langfuse client and replaces the reference string with a base64 data URI.
316
317        If fetching media content fails for a reference string, a warning is logged and the reference
318        string is left unchanged.
319
320        Args:
321            obj: The object to process. Can be a primitive value, array, or nested object.
322                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
323            langfuse_client: Langfuse client instance used to fetch media content.
324            resolve_with: The representation of the media content to replace the media reference string with.
325                Currently only "base64_data_uri" is supported.
326            max_depth: Optional. Default is 10. The maximum depth to traverse the object.
327
328        Returns:
329            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
330            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
331
332        Example:
333            obj = {
334                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
335                "nested": {
336                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
337                }
338            }
339
340            result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
341
342            # Result:
343            # {
344            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
345            #     "nested": {
346            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
347            #     }
348            # }
349        """
350
351        def traverse(obj: Any, depth: int) -> Any:
352            if depth > max_depth:
353                return obj
354
355            # Handle string
356            if isinstance(obj, str):
357                regex = r"@@@langfuseMedia:.+?@@@"
358                reference_string_matches = re.findall(regex, obj)
359                if len(reference_string_matches) == 0:
360                    return obj
361
362                result = obj
363                reference_string_to_media_content = {}
364                httpx_client = (
365                    langfuse_client._resources.httpx_client
366                    if langfuse_client._resources is not None
367                    else None
368                )
369
370                for reference_string in reference_string_matches:
371                    try:
372                        parsed_media_reference = LangfuseMedia.parse_reference_string(
373                            reference_string
374                        )
375                        media_data = langfuse_client.api.media.get(
376                            parsed_media_reference["media_id"]
377                        )
378                        media_content = (
379                            httpx_client.get(
380                                media_data.url,
381                                timeout=content_fetch_timeout_seconds,
382                            )
383                            if httpx_client is not None
384                            else httpx.get(
385                                media_data.url, timeout=content_fetch_timeout_seconds
386                            )
387                        )
388                        media_content.raise_for_status()
389
390                        base64_media_content = base64.b64encode(
391                            media_content.content
392                        ).decode()
393                        base64_data_uri = f"data:{media_data.content_type};base64,{base64_media_content}"
394
395                        reference_string_to_media_content[reference_string] = (
396                            base64_data_uri
397                        )
398                    except Exception as e:
399                        logger.warning(
400                            f"Error fetching media content for reference string {reference_string}: {e}"
401                        )
402                        # Do not replace the reference string if there's an error
403                        continue
404
405                for (
406                    ref_str,
407                    media_content_str,
408                ) in reference_string_to_media_content.items():
409                    result = result.replace(ref_str, media_content_str)
410
411                return result
412
413            # Handle arrays
414            if isinstance(obj, list):
415                return [traverse(item, depth + 1) for item in obj]
416
417            # Handle dictionaries
418            if isinstance(obj, dict):
419                return {key: traverse(value, depth + 1) for key, value in obj.items()}
420
421            # Handle objects:
422            if hasattr(obj, "__dict__"):
423                return {
424                    key: traverse(value, depth + 1)
425                    for key, value in obj.__dict__.items()
426                }
427
428            return obj
429
430        return cast(T, traverse(obj, 0))

Replace media reference strings in an object with base64 data URIs.

This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.

If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.

Arguments:
  • obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
  • langfuse_client: Langfuse client instance used to fetch media content.
  • resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
  • max_depth: Optional. Default is 10. The maximum depth to traverse the object.
Returns:

A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

Example:

obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }

result = LangfuseMedia.resolve_media_references(obj=obj, langfuse_client=langfuse_client, resolve_with="base64_data_uri")

Result:

{

"image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",

"nested": {

"pdf": "data:application/pdf;base64,JVBERi0xLjcK..."

}

}

@dataclass(frozen=True)
class LangfuseMediaReference:
24@dataclass(frozen=True)
25class LangfuseMediaReference:
26    """Resolved reference to media stored in Langfuse."""
27
28    media_id: str
29    content_type: str
30    url: str
31    url_expiry: Optional[str] = None
32    content_length: Optional[int] = None
33    reference_string: Optional[str] = None
34
35    def is_url_expired(self) -> bool:
36        """Return whether the signed URL is already expired."""
37        if self.url_expiry is None:
38            return False
39
40        expiry = self.url_expiry.replace("Z", "+00:00")
41
42        try:
43            expiry_datetime = datetime.fromisoformat(expiry)
44        except ValueError:
45            return False
46
47        if expiry_datetime.tzinfo is None:
48            expiry_datetime = expiry_datetime.replace(tzinfo=timezone.utc)
49
50        return expiry_datetime <= datetime.now(timezone.utc)
51
52    def fetch_bytes(
53        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
54    ) -> bytes:
55        """Fetch the media content from the signed URL.
56
57        Args:
58            timeout: Request timeout in seconds.
59            client: Optional httpx client to use for the request. Pass this to
60                honor custom transport settings (proxy, CA bundle, mTLS) — in
61                particular when multiple Langfuse clients are configured, since
62                the SDK cannot otherwise tell which client produced this
63                reference. When omitted, the single configured client is used,
64                falling back to a default httpx client.
65        """
66        from langfuse._client.resource_manager import LangfuseResourceManager
67
68        httpx_client = client or LangfuseResourceManager.get_singleton_httpx_client()
69        response = (
70            httpx_client.get(self.url, timeout=timeout)
71            if httpx_client is not None
72            else httpx.get(self.url, timeout=timeout)
73        )
74        response.raise_for_status()
75
76        return response.content
77
78    def fetch_base64(
79        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
80    ) -> str:
81        """Fetch media and return raw base64 without a data URI prefix.
82
83        See :meth:`fetch_bytes` for the ``client`` argument.
84        """
85        return base64.b64encode(
86            self.fetch_bytes(timeout=timeout, client=client)
87        ).decode()
88
89    def fetch_data_uri(
90        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
91    ) -> str:
92        """Fetch media and return it as a data URI.
93
94        See :meth:`fetch_bytes` for the ``client`` argument.
95        """
96        return f"data:{self.content_type};base64,{self.fetch_base64(timeout=timeout, client=client)}"

Resolved reference to media stored in Langfuse.

LangfuseMediaReference( media_id: str, content_type: str, url: str, url_expiry: Optional[str] = None, content_length: Optional[int] = None, reference_string: Optional[str] = None)
media_id: str
content_type: str
url: str
url_expiry: Optional[str] = None
content_length: Optional[int] = None
reference_string: Optional[str] = None
def is_url_expired(self) -> bool:
35    def is_url_expired(self) -> bool:
36        """Return whether the signed URL is already expired."""
37        if self.url_expiry is None:
38            return False
39
40        expiry = self.url_expiry.replace("Z", "+00:00")
41
42        try:
43            expiry_datetime = datetime.fromisoformat(expiry)
44        except ValueError:
45            return False
46
47        if expiry_datetime.tzinfo is None:
48            expiry_datetime = expiry_datetime.replace(tzinfo=timezone.utc)
49
50        return expiry_datetime <= datetime.now(timezone.utc)

Return whether the signed URL is already expired.

def fetch_bytes( self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None) -> bytes:
52    def fetch_bytes(
53        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
54    ) -> bytes:
55        """Fetch the media content from the signed URL.
56
57        Args:
58            timeout: Request timeout in seconds.
59            client: Optional httpx client to use for the request. Pass this to
60                honor custom transport settings (proxy, CA bundle, mTLS) — in
61                particular when multiple Langfuse clients are configured, since
62                the SDK cannot otherwise tell which client produced this
63                reference. When omitted, the single configured client is used,
64                falling back to a default httpx client.
65        """
66        from langfuse._client.resource_manager import LangfuseResourceManager
67
68        httpx_client = client or LangfuseResourceManager.get_singleton_httpx_client()
69        response = (
70            httpx_client.get(self.url, timeout=timeout)
71            if httpx_client is not None
72            else httpx.get(self.url, timeout=timeout)
73        )
74        response.raise_for_status()
75
76        return response.content

Fetch the media content from the signed URL.

Arguments:
  • timeout: Request timeout in seconds.
  • client: Optional httpx client to use for the request. Pass this to honor custom transport settings (proxy, CA bundle, mTLS) — in particular when multiple Langfuse clients are configured, since the SDK cannot otherwise tell which client produced this reference. When omitted, the single configured client is used, falling back to a default httpx client.
def fetch_base64( self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None) -> str:
78    def fetch_base64(
79        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
80    ) -> str:
81        """Fetch media and return raw base64 without a data URI prefix.
82
83        See :meth:`fetch_bytes` for the ``client`` argument.
84        """
85        return base64.b64encode(
86            self.fetch_bytes(timeout=timeout, client=client)
87        ).decode()

Fetch media and return raw base64 without a data URI prefix.

See fetch_bytes() for the client argument.

def fetch_data_uri( self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None) -> str:
89    def fetch_data_uri(
90        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
91    ) -> str:
92        """Fetch media and return it as a data URI.
93
94        See :meth:`fetch_bytes` for the ``client`` argument.
95        """
96        return f"data:{self.content_type};base64,{self.fetch_base64(timeout=timeout, client=client)}"

Fetch media and return it as a data URI.

See fetch_bytes() for the client argument.

def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 65def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 66    """Get or create a Langfuse client instance.
 67
 68    Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups,
 69    providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
 70
 71    Behavior:
 72    - Single project: Returns existing client or creates new one
 73    - Multi-project: Requires public_key to return specific client
 74    - No public_key in multi-project: Returns disabled client to prevent data leakage
 75
 76    The function uses a singleton pattern per public_key to conserve resources and maintain state.
 77
 78    Args:
 79        public_key (Optional[str]): Project identifier
 80            - With key: Returns client for that project
 81            - Without key: Returns single client or disabled client if multiple exist
 82
 83    Returns:
 84        Langfuse: Client instance in one of three states:
 85            1. Client for specified public_key
 86            2. Default client for single-project setup
 87            3. Disabled client when multiple projects exist without key
 88
 89    Security:
 90        Disables tracing when multiple projects exist without explicit key to prevent
 91        cross-project data leakage. Multi-project setups are experimental.
 92
 93    Example:
 94        ```python
 95        # Single project
 96        client = get_client()  # Default client
 97
 98        # In multi-project usage:
 99        client_a = get_client(public_key="project_a_key")  # Returns project A's client
100        client_b = get_client(public_key="project_b_key")  # Returns project B's client
101
102        # Without specific key in multi-project setup:
103        client = get_client()  # Returns disabled client for safety
104        ```
105    """
106    with LangfuseResourceManager._lock:
107        active_instances = LangfuseResourceManager._instances
108
109        # If no explicit public_key provided, check execution context
110        if not public_key:
111            public_key = _current_public_key.get(None)
112
113        if not public_key:
114            if len(active_instances) == 0:
115                # No clients initialized yet, create default instance
116                return Langfuse()
117
118            if len(active_instances) == 1:
119                # Only one client exists, safe to use without specifying key
120                instance = list(active_instances.values())[0]
121
122                # Initialize with the credentials bound to the instance
123                # This is important if the original instance was instantiated
124                # via constructor arguments
125                return _create_client_from_instance(instance)
126
127            else:
128                # Multiple clients exist but no key specified - disable tracing
129                # to prevent cross-project data leakage
130                langfuse_logger.warning(
131                    "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
132                )
133                return Langfuse(
134                    tracing_enabled=False, public_key="fake", secret_key="fake"
135                )
136
137        else:
138            # Specific key provided, look up existing instance
139            target_instance: Optional[LangfuseResourceManager] = active_instances.get(
140                public_key, None
141            )
142
143            if target_instance is None:
144                # No instance found with this key - client not initialized properly
145                langfuse_logger.warning(
146                    f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function."
147                )
148                return Langfuse(
149                    tracing_enabled=False, public_key="fake", secret_key="fake"
150                )
151
152            # target_instance is guaranteed to be not None at this point
153            return _create_client_from_instance(target_instance, public_key)

Get or create a Langfuse client instance.

Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.

Behavior:

  • Single project: Returns existing client or creates new one
  • Multi-project: Requires public_key to return specific client
  • No public_key in multi-project: Returns disabled client to prevent data leakage

The function uses a singleton pattern per public_key to conserve resources and maintain state.

Arguments:
  • public_key (Optional[str]): Project identifier
    • With key: Returns client for that project
    • Without key: Returns single client or disabled client if multiple exist
Returns:

Langfuse: Client instance in one of three states:

  1. Client for specified public_key
  2. Default client for single-project setup
  3. Disabled client when multiple projects exist without key

Security:

Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.

Example:
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
def observe( func: Optional[~F] = None, *, name: Optional[str] = None, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], NoneType] = None, capture_input: Optional[bool] = None, capture_output: Optional[bool] = None, transform_to_string: Optional[Callable[[Iterable], str]] = None) -> Union[~F, Callable[[~F], ~F]]:
 88    def observe(
 89        self,
 90        func: Optional[F] = None,
 91        *,
 92        name: Optional[str] = None,
 93        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
 94        capture_input: Optional[bool] = None,
 95        capture_output: Optional[bool] = None,
 96        transform_to_string: Optional[Callable[[Iterable], str]] = None,
 97    ) -> Union[F, Callable[[F], F]]:
 98        """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
 99
100        This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates
101        spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator
102        intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
103
104        Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application,
105        enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
106
107        Args:
108            func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
109            name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
110            as_type (Optional[Literal]): Set the observation type. Supported values:
111                    "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail".
112                    Observation types are highlighted in the Langfuse UI for filtering and visualization.
113                    The types "generation" and "embedding" create a span on which additional attributes such as model metrics
114                    can be set.
115
116        Returns:
117            Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
118
119        Example:
120            For general function tracing with automatic naming:
121            ```python
122            @observe()
123            def process_user_request(user_id, query):
124                # Function is automatically traced with name "process_user_request"
125                return get_response(query)
126            ```
127
128            For language model generation tracking:
129            ```python
130            @observe(name="answer-generation", as_type="generation")
131            async def generate_answer(query):
132                # Creates a generation-type span with extended LLM metrics
133                response = await openai.chat.completions.create(
134                    model="gpt-4",
135                    messages=[{"role": "user", "content": query}]
136                )
137                return response.choices[0].message.content
138            ```
139
140            For trace context propagation between functions:
141            ```python
142            @observe()
143            def main_process():
144                # Parent span is created
145                return sub_process()  # Child span automatically connected to parent
146
147            @observe()
148            def sub_process():
149                # Automatically becomes a child span of main_process
150                return "result"
151            ```
152
153        Raises:
154            Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
155
156        Notes:
157            - The decorator preserves the original function's signature, docstring, and return type.
158            - Proper parent-child relationships between spans are automatically maintained.
159            - Special keyword arguments can be passed to control tracing:
160              - langfuse_trace_id: Explicitly set the trace ID for this function call
161              - langfuse_parent_observation_id: Explicitly set the parent span ID
162              - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
163            - For async functions, the decorator returns an async function wrapper.
164            - For sync functions, the decorator returns a synchronous wrapper.
165        """
166        valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
167        if as_type is not None and as_type not in valid_types:
168            logger.warning(
169                f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'."
170            )
171            as_type = "span"
172
173        function_io_capture_enabled = os.environ.get(
174            LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True"
175        ).lower() not in ("false", "0")
176
177        should_capture_input = (
178            capture_input if capture_input is not None else function_io_capture_enabled
179        )
180
181        should_capture_output = (
182            capture_output
183            if capture_output is not None
184            else function_io_capture_enabled
185        )
186
187        def decorator(func: F) -> F:
188            return (
189                self._async_observe(
190                    func,
191                    name=name,
192                    as_type=as_type,
193                    capture_input=should_capture_input,
194                    capture_output=should_capture_output,
195                    transform_to_string=transform_to_string,
196                )
197                if asyncio.iscoroutinefunction(func)
198                else self._sync_observe(
199                    func,
200                    name=name,
201                    as_type=as_type,
202                    capture_input=should_capture_input,
203                    capture_output=should_capture_output,
204                    transform_to_string=transform_to_string,
205                )
206            )
207
208        """Handle decorator with or without parentheses.
209
210        This logic enables the decorator to work both with and without parentheses:
211        - @observe - Python passes the function directly to the decorator
212        - @observe() - Python calls the decorator first, which must return a function decorator
213
214        When called without arguments (@observe), the func parameter contains the function to decorate,
215        so we directly apply the decorator to it. When called with parentheses (@observe()),
216        func is None, so we return the decorator function itself for Python to apply in the next step.
217        """
218        if func is None:
219            return decorator
220        else:
221            return decorator(func)

Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.

This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.

Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.

Arguments:
  • func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
  • name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
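  • as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
  • capture_input (Optional[bool]): Whether to capture the function's input. When None, falls back to the LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED environment setting, which enables capture unless it is set to "false" or "0" (case-insensitive).
  • capture_output (Optional[bool]): Whether to capture the function's output. When None, falls back to the same LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED environment setting as capture_input.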
Returns:

Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.

Example:

For general function tracing with automatic naming:

@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)

For language model generation tracking:

@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content

For trace context propagation between functions:

@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
Raises:
  • Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
  • The decorator preserves the original function's signature, docstring, and return type.
  • Proper parent-child relationships between spans are automatically maintained.
  • Special keyword arguments can be passed to control tracing:
    • langfuse_trace_id: Explicitly set the trace ID for this function call
    • langfuse_parent_observation_id: Explicitly set the parent span ID
    • langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
  • For async functions, the decorator returns an async function wrapper.
  • For sync functions, the decorator returns a synchronous wrapper.
def propagate_attributes( *, user_id: Optional[str] = None, session_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, version: Optional[str] = None, tags: Optional[List[str]] = None, trace_name: Optional[str] = None, environment: Optional[str] = None, as_baggage: bool = False) -> opentelemetry.util._decorator._AgnosticContextManager[typing.Any]:
 98def propagate_attributes(
 99    *,
100    user_id: Optional[str] = None,
101    session_id: Optional[str] = None,
102    metadata: Optional[Dict[str, Any]] = None,
103    version: Optional[str] = None,
104    tags: Optional[List[str]] = None,
105    trace_name: Optional[str] = None,
106    environment: Optional[str] = None,
107    as_baggage: bool = False,
108) -> _AgnosticContextManager[Any]:
109    """Propagate trace-level attributes to all spans created within this context.
110
111    This context manager sets attributes on the currently active span AND automatically
112    propagates them to all new child spans created within the context. This is the
113    recommended way to set trace-level attributes like user_id, session_id,
114    environment, and metadata dimensions that should be consistently applied across
115    all observations in a trace.
116
117    **IMPORTANT**: Call this as early as possible within your trace/workflow. Only the
118    currently active span and spans created after entering this context will have these
119    attributes. Pre-existing spans will NOT be retroactively updated.
120
121    **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id,
122    filtering by session_id) only include observations that have the attribute set.
123    If you call `propagate_attributes` late in your workflow, earlier spans won't be
124    included in aggregations for that attribute.
125
126    Args:
127        user_id: User identifier to associate with all spans in this context.
128            Must be US-ASCII string, ≤200 characters. Use this to track which user
129            generated each trace and enable e.g. per-user cost/performance analysis.
130        session_id: Session identifier to associate with all spans in this context.
131            Must be US-ASCII string, ≤200 characters. Use this to group related traces
132            within a user session (e.g., a conversation thread, multi-turn interaction).
133        metadata: Additional key-value metadata to propagate to all spans.
134            - Keys must be US-ASCII strings
135            - Values are coerced to strings
136            - Coerced values must be ≤200 characters
137            - Use for dimensions like internal correlating identifiers
138            - AVOID: large payloads or sensitive data
139        version: Version identifier for parts of your application that are independently versioned, e.g. agents
140        tags: List of tags to categorize the group of observations
141        trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters.
142            Use this to set a consistent trace name for all spans created within this context.
143        environment: Langfuse environment to assign to spans created in this context.
144            Must be a lowercase alphanumeric string with optional hyphens or underscores,
145            must be ≤40 characters, and must not start with "langfuse". This maps to
146            the first-class `langfuse.environment` attribute, not to trace metadata.
147            Use it for request-scoped environments, for example when one shared proxy
148            handles calls from dev, staging, qa, and prod. A propagated environment
149            takes precedence over the local client default configured via
150            `Langfuse(environment=...)` or `LANGFUSE_TRACING_ENVIRONMENT` for spans
151            created while this propagation context is active.
152        as_baggage: If True, propagates attributes using OpenTelemetry baggage for
153            cross-process/service propagation. **Security warning**: When enabled,
154            attribute values are added to HTTP headers on ALL outbound requests.
155            This includes `environment` as the `langfuse_environment` baggage entry.
156            Only enable if values are safe to transmit via HTTP headers and you need
157            cross-service tracing. Default: False.
158
159    Returns:
160        Context manager that propagates attributes to all child spans.
161
162    Example:
163        Basic usage with user and session tracking:
164
165        ```python
166        from langfuse import Langfuse
167
168        langfuse = Langfuse()
169
170        # Set attributes early in the trace
171        with langfuse.start_as_current_observation(name="user_workflow") as span:
172            with langfuse.propagate_attributes(
173                user_id="user_123",
174                session_id="session_abc",
175                environment="production",
176                metadata={"experiment": "variant_a"}
177            ):
178                # All spans created here will have user_id, session_id, environment, and metadata
179                with langfuse.start_observation(name="llm_call") as llm_span:
180                    # This span inherits user_id, session_id, environment, and experiment metadata
181                    ...
182
183                with langfuse.start_generation(name="completion") as gen:
184                    # This span also inherits all attributes
185                    ...
186        ```
187
188        Late propagation (anti-pattern):
189
190        ```python
191        with langfuse.start_as_current_observation(name="workflow") as span:
192            # These spans WON'T have user_id
193            early_span = langfuse.start_observation(name="early_work")
194            early_span.end()
195
196            # Set attributes in the middle
197            with langfuse.propagate_attributes(user_id="user_123"):
198                # Only spans created AFTER this point will have user_id
199                late_span = langfuse.start_observation(name="late_work")
200                late_span.end()
201
202            # Result: Aggregations by user_id will miss "early_work" span
203        ```
204
205        Cross-service propagation with baggage (advanced):
206
207        ```python
208        # Service A - originating service
209        with langfuse.start_as_current_observation(name="api_request"):
210            with langfuse.propagate_attributes(
211                user_id="user_123",
212                session_id="session_abc",
213                environment="staging",
214                as_baggage=True  # Propagate via HTTP headers
215            ):
216                # Make HTTP request to Service B
217                response = requests.get("https://service-b.example.com/api")
218                # user_id, session_id, and environment are now in HTTP headers
219
220        # Service B - downstream service
221        # OpenTelemetry will automatically extract baggage from HTTP headers
222        # and propagate attributes to spans in Service B. If Service B has a local
223        # Langfuse environment configured, the propagated environment wins for
224        # spans created within this context.
225        ```
226
227    Note:
228        - **Validation**: Attribute values (user_id, session_id, version, tags,
229          trace_name) must be strings ≤200 characters. Environment must also match
230          Langfuse's environment format: lowercase alphanumeric with optional
231          hyphens or underscores, must be ≤40 characters, and it must not start with "langfuse". Metadata
232          values are coerced to strings before the 200 character limit is applied.
233          Invalid values will be dropped with a warning logged.
234        - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood,
235          making it compatible with other OTel-instrumented libraries.
236
237    Raises:
238        No exceptions are raised. Invalid values are logged as warnings and dropped.
239    """
240    return _propagate_attributes(
241        user_id=user_id,
242        session_id=session_id,
243        metadata=metadata,
244        version=version,
245        tags=tags,
246        trace_name=trace_name,
247        environment=environment,
248        as_baggage=as_baggage,
249    )

Propagate trace-level attributes to all spans created within this context.

This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, environment, and metadata dimensions that should be consistently applied across all observations in a trace.

IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.

Why this matters: Langfuse aggregation queries (e.g., total cost by user_id, filtering by session_id) only include observations that have the attribute set. If you call propagate_attributes late in your workflow, earlier spans won't be included in aggregations for that attribute.

Arguments:
  • user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
  • session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
  • metadata: Additional key-value metadata to propagate to all spans.
    • Keys must be US-ASCII strings
    • Values are coerced to strings
    • Coerced values must be ≤200 characters
    • Use for dimensions like internal correlating identifiers
    • AVOID: large payloads or sensitive data
  • version: Version identifier for parts of your application that are independently versioned, e.g. agents
  • tags: List of tags to categorize the group of observations
  • trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
  • environment: Langfuse environment to assign to spans created in this context. Must be a lowercase alphanumeric string with optional hyphens or underscores, must be ≤40 characters, and must not start with "langfuse". This maps to the first-class langfuse.environment attribute, not to trace metadata. Use it for request-scoped environments, for example when one shared proxy handles calls from dev, staging, qa, and prod. A propagated environment takes precedence over the local client default configured via Langfuse(environment=...) or LANGFUSE_TRACING_ENVIRONMENT for spans created while this propagation context is active.
  • as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. This includes environment as the langfuse_environment baggage entry. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:

Context manager that propagates attributes to all child spans.

Example:

Basic usage with user and session tracking:

from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_observation(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        environment="production",
        metadata={"experiment": "variant_a"}
    ):
        # All spans created here will have user_id, session_id, environment, and metadata
        with langfuse.start_observation(name="llm_call") as llm_span:
            # This span inherits user_id, session_id, environment, and experiment metadata
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...

Late propagation (anti-pattern):

with langfuse.start_as_current_observation(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_observation(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_observation(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span

Cross-service propagation with baggage (advanced):

# Service A - originating service
with langfuse.start_as_current_observation(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        environment="staging",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id, session_id, and environment are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate attributes to spans in Service B. If Service B has a local
# Langfuse environment configured, the propagated environment wins for
# spans created within this context.
Note:
  • Validation: Attribute values (user_id, session_id, version, tags, trace_name) must be strings ≤200 characters. Environment must also match Langfuse's environment format: lowercase alphanumeric with optional hyphens or underscores, must be ≤40 characters, and it must not start with "langfuse". Metadata values are coerced to strings before the 200 character limit is applied. Invalid values will be dropped with a warning logged.
  • OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
  • No exceptions are raised. Invalid values are logged as warnings and dropped.
ObservationTypeLiteral = typing.Union[typing.Literal['generation', 'embedding'], typing.Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], typing.Literal['event']]
class LangfuseSpan(langfuse._client.span.LangfuseObservationWrapper):
1267class LangfuseSpan(LangfuseObservationWrapper):
1268    """Standard span implementation for general operations in Langfuse.
1269
1270    This class represents a general-purpose span that can be used to trace
1271    any operation in your application. It extends the base LangfuseObservationWrapper
1272    with specific methods for creating child spans, generations, and updating
1273    span-specific attributes. If possible, use a more specific type for
1274    better observability and insights.
1275    """
1276
1277    def __init__(
1278        self,
1279        *,
1280        otel_span: otel_trace_api.Span,
1281        langfuse_client: "Langfuse",
1282        input: Optional[Any] = None,
1283        output: Optional[Any] = None,
1284        metadata: Optional[Any] = None,
1285        environment: Optional[str] = None,
1286        release: Optional[str] = None,
1287        version: Optional[str] = None,
1288        level: Optional[SpanLevel] = None,
1289        status_message: Optional[str] = None,
1290    ):
1291        """Initialize a new LangfuseSpan.
1292
1293        Args:
1294            otel_span: The OpenTelemetry span to wrap
1295            langfuse_client: Reference to the parent Langfuse client
1296            input: Input data for the span (any JSON-serializable object)
1297            output: Output data from the span (any JSON-serializable object)
1298            metadata: Additional metadata to associate with the span
1299            environment: The tracing environment
1300            release: Release identifier for the application
1301            version: Version identifier for the code or component
1302            level: Importance level of the span (info, warning, error)
1303            status_message: Optional status message for the span
1304        """
1305        super().__init__(
1306            otel_span=otel_span,
1307            as_type="span",
1308            langfuse_client=langfuse_client,
1309            input=input,
1310            output=output,
1311            metadata=metadata,
1312            environment=environment,
1313            release=release,
1314            version=version,
1315            level=level,
1316            status_message=status_message,
1317        )

Standard span implementation for general operations in Langfuse.

This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.

LangfuseSpan( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1277    def __init__(
1278        self,
1279        *,
1280        otel_span: otel_trace_api.Span,
1281        langfuse_client: "Langfuse",
1282        input: Optional[Any] = None,
1283        output: Optional[Any] = None,
1284        metadata: Optional[Any] = None,
1285        environment: Optional[str] = None,
1286        release: Optional[str] = None,
1287        version: Optional[str] = None,
1288        level: Optional[SpanLevel] = None,
1289        status_message: Optional[str] = None,
1290    ):
1291        """Initialize a new LangfuseSpan.
1292
1293        Args:
1294            otel_span: The OpenTelemetry span to wrap
1295            langfuse_client: Reference to the parent Langfuse client
1296            input: Input data for the span (any JSON-serializable object)
1297            output: Output data from the span (any JSON-serializable object)
1298            metadata: Additional metadata to associate with the span
1299            environment: The tracing environment
1300            release: Release identifier for the application
1301            version: Version identifier for the code or component
1302            level: Importance level of the span (info, warning, error)
1303            status_message: Optional status message for the span
1304        """
1305        super().__init__(
1306            otel_span=otel_span,
1307            as_type="span",
1308            langfuse_client=langfuse_client,
1309            input=input,
1310            output=output,
1311            metadata=metadata,
1312            environment=environment,
1313            release=release,
1314            version=version,
1315            level=level,
1316            status_message=status_message,
1317        )

Initialize a new LangfuseSpan.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the span (any JSON-serializable object)
  • output: Output data from the span (any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the code or component
  • level: Importance level of the span (one of 'DEBUG', 'DEFAULT', 'WARNING', 'ERROR')
  • status_message: Optional status message for the span
class LangfuseGeneration(langfuse._client.span.LangfuseObservationWrapper):
1320class LangfuseGeneration(LangfuseObservationWrapper):
1321    """Specialized span implementation for AI model generations in Langfuse.
1322
1323    This class represents a generation span specifically designed for tracking
1324    AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized
1325    attributes for model details, token usage, and costs.
1326    """
1327
1328    def __init__(
1329        self,
1330        *,
1331        otel_span: otel_trace_api.Span,
1332        langfuse_client: "Langfuse",
1333        input: Optional[Any] = None,
1334        output: Optional[Any] = None,
1335        metadata: Optional[Any] = None,
1336        environment: Optional[str] = None,
1337        release: Optional[str] = None,
1338        version: Optional[str] = None,
1339        level: Optional[SpanLevel] = None,
1340        status_message: Optional[str] = None,
1341        completion_start_time: Optional[datetime] = None,
1342        model: Optional[str] = None,
1343        model_parameters: Optional[Dict[str, MapValue]] = None,
1344        usage_details: Optional[Dict[str, int]] = None,
1345        cost_details: Optional[Dict[str, float]] = None,
1346        prompt: Optional[PromptClient] = None,
1347    ):
1348        """Initialize a new LangfuseGeneration span.
1349
1350        Args:
1351            otel_span: The OpenTelemetry span to wrap
1352            langfuse_client: Reference to the parent Langfuse client
1353            input: Input data for the generation (e.g., prompts)
1354            output: Output from the generation (e.g., completions)
1355            metadata: Additional metadata to associate with the generation
1356            environment: The tracing environment
1357            release: Release identifier for the application
1358            version: Version identifier for the model or component
1359            level: Importance level of the generation (info, warning, error)
1360            status_message: Optional status message for the generation
1361            completion_start_time: When the model started generating the response
1362            model: Name/identifier of the AI model used (e.g., "gpt-4")
1363            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1364            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1365            cost_details: Cost information for the model call
1366            prompt: Associated prompt template from Langfuse prompt management
1367        """
1368        super().__init__(
1369            as_type="generation",
1370            otel_span=otel_span,
1371            langfuse_client=langfuse_client,
1372            input=input,
1373            output=output,
1374            metadata=metadata,
1375            environment=environment,
1376            release=release,
1377            version=version,
1378            level=level,
1379            status_message=status_message,
1380            completion_start_time=completion_start_time,
1381            model=model,
1382            model_parameters=model_parameters,
1383            usage_details=usage_details,
1384            cost_details=cost_details,
1385            prompt=prompt,
1386        )

Specialized span implementation for AI model generations in Langfuse.

This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.

LangfuseGeneration( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None)
1328    def __init__(
1329        self,
1330        *,
1331        otel_span: otel_trace_api.Span,
1332        langfuse_client: "Langfuse",
1333        input: Optional[Any] = None,
1334        output: Optional[Any] = None,
1335        metadata: Optional[Any] = None,
1336        environment: Optional[str] = None,
1337        release: Optional[str] = None,
1338        version: Optional[str] = None,
1339        level: Optional[SpanLevel] = None,
1340        status_message: Optional[str] = None,
1341        completion_start_time: Optional[datetime] = None,
1342        model: Optional[str] = None,
1343        model_parameters: Optional[Dict[str, MapValue]] = None,
1344        usage_details: Optional[Dict[str, int]] = None,
1345        cost_details: Optional[Dict[str, float]] = None,
1346        prompt: Optional[PromptClient] = None,
1347    ):
1348        """Initialize a new LangfuseGeneration span.
1349
1350        Args:
1351            otel_span: The OpenTelemetry span to wrap
1352            langfuse_client: Reference to the parent Langfuse client
1353            input: Input data for the generation (e.g., prompts)
1354            output: Output from the generation (e.g., completions)
1355            metadata: Additional metadata to associate with the generation
1356            environment: The tracing environment
1357            release: Release identifier for the application
1358            version: Version identifier for the model or component
1359            level: Importance level of the generation (info, warning, error)
1360            status_message: Optional status message for the generation
1361            completion_start_time: When the model started generating the response
1362            model: Name/identifier of the AI model used (e.g., "gpt-4")
1363            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1364            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1365            cost_details: Cost information for the model call
1366            prompt: Associated prompt template from Langfuse prompt management
1367        """
1368        super().__init__(
1369            as_type="generation",
1370            otel_span=otel_span,
1371            langfuse_client=langfuse_client,
1372            input=input,
1373            output=output,
1374            metadata=metadata,
1375            environment=environment,
1376            release=release,
1377            version=version,
1378            level=level,
1379            status_message=status_message,
1380            completion_start_time=completion_start_time,
1381            model=model,
1382            model_parameters=model_parameters,
1383            usage_details=usage_details,
1384            cost_details=cost_details,
1385            prompt=prompt,
1386        )

Initialize a new LangfuseGeneration span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the generation (e.g., prompts)
  • output: Output from the generation (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the model or component
  • level: Importance level of the generation (one of 'DEBUG', 'DEFAULT', 'WARNING', 'ERROR')
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
class LangfuseEvent(langfuse._client.span.LangfuseObservationWrapper):
1389class LangfuseEvent(LangfuseObservationWrapper):
1390    """Specialized span implementation for Langfuse Events."""
1391
1392    def __init__(
1393        self,
1394        *,
1395        otel_span: otel_trace_api.Span,
1396        langfuse_client: "Langfuse",
1397        input: Optional[Any] = None,
1398        output: Optional[Any] = None,
1399        metadata: Optional[Any] = None,
1400        environment: Optional[str] = None,
1401        release: Optional[str] = None,
1402        version: Optional[str] = None,
1403        level: Optional[SpanLevel] = None,
1404        status_message: Optional[str] = None,
1405    ):
1406        """Initialize a new LangfuseEvent span.
1407
1408        Args:
1409            otel_span: The OpenTelemetry span to wrap
1410            langfuse_client: Reference to the parent Langfuse client
1411            input: Input data for the event
1412            output: Output from the event
1413            metadata: Additional metadata to associate with the generation
1414            environment: The tracing environment
1415            release: Release identifier for the application
1416            version: Version identifier for the model or component
1417            level: Importance level of the generation (info, warning, error)
1418            status_message: Optional status message for the generation
1419        """
1420        super().__init__(
1421            otel_span=otel_span,
1422            as_type="event",
1423            langfuse_client=langfuse_client,
1424            input=input,
1425            output=output,
1426            metadata=metadata,
1427            environment=environment,
1428            release=release,
1429            version=version,
1430            level=level,
1431            status_message=status_message,
1432        )
1433
1434    def update(
1435        self,
1436        *,
1437        name: Optional[str] = None,
1438        input: Optional[Any] = None,
1439        output: Optional[Any] = None,
1440        metadata: Optional[Any] = None,
1441        version: Optional[str] = None,
1442        level: Optional[SpanLevel] = None,
1443        status_message: Optional[str] = None,
1444        completion_start_time: Optional[datetime] = None,
1445        model: Optional[str] = None,
1446        model_parameters: Optional[Dict[str, MapValue]] = None,
1447        usage_details: Optional[Dict[str, int]] = None,
1448        cost_details: Optional[Dict[str, float]] = None,
1449        prompt: Optional[PromptClient] = None,
1450        **kwargs: Any,
1451    ) -> "LangfuseEvent":
1452        """Update is not allowed for LangfuseEvent because events cannot be updated.
1453
1454        This method logs a warning and returns self without making changes.
1455
1456        Returns:
1457            self: Returns the unchanged LangfuseEvent instance
1458        """
1459        langfuse_logger.warning(
1460            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1461        )
1462        return self

Specialized span implementation for Langfuse Events.

LangfuseEvent( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1392    def __init__(
1393        self,
1394        *,
1395        otel_span: otel_trace_api.Span,
1396        langfuse_client: "Langfuse",
1397        input: Optional[Any] = None,
1398        output: Optional[Any] = None,
1399        metadata: Optional[Any] = None,
1400        environment: Optional[str] = None,
1401        release: Optional[str] = None,
1402        version: Optional[str] = None,
1403        level: Optional[SpanLevel] = None,
1404        status_message: Optional[str] = None,
1405    ):
1406        """Initialize a new LangfuseEvent span.
1407
1408        Args:
1409            otel_span: The OpenTelemetry span to wrap
1410            langfuse_client: Reference to the parent Langfuse client
1411            input: Input data for the event
1412            output: Output from the event
1413            metadata: Additional metadata to associate with the generation
1414            environment: The tracing environment
1415            release: Release identifier for the application
1416            version: Version identifier for the model or component
1417            level: Importance level of the generation (info, warning, error)
1418            status_message: Optional status message for the generation
1419        """
1420        super().__init__(
1421            otel_span=otel_span,
1422            as_type="event",
1423            langfuse_client=langfuse_client,
1424            input=input,
1425            output=output,
1426            metadata=metadata,
1427            environment=environment,
1428            release=release,
1429            version=version,
1430            level=level,
1431            status_message=status_message,
1432        )

Initialize a new LangfuseEvent span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the event
  • output: Output from the event
  • metadata: Additional metadata to associate with the event
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the model or component
  • level: Importance level of the event (one of 'DEBUG', 'DEFAULT', 'WARNING', 'ERROR')
  • status_message: Optional status message for the event
def update( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, **kwargs: Any) -> LangfuseEvent:
1434    def update(
1435        self,
1436        *,
1437        name: Optional[str] = None,
1438        input: Optional[Any] = None,
1439        output: Optional[Any] = None,
1440        metadata: Optional[Any] = None,
1441        version: Optional[str] = None,
1442        level: Optional[SpanLevel] = None,
1443        status_message: Optional[str] = None,
1444        completion_start_time: Optional[datetime] = None,
1445        model: Optional[str] = None,
1446        model_parameters: Optional[Dict[str, MapValue]] = None,
1447        usage_details: Optional[Dict[str, int]] = None,
1448        cost_details: Optional[Dict[str, float]] = None,
1449        prompt: Optional[PromptClient] = None,
1450        **kwargs: Any,
1451    ) -> "LangfuseEvent":
1452        """Update is not allowed for LangfuseEvent because events cannot be updated.
1453
1454        This method logs a warning and returns self without making changes.
1455
1456        Returns:
1457            self: Returns the unchanged LangfuseEvent instance
1458        """
1459        langfuse_logger.warning(
1460            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1461        )
1462        return self

Update is not allowed for LangfuseEvent because events cannot be updated.

This method logs a warning and returns self without making changes.

Returns:

self: Returns the unchanged LangfuseEvent instance

class LangfuseOtelSpanAttributes:
28class LangfuseOtelSpanAttributes:
29    # Langfuse-Trace attributes
30    TRACE_NAME = "langfuse.trace.name"
31    TRACE_USER_ID = "user.id"
32    TRACE_SESSION_ID = "session.id"
33    TRACE_TAGS = "langfuse.trace.tags"
34    TRACE_PUBLIC = "langfuse.trace.public"
35    TRACE_METADATA = "langfuse.trace.metadata"
36    TRACE_INPUT = "langfuse.trace.input"
37    TRACE_OUTPUT = "langfuse.trace.output"
38
39    # Langfuse-observation attributes
40    OBSERVATION_TYPE = "langfuse.observation.type"
41    OBSERVATION_METADATA = "langfuse.observation.metadata"
42    OBSERVATION_LEVEL = "langfuse.observation.level"
43    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
44    OBSERVATION_INPUT = "langfuse.observation.input"
45    OBSERVATION_OUTPUT = "langfuse.observation.output"
46
47    # Langfuse-observation of type Generation attributes
48    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
49    OBSERVATION_MODEL = "langfuse.observation.model.name"
50    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
51    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
52    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
53    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
54    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"
55
56    # General
57    ENVIRONMENT = "langfuse.environment"
58    RELEASE = "langfuse.release"
59    VERSION = "langfuse.version"
60
61    # Internal
62    AS_ROOT = "langfuse.internal.as_root"
63    IS_APP_ROOT = "langfuse.internal.is_app_root"
64
65    # Experiments
66    EXPERIMENT_ID = "langfuse.experiment.id"
67    EXPERIMENT_NAME = "langfuse.experiment.name"
68    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
69    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
70    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
71    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
72    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
73    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
74    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
TRACE_NAME = 'langfuse.trace.name'
TRACE_USER_ID = 'user.id'
TRACE_SESSION_ID = 'session.id'
TRACE_TAGS = 'langfuse.trace.tags'
TRACE_PUBLIC = 'langfuse.trace.public'
TRACE_METADATA = 'langfuse.trace.metadata'
TRACE_INPUT = 'langfuse.trace.input'
TRACE_OUTPUT = 'langfuse.trace.output'
OBSERVATION_TYPE = 'langfuse.observation.type'
OBSERVATION_METADATA = 'langfuse.observation.metadata'
OBSERVATION_LEVEL = 'langfuse.observation.level'
OBSERVATION_STATUS_MESSAGE = 'langfuse.observation.status_message'
OBSERVATION_INPUT = 'langfuse.observation.input'
OBSERVATION_OUTPUT = 'langfuse.observation.output'
OBSERVATION_COMPLETION_START_TIME = 'langfuse.observation.completion_start_time'
OBSERVATION_MODEL = 'langfuse.observation.model.name'
OBSERVATION_MODEL_PARAMETERS = 'langfuse.observation.model.parameters'
OBSERVATION_USAGE_DETAILS = 'langfuse.observation.usage_details'
OBSERVATION_COST_DETAILS = 'langfuse.observation.cost_details'
OBSERVATION_PROMPT_NAME = 'langfuse.observation.prompt.name'
OBSERVATION_PROMPT_VERSION = 'langfuse.observation.prompt.version'
ENVIRONMENT = 'langfuse.environment'
RELEASE = 'langfuse.release'
VERSION = 'langfuse.version'
AS_ROOT = 'langfuse.internal.as_root'
IS_APP_ROOT = 'langfuse.internal.is_app_root'
EXPERIMENT_ID = 'langfuse.experiment.id'
EXPERIMENT_NAME = 'langfuse.experiment.name'
EXPERIMENT_DESCRIPTION = 'langfuse.experiment.description'
EXPERIMENT_METADATA = 'langfuse.experiment.metadata'
EXPERIMENT_DATASET_ID = 'langfuse.experiment.dataset.id'
EXPERIMENT_ITEM_ID = 'langfuse.experiment.item.id'
EXPERIMENT_ITEM_EXPECTED_OUTPUT = 'langfuse.experiment.item.expected_output'
EXPERIMENT_ITEM_METADATA = 'langfuse.experiment.item.metadata'
EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = 'langfuse.experiment.item.root_observation_id'
class LangfuseAgent(langfuse._client.span.LangfuseObservationWrapper):
1465class LangfuseAgent(LangfuseObservationWrapper):
1466    """Agent observation for reasoning blocks that act on tools using LLM guidance."""
1467
1468    def __init__(self, **kwargs: Any) -> None:
1469        """Initialize a new LangfuseAgent span."""
1470        kwargs["as_type"] = "agent"
1471        super().__init__(**kwargs)

Agent observation for reasoning blocks that act on tools using LLM guidance.

LangfuseAgent(**kwargs: Any)
1468    def __init__(self, **kwargs: Any) -> None:
1469        """Initialize a new LangfuseAgent span."""
1470        kwargs["as_type"] = "agent"
1471        super().__init__(**kwargs)

Initialize a new LangfuseAgent span.

class LangfuseTool(langfuse._client.span.LangfuseObservationWrapper):
1474class LangfuseTool(LangfuseObservationWrapper):
1475    """Tool observation representing external tool calls, e.g., calling a weather API."""
1476
1477    def __init__(self, **kwargs: Any) -> None:
1478        """Initialize a new LangfuseTool span."""
1479        kwargs["as_type"] = "tool"
1480        super().__init__(**kwargs)

Tool observation representing external tool calls, e.g., calling a weather API.

LangfuseTool(**kwargs: Any)
1477    def __init__(self, **kwargs: Any) -> None:
1478        """Initialize a new LangfuseTool span."""
1479        kwargs["as_type"] = "tool"
1480        super().__init__(**kwargs)

Initialize a new LangfuseTool span.

class LangfuseChain(langfuse._client.span.LangfuseObservationWrapper):
1483class LangfuseChain(LangfuseObservationWrapper):
1484    """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM."""
1485
1486    def __init__(self, **kwargs: Any) -> None:
1487        """Initialize a new LangfuseChain span."""
1488        kwargs["as_type"] = "chain"
1489        super().__init__(**kwargs)

Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.

LangfuseChain(**kwargs: Any)
1486    def __init__(self, **kwargs: Any) -> None:
1487        """Initialize a new LangfuseChain span."""
1488        kwargs["as_type"] = "chain"
1489        super().__init__(**kwargs)

Initialize a new LangfuseChain span.

class LangfuseEmbedding(langfuse._client.span.LangfuseObservationWrapper):
1501class LangfuseEmbedding(LangfuseObservationWrapper):
1502    """Embedding observation for LLM embedding calls, typically used before retrieval."""
1503
1504    def __init__(self, **kwargs: Any) -> None:
1505        """Initialize a new LangfuseEmbedding span."""
1506        kwargs["as_type"] = "embedding"
1507        super().__init__(**kwargs)

Embedding observation for LLM embedding calls, typically used before retrieval.

LangfuseEmbedding(**kwargs: Any)
1504    def __init__(self, **kwargs: Any) -> None:
1505        """Initialize a new LangfuseEmbedding span."""
1506        kwargs["as_type"] = "embedding"
1507        super().__init__(**kwargs)

Initialize a new LangfuseEmbedding span.

class LangfuseEvaluator(langfuse._client.span.LangfuseObservationWrapper):
1510class LangfuseEvaluator(LangfuseObservationWrapper):
1511    """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs."""
1512
1513    def __init__(self, **kwargs: Any) -> None:
1514        """Initialize a new LangfuseEvaluator span."""
1515        kwargs["as_type"] = "evaluator"
1516        super().__init__(**kwargs)

Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.

LangfuseEvaluator(**kwargs: Any)
1513    def __init__(self, **kwargs: Any) -> None:
1514        """Initialize a new LangfuseEvaluator span."""
1515        kwargs["as_type"] = "evaluator"
1516        super().__init__(**kwargs)

Initialize a new LangfuseEvaluator span.

class LangfuseRetriever(langfuse._client.span.LangfuseObservationWrapper):
1492class LangfuseRetriever(LangfuseObservationWrapper):
1493    """Retriever observation for data retrieval steps, e.g. vector store or database queries."""
1494
1495    def __init__(self, **kwargs: Any) -> None:
1496        """Initialize a new LangfuseRetriever span."""
1497        kwargs["as_type"] = "retriever"
1498        super().__init__(**kwargs)

Retriever observation for data retrieval steps, e.g. vector store or database queries.

LangfuseRetriever(**kwargs: Any)
1495    def __init__(self, **kwargs: Any) -> None:
1496        """Initialize a new LangfuseRetriever span."""
1497        kwargs["as_type"] = "retriever"
1498        super().__init__(**kwargs)

Initialize a new LangfuseRetriever span.

class LangfuseGuardrail(langfuse._client.span.LangfuseObservationWrapper):
1519class LangfuseGuardrail(LangfuseObservationWrapper):
1520    """Guardrail observation for protection e.g. against jailbreaks or offensive content."""
1521
1522    def __init__(self, **kwargs: Any) -> None:
1523        """Initialize a new LangfuseGuardrail span."""
1524        kwargs["as_type"] = "guardrail"
1525        super().__init__(**kwargs)

Guardrail observation for protection, e.g. against jailbreaks or offensive content.

LangfuseGuardrail(**kwargs: Any)
1522    def __init__(self, **kwargs: Any) -> None:
1523        """Initialize a new LangfuseGuardrail span."""
1524        kwargs["as_type"] = "guardrail"
1525        super().__init__(**kwargs)

Initialize a new LangfuseGuardrail span.

class Evaluation:
101class Evaluation:
102    """Represents an evaluation result for an experiment item or an entire experiment run.
103
104    This class provides a strongly-typed way to create evaluation results in evaluator functions.
105    Users must use keyword arguments when instantiating this class.
106
107    Attributes:
108        name: Unique identifier for the evaluation metric. Should be descriptive
109            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
110            Used for aggregation and comparison across experiment runs.
111        value: The evaluation score or result. Can be:
112            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
113            - String: For categorical results like "positive", "negative", "neutral"
114            - Boolean: For binary assessments like "passes_safety_check"
115        comment: Optional human-readable explanation of the evaluation result.
116            Useful for providing context, explaining scoring rationale, or noting
117            special conditions. Displayed in Langfuse UI for interpretability.
118        metadata: Optional structured metadata about the evaluation process.
119            Can include confidence scores, intermediate calculations, model versions,
120            or any other relevant technical details.
121        data_type: Optional score data type. Required if value is not NUMERIC.
122            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
123        config_id: Optional Langfuse score config ID.
124
125    Examples:
126        Basic accuracy evaluation:
127        ```python
128        from langfuse import Evaluation
129
130        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
131            if not expected_output:
132                return Evaluation(name="accuracy", value=0, comment="No expected output")
133
134            is_correct = output.strip().lower() == expected_output.strip().lower()
135            return Evaluation(
136                name="accuracy",
137                value=1.0 if is_correct else 0.0,
138                comment="Correct answer" if is_correct else "Incorrect answer"
139            )
140        ```
141
142        Multi-metric evaluator:
143        ```python
144        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
145            return [
146                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
147                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
148                Evaluation(
149                    name="quality",
150                    value=0.85,
151                    comment="High quality response",
152                    metadata={"confidence": 0.92, "model": "gpt-4"}
153                )
154            ]
155        ```
156
157        Categorical evaluation:
158        ```python
159        def sentiment_evaluator(*, input, output, **kwargs):
160            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
161            return Evaluation(
162                name="sentiment",
163                value=sentiment,
164                comment=f"Response expresses {sentiment} sentiment",
165                data_type="CATEGORICAL"
166            )
167        ```
168
169        Failed evaluation with error handling:
170        ```python
171        def external_api_evaluator(*, input, output, **kwargs):
172            try:
173                score = external_api.evaluate(output)
174                return Evaluation(name="external_score", value=score)
175            except Exception as e:
176                return Evaluation(
177                    name="external_score",
178                    value=0,
179                    comment=f"API unavailable: {e}",
180                    metadata={"error": str(e), "retry_count": 3}
181                )
182        ```
183
184    Note:
185        All arguments must be passed as keywords. Positional arguments are not allowed
186        to ensure code clarity and prevent errors from argument reordering.
187    """
188
189    def __init__(
190        self,
191        *,
192        name: str,
193        value: Union[int, float, str, bool],
194        comment: Optional[str] = None,
195        metadata: Optional[Dict[str, Any]] = None,
196        data_type: Optional[ExperimentScoreType] = None,
197        config_id: Optional[str] = None,
198    ):
199        """Initialize an Evaluation with the provided data.
200
201        Args:
202            name: Unique identifier for the evaluation metric.
203            value: The evaluation score or result.
204            comment: Optional human-readable explanation of the result.
205            metadata: Optional structured metadata about the evaluation process.
206            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
207            config_id: Optional Langfuse score config ID.
208
209        Note:
210            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
211        """
212        self.name = name
213        self.value = value
214        self.comment = comment
215        self.metadata = metadata
216        self.data_type = data_type
217        self.config_id = config_id

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in the Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type: one of NUMERIC, CATEGORICAL, or BOOLEAN. Required if value is not numeric (i.e. it is a string or a boolean). Defaults to NUMERIC.
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=0, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=0,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
Note:

All arguments must be passed as keywords. Positional arguments are rejected; this keeps call sites readable and prevents errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, config_id: Optional[str] = None)
189    def __init__(
190        self,
191        *,
192        name: str,
193        value: Union[int, float, str, bool],
194        comment: Optional[str] = None,
195        metadata: Optional[Dict[str, Any]] = None,
196        data_type: Optional[ExperimentScoreType] = None,
197        config_id: Optional[str] = None,
198    ):
199        """Initialize an Evaluation with the provided data.
200
201        Args:
202            name: Unique identifier for the evaluation metric.
203            value: The evaluation score or result.
204            comment: Optional human-readable explanation of the result.
205            metadata: Optional structured metadata about the evaluation process.
206            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
207            config_id: Optional Langfuse score config ID.
208
209        Note:
210            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
211        """
212        self.name = name
213        self.value = value
214        self.comment = comment
215        self.metadata = metadata
216        self.data_type = data_type
217        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
class EvaluatorInputs:
 38class EvaluatorInputs:
 39    """Input data structure for evaluators, returned by mapper functions.
 40
 41    This class provides a strongly-typed container for transforming API response
 42    objects (traces, observations) into the standardized format expected
 43    by evaluator functions. It ensures consistent access to input, output, expected
 44    output, and metadata regardless of the source entity type.
 45
 46    Attributes:
 47        input: The input data that was provided to generate the output being evaluated.
 48            For traces, this might be the initial prompt or request. For observations,
 49            this could be the span's input. The exact meaning depends on your use case.
 50        output: The actual output that was produced and needs to be evaluated.
 51            For traces, this is typically the final response. For observations,
 52            this might be the generation output or span result.
 53        expected_output: Optional ground truth or expected result for comparison.
 54            Used by evaluators to assess correctness. May be None if no ground truth
 55            is available for the entity being evaluated.
 56        metadata: Optional structured metadata providing additional context for evaluation.
 57            Can include information about the entity, execution context, user attributes,
 58            or any other relevant data that evaluators might use.
 59
 60    Examples:
 61        Simple mapper for traces:
 62        ```python
 63        from langfuse import EvaluatorInputs
 64
 65        def trace_mapper(trace):
 66            return EvaluatorInputs(
 67                input=trace.input,
 68                output=trace.output,
 69                expected_output=None,  # No ground truth available
 70                metadata={"user_id": trace.user_id, "tags": trace.tags}
 71            )
 72        ```
 73
 74        Mapper for observations extracting specific fields:
 75        ```python
 76        def observation_mapper(observation):
 77            # Extract input/output from observation's data
 78            input_data = observation.input if hasattr(observation, 'input') else None
 79            output_data = observation.output if hasattr(observation, 'output') else None
 80
 81            return EvaluatorInputs(
 82                input=input_data,
 83                output=output_data,
 84                expected_output=None,
 85                metadata={
 86                    "observation_type": observation.type,
 87                    "model": observation.model,
 88                    "latency_ms": observation.end_time - observation.start_time
 89                }
 90            )
 91        ```
 92
 93
 94    Note:
 95        All arguments must be passed as keywords when instantiating this class.
 96    """
 97
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Input data structure for evaluators, returned by mapper functions.

This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.

Attributes:
  • input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
  • output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
  • expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
  • metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:

Simple mapper for traces:

from langfuse import EvaluatorInputs

def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,  # No ground truth available
        metadata={"user_id": trace.user_id, "tags": trace.tags}
    )

Mapper for observations extracting specific fields:

def observation_mapper(observation):
    # Extract input/output from observation's data
    input_data = observation.input if hasattr(observation, 'input') else None
    output_data = observation.output if hasattr(observation, 'output') else None

    return EvaluatorInputs(
        input=input_data,
        output=output_data,
        expected_output=None,
        metadata={
            "observation_type": observation.type,
            "model": observation.model,
            "latency_ms": observation.end_time - observation.start_time
        }
    )

Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorInputs( *, input: Any, output: Any, expected_output: Any = None, metadata: Optional[Dict[str, Any]] = None)
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Initialize EvaluatorInputs with the provided data.

Arguments:
  • input: The input data for evaluation.
  • output: The output data to be evaluated.
  • expected_output: Optional ground truth for comparison.
  • metadata: Optional additional context for evaluation.
Note:

All arguments must be provided as keywords.

input
output
expected_output
metadata
class MapperFunction(typing.Protocol):
123class MapperFunction(Protocol):
124    """Protocol defining the interface for mapper functions in batch evaluation.
125
126    Mapper functions transform API response objects (traces or observations)
127    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
128    allows you to define how to extract and structure evaluation data from different
129    entity types.
130
131    Mapper functions must:
132    - Accept a single item parameter (trace, observation)
133    - Return an EvaluatorInputs instance with input, output, expected_output, metadata
134    - Can be either synchronous or asynchronous
135    - Should handle missing or malformed data gracefully
136    """
137
138    def __call__(
139        self,
140        *,
141        item: Union["TraceWithFullDetails", "ObservationsView"],
142        **kwargs: Dict[str, Any],
143    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
144        """Transform an API response object into evaluator inputs.
145
146        This method defines how to extract evaluation-relevant data from the raw
147        API response object. The implementation should map entity-specific fields
148        to the standardized input/output/expected_output/metadata structure.
149
150        Args:
151            item: The API response object to transform. The type depends on the scope:
152                - TraceWithFullDetails: When evaluating traces
153                - ObservationsView: When evaluating observations
154
155        Returns:
156            EvaluatorInputs: A structured container with:
157                - input: The input data that generated the output
158                - output: The output to be evaluated
159                - expected_output: Optional ground truth for comparison
160                - metadata: Optional additional context
161
162            Can return either a direct EvaluatorInputs instance or an awaitable
163            (for async mappers that need to fetch additional data).
164
165        Examples:
166            Basic trace mapper:
167            ```python
168            def map_trace(trace):
169                return EvaluatorInputs(
170                    input=trace.input,
171                    output=trace.output,
172                    expected_output=None,
173                    metadata={"trace_id": trace.id, "user": trace.user_id}
174                )
175            ```
176
177            Observation mapper with conditional logic:
178            ```python
179            def map_observation(observation):
180                # Extract fields based on observation type
181                if observation.type == "GENERATION":
182                    input_data = observation.input
183                    output_data = observation.output
184                else:
185                    # For other types, use different fields
186                    input_data = observation.metadata.get("input")
187                    output_data = observation.metadata.get("output")
188
189                return EvaluatorInputs(
190                    input=input_data,
191                    output=output_data,
192                    expected_output=None,
193                    metadata={"obs_id": observation.id, "type": observation.type}
194                )
195            ```
196
197            Async mapper (if additional processing needed):
198            ```python
199            async def map_trace_async(trace):
200                # Could do async processing here if needed
201                processed_output = await some_async_transformation(trace.output)
202
203                return EvaluatorInputs(
204                    input=trace.input,
205                    output=processed_output,
206                    expected_output=None,
207                    metadata={"trace_id": trace.id}
208                )
209            ```
210        """
211        ...

Protocol defining the interface for mapper functions in batch evaluation.

Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.

Mapper functions:

  • Must accept the entity to map as a keyword-only argument named item (a trace or an observation)
  • Must return an EvaluatorInputs instance with input, output, expected_output, metadata
  • Can be either synchronous or asynchronous
  • Should handle missing or malformed data gracefully
MapperFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class CompositeEvaluatorFunction(typing.Protocol):
214class CompositeEvaluatorFunction(Protocol):
215    """Protocol defining the interface for composite evaluator functions.
216
217    Composite evaluators create aggregate scores from multiple item-level evaluations.
218    This is commonly used to compute weighted averages, combined metrics, or other
219    composite assessments based on individual evaluation results.
220
221    Composite evaluators:
222    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
223      plus the list of evaluations
224    - Return either a single Evaluation, a list of Evaluations, or a dict
225    - Can be either synchronous or asynchronous
226    - Have access to both raw item data and evaluation results
227    """
228
229    def __call__(
230        self,
231        *,
232        input: Optional[Any] = None,
233        output: Optional[Any] = None,
234        expected_output: Optional[Any] = None,
235        metadata: Optional[Dict[str, Any]] = None,
236        evaluations: List[Evaluation],
237        **kwargs: Dict[str, Any],
238    ) -> Union[
239        Evaluation,
240        List[Evaluation],
241        Dict[str, Any],
242        Awaitable[Evaluation],
243        Awaitable[List[Evaluation]],
244        Awaitable[Dict[str, Any]],
245    ]:
246        r"""Create a composite evaluation from item-level evaluation results.
247
248        This method combines multiple evaluation scores into a single composite metric.
249        Common use cases include weighted averages, pass/fail decisions based on multiple
250        criteria, or custom scoring logic that considers multiple dimensions.
251
252        Args:
253            input: The input data that was provided to the system being evaluated.
254            output: The output generated by the system being evaluated.
255            expected_output: The expected/reference output for comparison (if available).
256            metadata: Additional metadata about the evaluation context.
257            evaluations: List of evaluation results from item-level evaluators.
258                Each evaluation contains name, value, comment, and metadata.
259
260        Returns:
261            Can return any of:
262            - Evaluation: A single composite evaluation result
263            - List[Evaluation]: Multiple composite evaluations
264            - Dict: A dict that will be converted to an Evaluation
265                - name: Identifier for the composite metric (e.g., "composite_score")
266                - value: The computed composite value
267                - comment: Optional explanation of how the score was computed
268                - metadata: Optional details about the composition logic
269
270            Can return either a direct Evaluation instance or an awaitable
271            (for async composite evaluators).
272
273        Examples:
274            Simple weighted average:
275            ```python
276            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
277                weights = {
278                    "accuracy": 0.5,
279                    "relevance": 0.3,
280                    "safety": 0.2
281                }
282
283                total_score = 0.0
284                total_weight = 0.0
285
286                for eval in evaluations:
287                    if eval.name in weights and isinstance(eval.value, (int, float)):
288                        total_score += eval.value * weights[eval.name]
289                        total_weight += weights[eval.name]
290
291                final_score = total_score / total_weight if total_weight > 0 else 0.0
292
293                return Evaluation(
294                    name="composite_score",
295                    value=final_score,
296                    comment=f"Weighted average of {len(evaluations)} metrics"
297                )
298            ```
299
300            Pass/fail composite based on thresholds:
301            ```python
302            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
303                # Must pass all criteria
304                thresholds = {
305                    "accuracy": 0.7,
306                    "safety": 0.9,
307                    "relevance": 0.6
308                }
309
310                passes = True
311                failing_metrics = []
312
313                for metric, threshold in thresholds.items():
314                    eval_result = next((e for e in evaluations if e.name == metric), None)
315                    if eval_result and isinstance(eval_result.value, (int, float)):
316                        if eval_result.value < threshold:
317                            passes = False
318                            failing_metrics.append(metric)
319
320                return Evaluation(
321                    name="passes_all_checks",
322                    value=passes,
323                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
324                    data_type="BOOLEAN"
325                )
326            ```
327
328            Async composite with external scoring:
329            ```python
330            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
331                # Use LLM to synthesize multiple evaluation results
332                eval_summary = "\n".join(
333                    f"- {e.name}: {e.value}" for e in evaluations
334                )
335
336                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
337                prompt += f"For the output: {output}\n"
338                prompt += "Provide an overall quality score from 0-1."
339
340                response = await openai.chat.completions.create(
341                    model="gpt-4",
342                    messages=[{"role": "user", "content": prompt}]
343                )
344
345                score = float(response.choices[0].message.content.strip())
346
347                return Evaluation(
348                    name="llm_composite_score",
349                    value=score,
350                    comment="LLM-synthesized composite score"
351                )
352            ```
353
354            Context-aware composite:
355            ```python
356            def context_composite(*, input, output, expected_output, metadata, evaluations):
357                # Adjust weighting based on metadata
358                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}
359
360                # If metadata indicates high importance, prioritize accuracy
361                if metadata and metadata.get('importance') == 'high':
362                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
363                else:
364                    weights = base_weights
365
366                total = sum(
367                    e.value * weights.get(e.name, 0)
368                    for e in evaluations
369                    if isinstance(e.value, (int, float))
370                )
371
372                return Evaluation(
373                    name="weighted_composite",
374                    value=total,
375                    comment="Context-aware weighted composite"
376                )
377            ```
378        """
379        ...

Protocol defining the interface for composite evaluator functions.

Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.

Composite evaluators:

  • Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
  • Return either a single Evaluation, a list of Evaluations, or a dict
  • Can be either synchronous or asynchronous
  • Have access to both raw item data and evaluation results
CompositeEvaluatorFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class EvaluatorStats:
382class EvaluatorStats:
383    """Statistics for a single evaluator's performance during batch evaluation.
384
385    This class tracks detailed metrics about how a specific evaluator performed
386    across all items in a batch evaluation run. It helps identify evaluator issues,
387    understand reliability, and optimize evaluation pipelines.
388
389    Attributes:
390        name: The name of the evaluator function (extracted from __name__).
391        total_runs: Total number of times the evaluator was invoked.
392        successful_runs: Number of times the evaluator completed successfully.
393        failed_runs: Number of times the evaluator raised an exception or failed.
394        total_scores_created: Total number of evaluation scores created by this evaluator.
395            Can be higher than successful_runs if the evaluator returns multiple scores.
396
397    Examples:
398        Accessing evaluator stats from batch evaluation result:
399        ```python
400        result = client.run_batched_evaluation(...)
401
402        for stats in result.evaluator_stats:
403            print(f"Evaluator: {stats.name}")
404            print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
405            print(f"  Scores created: {stats.total_scores_created}")
406
407            if stats.failed_runs > 0:
408                print(f"  ⚠️  Failed {stats.failed_runs} times")
409        ```
410
411        Identifying problematic evaluators:
412        ```python
413        result = client.run_batched_evaluation(...)
414
415        # Find evaluators with high failure rates
416        for stats in result.evaluator_stats:
417            failure_rate = stats.failed_runs / stats.total_runs
418            if failure_rate > 0.1:  # More than 10% failures
419                print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
420                print(f"    Consider debugging or removing this evaluator")
421        ```
422
423    Note:
424        All arguments must be passed as keywords when instantiating this class.
425    """
426
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Statistics for a single evaluator's performance during batch evaluation.

This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.

Attributes:
  • name: The name of the evaluator function (extracted from __name__).
  • total_runs: Total number of times the evaluator was invoked.
  • successful_runs: Number of times the evaluator completed successfully.
  • failed_runs: Number of times the evaluator raised an exception or failed.
  • total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:

Accessing evaluator stats from batch evaluation result (total_runs defaults to 0, so guard the success-rate division if an evaluator may never have run):

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    print(f"Evaluator: {stats.name}")
    print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️  Failed {stats.failed_runs} times")

Identifying problematic evaluators:

result = client.run_batched_evaluation(...)

# Find evaluators with high failure rates
for stats in result.evaluator_stats:
    failure_rate = stats.failed_runs / stats.total_runs
    if failure_rate > 0.1:  # More than 10% failures
        print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
        print(f"    Consider debugging or removing this evaluator")
Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorStats( *, name: str, total_runs: int = 0, successful_runs: int = 0, failed_runs: int = 0, total_scores_created: int = 0)
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Initialize EvaluatorStats with the provided metrics.

Arguments:
  • name: The evaluator function name.
  • total_runs: Total number of evaluator invocations.
  • successful_runs: Number of successful completions.
  • failed_runs: Number of failures.
  • total_scores_created: Total scores created by this evaluator.
Note:

All arguments must be provided as keywords.

name
total_runs
successful_runs
failed_runs
total_scores_created
class BatchEvaluationResumeToken:
455class BatchEvaluationResumeToken:
456    """Token for resuming a failed batch evaluation run.
457
458    This class encapsulates all the information needed to resume a batch evaluation
459    that was interrupted or failed partway through. It uses timestamp-based filtering
460    to avoid re-processing items that were already evaluated, even if the underlying
461    dataset changed between runs.
462
463    Attributes:
464        scope: The type of items being evaluated ("traces", "observations").
465        filter: The original JSON filter string used to query items.
466        last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item.
467            Used to construct a filter that only fetches items after this timestamp.
468        last_processed_id: The ID of the last successfully processed item, for reference.
469        items_processed: Count of items successfully processed before interruption.
470
471    Examples:
472        Resuming a failed batch evaluation:
473        ```python
474        # Initial run that fails partway through
475        try:
476            result = client.run_batched_evaluation(
477                scope="traces",
478                mapper=my_mapper,
479                evaluators=[evaluator1, evaluator2],
480                filter='{"tags": ["production"]}',
481                max_items=10000
482            )
483        except Exception as e:
484            print(f"Evaluation failed: {e}")
485
486            # Save the resume token
487            if result.resume_token:
488                # Store resume token for later (e.g., in a file or database)
489                import json
490                with open("resume_token.json", "w") as f:
491                    json.dump({
492                        "scope": result.resume_token.scope,
493                        "filter": result.resume_token.filter,
494                        "last_timestamp": result.resume_token.last_processed_timestamp,
495                        "last_id": result.resume_token.last_processed_id,
496                        "items_done": result.resume_token.items_processed
497                    }, f)
498
499        # Later, resume from where it left off
500        with open("resume_token.json") as f:
501            token_data = json.load(f)
502
503        resume_token = BatchEvaluationResumeToken(
504            scope=token_data["scope"],
505            filter=token_data["filter"],
506            last_processed_timestamp=token_data["last_timestamp"],
507            last_processed_id=token_data["last_id"],
508            items_processed=token_data["items_done"]
509        )
510
511        # Resume the evaluation
512        result = client.run_batched_evaluation(
513            scope="traces",
514            mapper=my_mapper,
515            evaluators=[evaluator1, evaluator2],
516            resume_from=resume_token
517        )
518
519        print(f"Processed {result.total_items_processed} additional items")
520        ```
521
522        Handling partial completion:
523        ```python
524        result = client.run_batched_evaluation(...)
525
526        if not result.completed:
527            print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
528            print(f"Last item: {result.resume_token.last_processed_id}")
529            print(f"Resume from: {result.resume_token.last_processed_timestamp}")
530
531            # Optionally retry automatically
532            if result.resume_token:
533                print("Retrying...")
534                result = client.run_batched_evaluation(
535                    scope=result.resume_token.scope,
536                    mapper=my_mapper,
537                    evaluators=my_evaluators,
538                    resume_from=result.resume_token
539                )
540        ```
541
542    Note:
543        All arguments must be passed as keywords when instantiating this class.
544        Because resuming only fetches items after last_processed_timestamp, items at
545        or before that timestamp are skipped, even if they were added after the
546        initial run started. This is intentional: it avoids duplicates and keeps evaluation consistent.
547    """
548
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Token for resuming a failed batch evaluation run.

This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.

Attributes:
  • scope: The type of items being evaluated ("traces", "observations").
  • filter: The original JSON filter string used to query items.
  • last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
  • last_processed_id: The ID of the last successfully processed item, for reference.
  • items_processed: Count of items successfully processed before interruption.
Examples:

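Resuming a failed batch evaluation. Note that in the snippet below `result` is assigned only if `run_batched_evaluation` returns; if the call itself raises, the `except` branch can read `result.resume_token` only when `result` was already bound earlier: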

# Initial run that fails partway through
try:
    result = client.run_batched_evaluation(
        scope="traces",
        mapper=my_mapper,
        evaluators=[evaluator1, evaluator2],
        filter='{"tags": ["production"]}',
        max_items=10000
    )
except Exception as e:
    print(f"Evaluation failed: {e}")

    # Save the resume token
    if result.resume_token:
        # Store resume token for later (e.g., in a file or database)
        import json
        with open("resume_token.json", "w") as f:
            json.dump({
                "scope": result.resume_token.scope,
                "filter": result.resume_token.filter,
                "last_timestamp": result.resume_token.last_processed_timestamp,
                "last_id": result.resume_token.last_processed_id,
                "items_done": result.resume_token.items_processed
            }, f)

# Later, resume from where it left off
with open("resume_token.json") as f:
    token_data = json.load(f)

resume_token = BatchEvaluationResumeToken(
    scope=token_data["scope"],
    filter=token_data["filter"],
    last_processed_timestamp=token_data["last_timestamp"],
    last_processed_id=token_data["last_id"],
    items_processed=token_data["items_done"]
)

# Resume the evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    resume_from=resume_token
)

print(f"Processed {result.total_items_processed} additional items")

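Handling partial completion (the snippet reads `result.resume_token` fields before its own `if result.resume_token:` check, so it assumes an incomplete run always carries a token):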

result = client.run_batched_evaluation(...)

if not result.completed:
    print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
    print(f"Last item: {result.resume_token.last_processed_id}")
    print(f"Resume from: {result.resume_token.last_processed_timestamp}")

    # Optionally retry automatically
    if result.resume_token:
        print("Retrying...")
        result = client.run_batched_evaluation(
            scope=result.resume_token.scope,
            mapper=my_mapper,
            evaluators=my_evaluators,
            resume_from=result.resume_token
        )
Note:

All arguments must be passed as keywords when instantiating this class. Because resuming only fetches items after last_processed_timestamp, items at or before that timestamp are skipped, even if they were added after the initial run started. This is intentional: it avoids duplicates and ensures consistent evaluation.

BatchEvaluationResumeToken( *, scope: str, filter: Optional[str], last_processed_timestamp: str, last_processed_id: str, items_processed: int)
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Initialize BatchEvaluationResumeToken with the provided state.

Arguments:
  • scope: The scope type ("traces", "observations").
  • filter: The original JSON filter string.
  • last_processed_timestamp: ISO 8601 timestamp of last processed item.
  • last_processed_id: ID of last processed item.
  • items_processed: Count of items processed before interruption.
Note:

All arguments must be provided as keywords.

scope
filter
last_processed_timestamp
last_processed_id
items_processed
class BatchEvaluationResult:
577class BatchEvaluationResult:
578    r"""Complete result structure for batch evaluation execution.
579
580    This class encapsulates comprehensive statistics and metadata about a batch
581    evaluation run, including counts, evaluator-specific metrics, timing information,
582    error details, and resume capability.
583
584    Attributes:
585        total_items_fetched: Total number of items fetched from the API.
586        total_items_processed: Number of items successfully evaluated.
587        total_items_failed: Number of items that failed during evaluation.
588        total_scores_created: Total scores created by all item-level evaluators.
589        total_composite_scores_created: Scores created by the composite evaluator.
590        total_evaluations_failed: Number of individual evaluator failures across all items.
591        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
592        resume_token: Token for resuming if evaluation was interrupted (None if completed).
593        completed: True if all items were processed, False if stopped early or failed.
594        duration_seconds: Total time taken to execute the batch evaluation.
595        failed_item_ids: List of IDs for items that failed evaluation.
596        error_summary: Dictionary mapping error types to occurrence counts.
597        has_more_items: True if max_items limit was reached but more items exist.
598        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
599
600    Examples:
601        Basic result inspection:
602        ```python
603        result = client.run_batched_evaluation(...)
604
605        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
606        print(f"Scores created: {result.total_scores_created}")
607        print(f"Duration: {result.duration_seconds:.2f}s")
608        print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
609        ```
610
611        Detailed analysis with evaluator stats:
612        ```python
613        result = client.run_batched_evaluation(...)
614
615        print(f"\n📊 Batch Evaluation Results")
616        print(f"{'='*50}")
617        print(f"Items processed: {result.total_items_processed}")
618        print(f"Items failed: {result.total_items_failed}")
619        print(f"Scores created: {result.total_scores_created}")
620
621        if result.total_composite_scores_created > 0:
622            print(f"Composite scores: {result.total_composite_scores_created}")
623
624        print(f"\n📈 Evaluator Performance:")
625        for stats in result.evaluator_stats:
626            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
627            print(f"\n  {stats.name}:")
628            print(f"    Success rate: {success_rate:.1%}")
629            print(f"    Scores created: {stats.total_scores_created}")
630            if stats.failed_runs > 0:
631                print(f"    ⚠️  Failures: {stats.failed_runs}")
632
633        if result.error_summary:
634            print(f"\n⚠️  Errors encountered:")
635            for error_type, count in result.error_summary.items():
636                print(f"    {error_type}: {count}")
637        ```
638
639        Handling incomplete runs:
640        ```python
641        result = client.run_batched_evaluation(...)
642
643        if not result.completed:
644            print("⚠️  Evaluation incomplete!")
645
646            if result.resume_token:
647                print(f"Processed {result.resume_token.items_processed} items before failure")
648                print(f"Use resume_from parameter to continue from:")
649                print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
650                print(f"  Last ID: {result.resume_token.last_processed_id}")
651
652        if result.has_more_items:
653            print(f"ℹ️  More items available beyond max_items limit")
654        ```
655
656        Performance monitoring:
657        ```python
658        result = client.run_batched_evaluation(...)
659
660        items_per_second = result.total_items_processed / result.duration_seconds
661        avg_scores_per_item = result.total_scores_created / result.total_items_processed
662
663        print(f"Performance metrics:")
664        print(f"  Throughput: {items_per_second:.2f} items/second")
665        print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
666        print(f"  Total duration: {result.duration_seconds:.2f}s")
667
668        if result.total_evaluations_failed > 0:
669            failure_rate = result.total_evaluations_failed / (
670                result.total_items_processed * len(result.evaluator_stats)
671            )
672            print(f"  Evaluation failure rate: {failure_rate:.1%}")
673        ```
674
675    Note:
676        All arguments must be passed as keywords when instantiating this class.
677    """
678
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations
732
733    def __str__(self) -> str:
734        """Return a formatted string representation of the batch evaluation results.
735
736        Returns:
737            A multi-line string with a summary of the evaluation results.
738        """
739        lines = []
740        lines.append("=" * 60)
741        lines.append("Batch Evaluation Results")
742        lines.append("=" * 60)
743
744        # Summary statistics
745        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
746        lines.append(f"Duration: {self.duration_seconds:.2f}s")
747        lines.append(f"\nItems fetched: {self.total_items_fetched}")
748        lines.append(f"Items processed: {self.total_items_processed}")
749
750        if self.total_items_failed > 0:
751            lines.append(f"Items failed: {self.total_items_failed}")
752
753        # Success rate
754        if self.total_items_fetched > 0:
755            success_rate = self.total_items_processed / self.total_items_fetched * 100
756            lines.append(f"Success rate: {success_rate:.1f}%")
757
758        # Scores created
759        lines.append(f"\nScores created: {self.total_scores_created}")
760        if self.total_composite_scores_created > 0:
761            lines.append(f"Composite scores: {self.total_composite_scores_created}")
762
763        total_scores = self.total_scores_created + self.total_composite_scores_created
764        lines.append(f"Total scores: {total_scores}")
765
766        # Evaluator statistics
767        if self.evaluator_stats:
768            lines.append("\nEvaluator Performance:")
769            for stats in self.evaluator_stats:
770                lines.append(f"  {stats.name}:")
771                if stats.total_runs > 0:
772                    success_rate = (
773                        stats.successful_runs / stats.total_runs * 100
774                        if stats.total_runs > 0
775                        else 0
776                    )
777                    lines.append(
778                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
779                        f"({success_rate:.1f}% success)"
780                    )
781                    lines.append(f"    Scores created: {stats.total_scores_created}")
782                    if stats.failed_runs > 0:
783                        lines.append(f"    Failed runs: {stats.failed_runs}")
784
785        # Performance metrics
786        if self.total_items_processed > 0 and self.duration_seconds > 0:
787            items_per_sec = self.total_items_processed / self.duration_seconds
788            lines.append("\nPerformance:")
789            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
790            if self.total_scores_created > 0:
791                avg_scores = self.total_scores_created / self.total_items_processed
792                lines.append(f"  Avg scores per item: {avg_scores:.2f}")
793
794        # Errors and warnings
795        if self.error_summary:
796            lines.append("\nErrors encountered:")
797            for error_type, count in self.error_summary.items():
798                lines.append(f"  {error_type}: {count}")
799
800        # Incomplete run information
801        if not self.completed:
802            lines.append("\nWarning: Evaluation incomplete")
803            if self.resume_token:
804                lines.append(
805                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
806                )
807                lines.append(f"  Items processed: {self.resume_token.items_processed}")
808                lines.append("  Use resume_from parameter to continue")
809
810        if self.has_more_items:
811            lines.append("\nNote: More items available beyond max_items limit")
812
813        lines.append("=" * 60)
814        return "\n".join(lines)

Complete result structure for batch evaluation execution.

This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.

Attributes:
  • total_items_fetched: Total number of items fetched from the API.
  • total_items_processed: Number of items successfully evaluated.
  • total_items_failed: Number of items that failed during evaluation.
  • total_scores_created: Total scores created by all item-level evaluators.
  • total_composite_scores_created: Scores created by the composite evaluator.
  • total_evaluations_failed: Number of individual evaluator failures across all items.
  • evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
  • resume_token: Token for resuming if evaluation was interrupted (None if completed).
  • completed: True if all items were processed, False if stopped early or failed.
  • duration_seconds: Total time taken to execute the batch evaluation.
  • failed_item_ids: List of IDs for items that failed evaluation.
  • error_summary: Dictionary mapping error types to occurrence counts.
  • has_more_items: True if max_items limit was reached but more items exist.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:

Basic result inspection:

result = client.run_batched_evaluation(...)

print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
print(f"Scores created: {result.total_scores_created}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")

Detailed analysis with evaluator stats:

result = client.run_batched_evaluation(...)

print(f"\n📊 Batch Evaluation Results")
print(f"{'='*50}")
print(f"Items processed: {result.total_items_processed}")
print(f"Items failed: {result.total_items_failed}")
print(f"Scores created: {result.total_scores_created}")

if result.total_composite_scores_created > 0:
    print(f"Composite scores: {result.total_composite_scores_created}")

print(f"\n📈 Evaluator Performance:")
for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
    print(f"\n  {stats.name}:")
    print(f"    Success rate: {success_rate:.1%}")
    print(f"    Scores created: {stats.total_scores_created}")
    if stats.failed_runs > 0:
        print(f"    ⚠️  Failures: {stats.failed_runs}")

if result.error_summary:
    print(f"\n⚠️  Errors encountered:")
    for error_type, count in result.error_summary.items():
        print(f"    {error_type}: {count}")

Handling incomplete runs:

result = client.run_batched_evaluation(...)

if not result.completed:
    print("⚠️  Evaluation incomplete!")

    if result.resume_token:
        print(f"Processed {result.resume_token.items_processed} items before failure")
        print(f"Use resume_from parameter to continue from:")
        print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
        print(f"  Last ID: {result.resume_token.last_processed_id}")

if result.has_more_items:
    print(f"ℹ️  More items available beyond max_items limit")

Performance monitoring:

result = client.run_batched_evaluation(...)

items_per_second = result.total_items_processed / result.duration_seconds
avg_scores_per_item = result.total_scores_created / result.total_items_processed

print(f"Performance metrics:")
print(f"  Throughput: {items_per_second:.2f} items/second")
print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
print(f"  Total duration: {result.duration_seconds:.2f}s")

if result.total_evaluations_failed > 0:
    failure_rate = result.total_evaluations_failed / (
        result.total_items_processed * len(result.evaluator_stats)
    )
    print(f"  Evaluation failure rate: {failure_rate:.1%}")
Note:

All arguments must be passed as keywords when instantiating this class.

BatchEvaluationResult( *, total_items_fetched: int, total_items_processed: int, total_items_failed: int, total_scores_created: int, total_composite_scores_created: int, total_evaluations_failed: int, evaluator_stats: List[EvaluatorStats], resume_token: Optional[BatchEvaluationResumeToken], completed: bool, duration_seconds: float, failed_item_ids: List[str], error_summary: Dict[str, int], has_more_items: bool, item_evaluations: Dict[str, List[Evaluation]])
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations

Initialize BatchEvaluationResult with comprehensive statistics.

Arguments:
  • total_items_fetched: Total items fetched from API.
  • total_items_processed: Items successfully evaluated.
  • total_items_failed: Items that failed evaluation.
  • total_scores_created: Scores from item-level evaluators.
  • total_composite_scores_created: Scores from composite evaluator.
  • total_evaluations_failed: Individual evaluator failures.
  • evaluator_stats: Per-evaluator statistics.
  • resume_token: Token for resuming (None if completed).
  • completed: Whether all items were processed.
  • duration_seconds: Total execution time.
  • failed_item_ids: IDs of failed items.
  • error_summary: Error types and counts.
  • has_more_items: Whether more items exist beyond max_items.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:

All arguments must be provided as keywords.

total_items_fetched
total_items_processed
total_items_failed
total_scores_created
total_composite_scores_created
total_evaluations_failed
evaluator_stats
resume_token
completed
duration_seconds
failed_item_ids
error_summary
has_more_items
item_evaluations
class RunnerContext:
class RunnerContext:
    """Carry CI-provided defaults and apply them to :meth:`Langfuse.run_experiment`.

    Built for the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action), which constructs a
    ``RunnerContext`` and passes it to the user's ``experiment(context)``
    function. Values stored here (dataset, metadata tags) are used only when
    the matching argument is omitted on :meth:`run_experiment`; anything the
    user passes explicitly takes precedence.
    """

    def __init__(
        self,
        *,
        client: "Langfuse",
        data: Optional[ExperimentData] = None,
        dataset_version: Optional[datetime] = None,
        metadata: Optional[Dict[str, str]] = None,
    ):
        """Store the client and the optional defaults used by ``run_experiment``.

        Normally invoked by the ``langfuse/experiment-action`` GitHub Action
        rather than by end users. Only ``client`` is required; a field left as
        ``None`` provides no default, so the matching argument has to be
        given on the :meth:`run_experiment` call instead.

        Args:
            client: Initialized Langfuse SDK client that executes the
                experiment. The action builds it from the
                ``langfuse_public_key`` / ``langfuse_secret_key`` /
                ``langfuse_base_url`` inputs.
            data: Default items to run the experiment on, either
                ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
                Injected by the action when ``dataset_name`` is configured.
                When ``None``, ``data=`` must be passed to
                :meth:`run_experiment`.
            dataset_version: Optional pinned dataset version, injected by the
                action when ``dataset_version`` is configured.
            metadata: Default metadata for every experiment trace and the
                dataset run. The action injects GitHub-sourced tags (SHA, PR
                link, workflow run link, branch, GH user, etc.). It is merged
                with the ``metadata`` passed to :meth:`run_experiment`;
                user-supplied keys win on collision.
        """
        self.client = client
        self.data = data
        self.dataset_version = dataset_version
        self.metadata = metadata

    def run_experiment(
        self,
        *,
        name: str,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
        data: Optional[ExperimentData] = None,
        task: TaskFunction,
        evaluators: List[EvaluatorFunction] = [],
        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
        run_evaluators: List[RunEvaluatorFunction] = [],
        max_concurrency: int = 50,
        metadata: Optional[Dict[str, str]] = None,
        _dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Run an experiment on the wrapped client, filling in context defaults.

        ``data`` and ``_dataset_version`` fall back to the values stored on
        this context when they are ``None``. ``metadata`` is merged with the
        context metadata. All other arguments are forwarded unchanged to
        ``self.client.run_experiment``.

        Args:
            name: Forwarded unchanged.
            run_name: Forwarded unchanged.
            description: Forwarded unchanged.
            data: Items to run on; when ``None`` the context's ``data`` is used.
            task: Forwarded unchanged.
            evaluators: Forwarded unchanged.
            composite_evaluator: Forwarded unchanged.
            run_evaluators: Forwarded unchanged.
            max_concurrency: Forwarded unchanged.
            metadata: Per-call metadata; its keys override context metadata
                keys on collision.
            _dataset_version: Dataset version; when ``None`` the context's
                ``dataset_version`` is used.

        Returns:
            Whatever ``self.client.run_experiment`` returns.

        Raises:
            ValueError: If ``data`` is ``None`` both on this call and on the
                context.
        """
        effective_data = self.data if data is None else data
        if effective_data is None:
            raise ValueError(
                "`data` must be provided either on the RunnerContext or the run_experiment call"
            )

        effective_version = (
            self.dataset_version if _dataset_version is None else _dataset_version
        )

        # Stay ``None`` when neither side supplied metadata; otherwise layer the
        # per-call values on top of the context values so the caller wins.
        combined_metadata: Optional[Dict[str, str]] = None
        if self.metadata is not None or metadata is not None:
            combined_metadata = dict(self.metadata or {})
            combined_metadata.update(metadata or {})

        return self.client.run_experiment(
            name=name,
            run_name=run_name,
            description=description,
            data=effective_data,
            task=task,
            evaluators=evaluators,
            composite_evaluator=composite_evaluator,
            run_evaluators=run_evaluators,
            max_concurrency=max_concurrency,
            metadata=combined_metadata,
            _dataset_version=effective_version,
        )

Wraps Langfuse.run_experiment() with CI-injected defaults.

Intended for use with the langfuse/experiment-action GitHub Action (https://github.com/langfuse/experiment-action). The action builds a RunnerContext before invoking the user's experiment(context) function. Defaults set here (dataset, metadata tags) are applied when the user omits them on the run_experiment() call; users can override any default by passing the corresponding argument explicitly.

RunnerContext( *, client: Langfuse, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem], NoneType] = None, dataset_version: Optional[datetime.datetime] = None, metadata: Optional[Dict[str, str]] = None)
1073    def __init__(
1074        self,
1075        *,
1076        client: "Langfuse",
1077        data: Optional[ExperimentData] = None,
1078        dataset_version: Optional[datetime] = None,
1079        metadata: Optional[Dict[str, str]] = None,
1080    ):
1081        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.
1082
1083        Typically called by the ``langfuse/experiment-action`` GitHub Action,
1084        not by end users directly. Every field except ``client`` is optional:
1085        fields left as ``None`` simply mean the corresponding argument must be
1086        supplied on the :meth:`run_experiment` call.
1087
1088        Args:
1089            client: Initialized Langfuse SDK client used to execute the
1090                experiment. The action creates this from the
1091                ``langfuse_public_key`` / ``langfuse_secret_key`` /
1092                ``langfuse_base_url`` inputs.
1093            data: Default dataset items to run the experiment on. Accepts
1094                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
1095                Injected by the action when ``dataset_name`` is configured.
1096                If ``None``, the user must pass ``data=`` to
1097                :meth:`run_experiment`.
1098            dataset_version: Optional pinned dataset version. Injected by the
1099                action when ``dataset_version`` is configured.
1100            metadata: Default metadata attached to every experiment trace and
1101                the dataset run. The action injects GitHub-sourced tags (SHA,
1102                PR link, workflow run link, branch, GH user, etc.). Merged
1103                with any ``metadata`` passed to :meth:`run_experiment`, with
1104                user-supplied keys winning on collision.
1105        """
1106        self.client = client
1107        self.data = data
1108        self.dataset_version = dataset_version
1109        self.metadata = metadata

Build a RunnerContext populated with defaults for run_experiment.

Typically called by the langfuse/experiment-action GitHub Action, not by end users directly. Every field except client is optional: fields left as None simply mean the corresponding argument must be supplied on the run_experiment() call.

Arguments:
  • client: Initialized Langfuse SDK client used to execute the experiment. The action creates this from the langfuse_public_key / langfuse_secret_key / langfuse_base_url inputs.
  • data: Default dataset items to run the experiment on. Accepts either List[LocalExperimentItem] or List[DatasetItem]. Injected by the action when dataset_name is configured. If None, the user must pass data= to run_experiment().
  • dataset_version: Optional pinned dataset version. Injected by the action when dataset_version is configured.
  • metadata: Default metadata attached to every experiment trace and the dataset run. The action injects GitHub-sourced tags (SHA, PR link, workflow run link, branch, GH user, etc.). Merged with any metadata passed to run_experiment(), with user-supplied keys winning on collision.
client
data
dataset_version
metadata
def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem], NoneType] = None, task: langfuse.experiment.TaskFunction, evaluators: List[langfuse.experiment.EvaluatorFunction] = [], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, run_evaluators: List[langfuse.experiment.RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None, _dataset_version: Optional[datetime.datetime] = None) -> langfuse.experiment.ExperimentResult:
1111    def run_experiment(
1112        self,
1113        *,
1114        name: str,
1115        run_name: Optional[str] = None,
1116        description: Optional[str] = None,
1117        data: Optional[ExperimentData] = None,
1118        task: TaskFunction,
1119        evaluators: List[EvaluatorFunction] = [],
1120        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
1121        run_evaluators: List[RunEvaluatorFunction] = [],
1122        max_concurrency: int = 50,
1123        metadata: Optional[Dict[str, str]] = None,
1124        _dataset_version: Optional[datetime] = None,
1125    ) -> ExperimentResult:
1126        resolved_data = data if data is not None else self.data
1127        if resolved_data is None:
1128            raise ValueError(
1129                "`data` must be provided either on the RunnerContext or the run_experiment call"
1130            )
1131
1132        resolved_dataset_version = (
1133            _dataset_version if _dataset_version is not None else self.dataset_version
1134        )
1135
1136        merged_metadata: Optional[Dict[str, str]]
1137        if self.metadata is None and metadata is None:
1138            merged_metadata = None
1139        else:
1140            merged_metadata = {**(self.metadata or {}), **(metadata or {})}
1141
1142        return self.client.run_experiment(
1143            name=name,
1144            run_name=run_name,
1145            description=description,
1146            data=resolved_data,
1147            task=task,
1148            evaluators=evaluators,
1149            composite_evaluator=composite_evaluator,
1150            run_evaluators=run_evaluators,
1151            max_concurrency=max_concurrency,
1152            metadata=merged_metadata,
1153            _dataset_version=resolved_dataset_version,
1154        )
class RegressionError(builtins.Exception):
class RegressionError(Exception):
    """Raised by a user's ``experiment`` function to signal a CI gate failure.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action catches this
    exception and, when ``should_fail_on_error`` is enabled, fails the
    workflow run and renders a callout in the PR comment using
    ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.

    Callers choose one of three forms:

    - ``RegressionError(result=r)`` — minimal, generic message.
    - ``RegressionError(result=r, message="...")`` — free-form message.
    - ``RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)`` —
      structured; ``metric`` and ``value`` must be provided together so the
      action can render a targeted callout without ``None`` placeholders.
    """

    @overload
    def __init__(self, *, result: ExperimentResult) -> None: ...
    @overload
    def __init__(self, *, result: ExperimentResult, message: str) -> None: ...
    @overload
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: str,
        value: float,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ) -> None: ...
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: Optional[str] = None,
        value: Optional[float] = None,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ):
        """Store the regression details and build the exception message.

        The message is chosen in this order: an explicit ``message``; else a
        structured message when both ``metric`` and ``value`` are given; else
        a generic fallback.

        Args:
            result: Experiment result the regression was detected on.
            metric: Name of the regressed metric.
            value: Observed value of ``metric``.
            threshold: Threshold the value was compared against. It is only
                mentioned in the structured message when provided.
            message: Free-form message overriding the generated one.
        """
        self.result = result
        self.metric = metric
        self.value = value
        self.threshold = threshold
        if message is not None:
            formatted = message
        elif metric is not None and value is not None:
            formatted = f"Regression on `{metric}`: {value}"
            # ``threshold`` is optional in the structured form; leave it out
            # instead of printing a literal "(threshold None)".
            if threshold is not None:
                formatted += f" (threshold {threshold})"
        else:
            formatted = "Experiment regression detected"
        super().__init__(formatted)

Raised by a user's experiment function to signal a CI gate failure.

Intended for use with the langfuse/experiment-action GitHub Action (https://github.com/langfuse/experiment-action). The action catches this exception and, when should_fail_on_error is enabled, fails the workflow run and renders a callout in the PR comment using metric/value/threshold if supplied, otherwise str(exc).

Callers choose one of three forms:

  • RegressionError(result=r) — minimal, generic message.
  • RegressionError(result=r, message="...") — free-form message.
  • RegressionError(result=r, metric="acc", value=0.7, threshold=0.9) — structured; metric and value must be provided together so the action can render a targeted callout without None placeholders.
RegressionError( *, result: langfuse.experiment.ExperimentResult, metric: Optional[str] = None, value: Optional[float] = None, threshold: Optional[float] = None, message: Optional[str] = None)
1189    def __init__(
1190        self,
1191        *,
1192        result: ExperimentResult,
1193        metric: Optional[str] = None,
1194        value: Optional[float] = None,
1195        threshold: Optional[float] = None,
1196        message: Optional[str] = None,
1197    ):
1198        self.result = result
1199        self.metric = metric
1200        self.value = value
1201        self.threshold = threshold
1202        if message is not None:
1203            formatted = message
1204        elif metric is not None and value is not None:
1205            formatted = f"Regression on `{metric}`: {value} (threshold {threshold})"
1206        else:
1207            formatted = "Experiment regression detected"
1208        super().__init__(formatted)
result
metric
value
threshold
__version__ = '4.13.0'
def is_default_export_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
 98def is_default_export_span(span: ReadableSpan) -> bool:
 99    """Return whether a span should be exported by default."""
100    return (
101        is_langfuse_span(span) or is_genai_span(span) or is_known_llm_instrumentor(span)
102    )

Return whether a span should be exported by default.

def is_langfuse_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
def is_langfuse_span(span: ReadableSpan) -> bool:
    """Tell whether ``span``'s instrumentation scope is the Langfuse SDK tracer.

    A span without an instrumentation scope is never a Langfuse span.
    """
    scope = span.instrumentation_scope
    if scope is None:
        return False
    return scope.name == LANGFUSE_TRACER_NAME

Return whether the span was created by the Langfuse SDK tracer.

def is_genai_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
def is_genai_span(span: ReadableSpan) -> bool:
    """Tell whether any attribute key on ``span`` starts with ``gen_ai``.

    Spans with no attribute mapping yield ``False``, and non-string keys are
    ignored.
    """
    attributes = span.attributes
    if attributes is None:
        return False

    for attribute_name in attributes.keys():
        if isinstance(attribute_name, str) and attribute_name.startswith("gen_ai"):
            return True
    return False

Return whether the span has any gen_ai.* semantic convention attribute.

def is_known_llm_instrumentor(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
def is_known_llm_instrumentor(span: ReadableSpan) -> bool:
    """Tell whether ``span``'s scope name matches a known LLM instrumentation prefix.

    A span without an instrumentation scope yields ``False``. Otherwise the
    scope name is tested against each entry of
    ``KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES`` via ``_matches_scope_prefix``.
    """
    scope = span.instrumentation_scope
    if scope is None:
        return False

    name = scope.name
    for known_prefix in KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES:
        if _matches_scope_prefix(name, known_prefix):
            return True
    return False

Return whether the span comes from a known LLM instrumentation scope.

KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES = frozenset({'haystack', 'opentelemetry.instrumentation.transformers', 'litellm', 'ai', 'opentelemetry.instrumentation.crewai', 'pydantic-ai', 'opentelemetry.instrumentation.groq', 'openinference', 'opentelemetry.instrumentation.cohere', 'opentelemetry.instrumentation.openai', 'opentelemetry.instrumentation.llamaindex', 'opentelemetry.instrumentation.agno', 'autogen-core', 'opentelemetry.instrumentation.replicate', 'opentelemetry.instrumentation.watsonx', 'opentelemetry.instrumentation.ollama', 'opentelemetry.instrumentation.openai_v2', 'agent_framework', 'opentelemetry.instrumentation.bedrock', 'strands-agents', 'opentelemetry.instrumentation.mistralai', 'langsmith', 'opentelemetry.instrumentation.writer', 'opentelemetry.instrumentation.openai_agents', 'opentelemetry.instrumentation.voyageai', 'opentelemetry.instrumentation.together', 'langfuse-sdk', 'opentelemetry.instrumentation.google_generativeai', 'opentelemetry.instrumentation.langchain', 'opentelemetry.instrumentation.alephalpha', 'opentelemetry.instrumentation.haystack', 'vllm', 'opentelemetry.instrumentation.anthropic', 'opentelemetry.instrumentation.sagemaker', 'opentelemetry.instrumentation.vertexai'})
class MaskOtelSpansFunction(typing.Protocol):
class MaskOtelSpansFunction(Protocol):
    """Callable protocol for masking OpenTelemetry spans at export time.

    A `mask_otel_spans` callable is invoked once Langfuse has selected the
    spans this client will export, and after export-stage media handling has
    replaced supported media payloads with Langfuse media references. Only the
    spans exported by this Langfuse client are affected; any other exporter
    that receives the same OpenTelemetry spans gets its own unmodified copy.

    The callable is synchronous. It normally executes on the OpenTelemetry
    batch span processor worker thread, but may execute on the caller thread
    during `flush()` and shutdown. It should be fast and deterministic, and
    should not depend on request locals, the current active span, or async
    I/O.

    Returning `None` keeps the entire batch as is. Returning a
    `MaskOtelSpansResult` applies its sparse patches to the spans that should
    change.

    Example:
        ```python
        from typing import Optional

        from langfuse import Langfuse
        from langfuse.types import (
            MaskOtelSpansParams,
            MaskOtelSpansResult,
            OtelSpanPatch,
        )

        def mask_otel_spans(
            *, params: MaskOtelSpansParams
        ) -> Optional[MaskOtelSpansResult]:
            patches = {}

            for identifier, span in params.spans.items():
                if span.instrumentation_scope_name == "openai":
                    patches[identifier] = OtelSpanPatch(
                        delete_attributes=(
                            "gen_ai.prompt.0.content",
                            "gen_ai.completion.0.content",
                        ),
                        set_attributes={"masking.applied": True},
                    )

            return MaskOtelSpansResult(span_patches=patches)

        langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
        ```
    """

    def __call__(
        self, *, params: MaskOtelSpansParams
    ) -> Optional[MaskOtelSpansResult]: ...

Function protocol for export-stage OpenTelemetry span masking.

mask_otel_spans runs after Langfuse decides which spans this client should export and after export-stage media handling has converted supported media payloads into Langfuse media references. It affects only the spans exported by this Langfuse client. If the same OpenTelemetry spans are sent to another exporter, that exporter receives its own unmodified copy.

The function is synchronous. It usually runs on the OpenTelemetry batch span processor worker thread; during flush() and shutdown it may run on the caller thread. Keep it deterministic and fast, and avoid relying on request locals, the current active span, or async I/O.

Return None to leave the whole batch unchanged, or return MaskOtelSpansResult with sparse patches for the spans that should change.

Example:
from typing import Optional

from langfuse import Langfuse
from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)

def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}

    for identifier, span in params.spans.items():
        if span.instrumentation_scope_name == "openai":
            patches[identifier] = OtelSpanPatch(
                delete_attributes=(
                    "gen_ai.prompt.0.content",
                    "gen_ai.completion.0.content",
                ),
                set_attributes={"masking.applied": True},
            )

    return MaskOtelSpansResult(span_patches=patches)

langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
MaskOtelSpansFunction(*args, **kwargs)
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` that rejects protocols or swaps in a real ``__init__``.

    Raises ``TypeError`` when ``type(self)`` is flagged as a protocol. If the
    class already has a different ``__init__``, it returns without doing
    anything. Otherwise the first ``__init__`` found in the MRO that is not
    this placeholder is installed on the class (falling back to
    ``object.__init__``) and then called with the given arguments.
    """
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    # Delegate this first construction to the `__init__` that was just installed.
    cls.__init__(self, *args, **kwargs)
@dataclass(frozen=True)
class MaskOtelSpansParams:
@dataclass(frozen=True)
class MaskOtelSpansParams:
    """Argument object handed to an export-stage span masking function.

    Each call carries a single OpenTelemetry export batch. A batch is not
    guaranteed to hold a complete trace, request, or Langfuse observation
    tree; what it contains depends on OpenTelemetry span processor settings
    such as `flush_at` and `flush_interval`, on explicit `flush()` calls, and
    on shutdown.

    Example:
        ```python
        from typing import Optional

        from langfuse.types import (
            MaskOtelSpansParams,
            MaskOtelSpansResult,
            OtelSpanPatch,
        )

        def mask_otel_spans(
            *, params: MaskOtelSpansParams
        ) -> Optional[MaskOtelSpansResult]:
            patches = {}

            for identifier, span in params.spans.items():
                if "http.request.header.authorization" in span.attributes:
                    patches[identifier] = OtelSpanPatch(
                        delete_attributes=("http.request.header.authorization",),
                        set_attributes={"security.redacted": True},
                    )

            return MaskOtelSpansResult(span_patches=patches)
        ```

    Attributes:
        spans: Read-only mapping of stable span identifiers to span
            snapshots. Patches must be returned under keys taken from this
            mapping.
    """

    spans: Mapping[OtelSpanIdentifier, OtelSpanData]

Input passed to an export-stage OpenTelemetry span masking function.

A single call receives one OpenTelemetry export batch, not necessarily a complete trace, request, or Langfuse observation tree. Batch contents depend on OpenTelemetry span processor settings such as flush_at, flush_interval, explicit flush(), and shutdown.

Example:
from typing import Optional

from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)

def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}

    for identifier, span in params.spans.items():
        if "http.request.header.authorization" in span.attributes:
            patches[identifier] = OtelSpanPatch(
                delete_attributes=("http.request.header.authorization",),
                set_attributes={"security.redacted": True},
            )

    return MaskOtelSpansResult(span_patches=patches)
Attributes:
  • spans: Read-only mapping from stable span identifiers to span snapshots. Return patches using keys from this mapping.
MaskOtelSpansParams( spans: Mapping[OtelSpanIdentifier, OtelSpanData])
spans: Mapping[OtelSpanIdentifier, OtelSpanData]
@dataclass(frozen=True)
class MaskOtelSpansResult:
@dataclass(frozen=True)
class MaskOtelSpansResult:
    """Return value of a `mask_otel_spans` function: the patches to apply.

    Spans that need no change can be left out. Mapping a span to `None` also
    leaves it unchanged. Returning an invalid patch in order to drop a span is
    not a supported API; use `should_export_span` for span-level export
    filtering.

    When `mask_otel_spans` raises, or returns something other than a
    `MaskOtelSpansResult`, Langfuse drops the whole export batch. When a
    single `OtelSpanPatch` is invalid, Langfuse drops only that span from the
    export batch.

    Attributes:
        span_patches: Mapping from identifiers in `MaskOtelSpansParams.spans`
            to sparse attribute patches. Defaults to an empty read-only
            mapping.
    """

    span_patches: Mapping[OtelSpanIdentifier, Optional[OtelSpanPatch]] = field(
        default_factory=lambda: MappingProxyType({})
    )

Patches returned by a mask_otel_spans function.

Omit spans that do not need changes. A mapping value of None also leaves that span unchanged. Returning an invalid patch to drop a span is not a supported API; use should_export_span when you need span-level export filtering.

If mask_otel_spans raises or returns an object that is not a MaskOtelSpansResult, Langfuse drops the whole export batch. If one individual OtelSpanPatch is invalid, Langfuse drops only that span from the export batch.

Attributes:
  • span_patches: Mapping from identifiers in MaskOtelSpansParams.spans to sparse attribute patches.
MaskOtelSpansResult( span_patches: Mapping[OtelSpanIdentifier, Optional[OtelSpanPatch]] = <factory>)
span_patches: Mapping[OtelSpanIdentifier, Optional[OtelSpanPatch]]
@dataclass(frozen=True)
class OtelSpanData:
 82@dataclass(frozen=True)
 83class OtelSpanData:
 84    """Read-only OpenTelemetry span snapshot passed to `mask_otel_spans`.
 85
 86    The snapshot contains the span data that Langfuse is about to export after
 87    the SDK has applied `should_export_span` filtering and export-stage media
 88    processing. The mappings are immutable views and mutating them is not
 89    supported; return an `OtelSpanPatch` to change exported attributes.
 90
 91    `mask_otel_spans` can only change span attributes. It cannot change the
 92    span name, IDs, parent relationship, resource attributes, events, links, or
 93    instrumentation scope.
 94
 95    Attributes:
 96        trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
 97        span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
 98        parent_span_id: Lowercase hexadecimal parent span ID, or `None` for a
 99            root span or when the parent is not available.
100        name: OpenTelemetry span name.
101        instrumentation_scope_name: Name of the instrumentation scope that
102            emitted the span, for example `openai` or `langfuse`.
103        instrumentation_scope_version: Version of the instrumentation scope, if
104            the instrumentation library provided one.
105        attributes: Read-only attributes that will be exported unless patched.
106            Values use OpenTelemetry `AttributeValue` types: strings, booleans,
107            numbers, or homogeneous sequences of those scalar values.
108        resource_attributes: Read-only resource attributes from the span's
109            OpenTelemetry resource. These are available for decisions only and
110            cannot be patched through `mask_otel_spans`.
111    """
112
113    trace_id: str
114    span_id: str
115    parent_span_id: Optional[str]
116    name: str
117    instrumentation_scope_name: Optional[str]
118    instrumentation_scope_version: Optional[str]
119    attributes: Mapping[str, AttributeValue]
120    resource_attributes: Mapping[str, AttributeValue]

Read-only OpenTelemetry span snapshot passed to mask_otel_spans.

The snapshot contains the span data that Langfuse is about to export after the SDK has applied should_export_span filtering and export-stage media processing. The mappings are immutable views and mutating them is not supported; return an OtelSpanPatch to change exported attributes.

mask_otel_spans can only change span attributes. It cannot change the span name, IDs, parent relationship, resource attributes, events, links, or instrumentation scope.

Attributes:
  • trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
  • span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
  • parent_span_id: Lowercase hexadecimal parent span ID, or None for a root span or when the parent is not available.
  • name: OpenTelemetry span name.
  • instrumentation_scope_name: Name of the instrumentation scope that emitted the span, for example openai or langfuse.
  • instrumentation_scope_version: Version of the instrumentation scope, if the instrumentation library provided one.
  • attributes: Read-only attributes that will be exported unless patched. Values use OpenTelemetry AttributeValue types: strings, booleans, numbers, or homogeneous sequences of those scalar values.
  • resource_attributes: Read-only resource attributes from the span's OpenTelemetry resource. These are available for decisions only and cannot be patched through mask_otel_spans.
OtelSpanData( trace_id: str, span_id: str, parent_span_id: Optional[str], name: str, instrumentation_scope_name: Optional[str], instrumentation_scope_version: Optional[str], attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]], resource_attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]])
trace_id: str
span_id: str
parent_span_id: Optional[str]
name: str
instrumentation_scope_name: Optional[str]
instrumentation_scope_version: Optional[str]
attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]]
resource_attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]]
@dataclass(frozen=True)
class OtelSpanIdentifier:
65@dataclass(frozen=True)  # frozen: instances are immutable and hashable, so they can be used as mapping keys
66class OtelSpanIdentifier:
67    """Stable key for one OpenTelemetry span in a masking batch.
68
69    Use this object as the key when returning a patch for a span. It is a
70    frozen, hashable dataclass, so the safest pattern is to reuse the exact
71    identifier object from `MaskOtelSpansParams.spans` instead of rebuilding it.
72
73    Attributes:
74        trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
75        span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
76    """
77
78    trace_id: str  # lowercase 32-character hexadecimal trace ID
79    span_id: str  # lowercase 16-character hexadecimal span ID

Stable key for one OpenTelemetry span in a masking batch.

Use this object as the key when returning a patch for a span. It is a frozen, hashable dataclass, so the safest pattern is to reuse the exact identifier object from MaskOtelSpansParams.spans instead of rebuilding it.

Attributes:
  • trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
  • span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
OtelSpanIdentifier(trace_id: str, span_id: str)
trace_id: str
span_id: str
@dataclass(frozen=True)
class OtelSpanPatch:
165@dataclass(frozen=True)  # frozen: fields cannot be reassigned after construction
166class OtelSpanPatch:
167    """Attribute changes to apply to one OpenTelemetry span before export.
168
169    Patches are sparse: include only the attributes that should change. Langfuse
170    deletes `delete_attributes` first and then applies `set_attributes`, so a key
171    present in both fields is exported with the value from `set_attributes`.
172
173    Attribute values must be valid OpenTelemetry attributes: strings, booleans,
174    integers, floats, or homogeneous sequences of those scalar types. If one
175    value is not valid for OpenTelemetry, Langfuse removes that attribute from
176    the export rather than sending an invalid span.
177
178    Example:
179        ```python
180        OtelSpanPatch(
181            delete_attributes=("gen_ai.prompt.0.content",),
182            set_attributes={
183                "gen_ai.prompt.redacted": True,
184                "app.masking.rule": "drop_prompt_text",
185            },
186        )
187        ```
188
189    Attributes:
190        set_attributes: Attribute values to add or replace on the exported span.
191        delete_attributes: Attribute keys to remove from the exported span.
192    """
193
194    set_attributes: Mapping[str, AttributeValue] = field(
195        default_factory=lambda: MappingProxyType({})  # default: a new, empty read-only mapping per instance
196    )
197    delete_attributes: Sequence[str] = field(default_factory=tuple)  # default: an empty tuple (nothing deleted)

Attribute changes to apply to one OpenTelemetry span before export.

Patches are sparse: include only the attributes that should change. Langfuse deletes delete_attributes first and then applies set_attributes, so a key present in both fields is exported with the value from set_attributes.

Attribute values must be valid OpenTelemetry attributes: strings, booleans, integers, floats, or homogeneous sequences of those scalar types. If one value is not valid for OpenTelemetry, Langfuse removes that attribute from the export rather than sending an invalid span.

Example:
OtelSpanPatch(
    delete_attributes=("gen_ai.prompt.0.content",),
    set_attributes={
        "gen_ai.prompt.redacted": True,
        "app.masking.rule": "drop_prompt_text",
    },
)

Attributes:
  • set_attributes: Attribute values to add or replace on the exported span.
  • delete_attributes: Attribute keys to remove from the exported span.
OtelSpanPatch(set_attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]] = <factory>, delete_attributes: Sequence[str] = <factory>)
set_attributes: Mapping[str, str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]]
delete_attributes: Sequence[str]