langfuse

Langfuse GitHub Banner

Langfuse Python SDK

MIT License CI test status PyPI Version GitHub Repo stars Discord YC W23

Installation

Important

The SDK was rewritten in v3 and released in June 2025. Refer to the v3 migration guide for instructions on updating your code.

pip install langfuse

Docs

Please see our docs for detailed information on this SDK.

 1""".. include:: ../README.md"""
 2
 3from langfuse.batch_evaluation import (
 4    BatchEvaluationResult,
 5    BatchEvaluationResumeToken,
 6    CompositeEvaluatorFunction,
 7    EvaluatorInputs,
 8    EvaluatorStats,
 9    MapperFunction,
10)
11from langfuse.experiment import Evaluation
12
13from ._client import client as _client_module
14from ._client.attributes import LangfuseOtelSpanAttributes
15from ._client.constants import ObservationTypeLiteral
16from ._client.get_client import get_client
17from ._client.observe import observe
18from ._client.propagation import propagate_attributes
19from ._client.span import (
20    LangfuseAgent,
21    LangfuseChain,
22    LangfuseEmbedding,
23    LangfuseEvaluator,
24    LangfuseEvent,
25    LangfuseGeneration,
26    LangfuseGuardrail,
27    LangfuseRetriever,
28    LangfuseSpan,
29    LangfuseTool,
30)
31from .span_filter import (
32    KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES,
33    is_default_export_span,
34    is_genai_span,
35    is_known_llm_instrumentor,
36    is_langfuse_span,
37)
38
# Re-export the main client class at package level, enabling
# `from langfuse import Langfuse`.
Langfuse = _client_module.Langfuse

# Explicit public API of the `langfuse` package; star-imports and
# documentation tooling rely on this list.
# NOTE(review): "experiment" and "api" are listed but not imported above —
# presumably they name submodules of the package; confirm they resolve.
__all__ = [
    "Langfuse",
    "get_client",
    "observe",
    "propagate_attributes",
    "ObservationTypeLiteral",
    "LangfuseSpan",
    "LangfuseGeneration",
    "LangfuseEvent",
    "LangfuseOtelSpanAttributes",
    "LangfuseAgent",
    "LangfuseTool",
    "LangfuseChain",
    "LangfuseEmbedding",
    "LangfuseEvaluator",
    "LangfuseRetriever",
    "LangfuseGuardrail",
    "Evaluation",
    "EvaluatorInputs",
    "MapperFunction",
    "CompositeEvaluatorFunction",
    "EvaluatorStats",
    "BatchEvaluationResumeToken",
    "BatchEvaluationResult",
    "is_default_export_span",
    "is_langfuse_span",
    "is_genai_span",
    "is_known_llm_instrumentor",
    "KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES",
    "experiment",
    "api",
]
class Langfuse:
    """Main client for Langfuse tracing and platform features.

    This class provides an interface for creating and managing traces, spans,
    and generations in Langfuse as well as interacting with the Langfuse API.

    The client features a thread-safe singleton pattern for each unique public API key,
    ensuring consistent trace context propagation across your application. It implements
    efficient batching of spans with configurable flush settings and includes background
    thread management for media uploads and score ingestion.

    Configuration is flexible through either direct parameters or environment variables,
    with graceful fallbacks and runtime configuration updates.

    Attributes:
        api: Synchronous API client for Langfuse backend communication
        async_api: Asynchronous API client for Langfuse backend communication
        _otel_tracer: Internal tracer instance managing OpenTelemetry components

    Parameters:
        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
        mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
        blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:
            ```python
            from langfuse.span_filter import is_default_export_span
            blocked = {"sqlite", "requests"}

            should_export_span = lambda span: (
                is_default_export_span(span)
                and (
                    span.instrumentation_scope is None
                    or span.instrumentation_scope.name not in blocked
                )
            )
            ```
        should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well.
        tracer_provider(Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.

    Example:
        ```python
        from langfuse import Langfuse

        # Initialize the client (reads from env vars if not provided)
        langfuse = Langfuse(
            public_key="your-public-key",
            secret_key="your-secret-key",
            base_url="https://cloud.langfuse.com",  # Optional, default shown
        )

        # Create a trace span
        with langfuse.start_as_current_observation(name="process-query") as span:
            # Your application code here

            # Create a nested generation span for an LLM call
            with span.start_as_current_generation(
                name="generate-response",
                model="gpt-4",
                input={"query": "Tell me about AI"},
                model_parameters={"temperature": 0.7, "max_tokens": 500}
            ) as generation:
                # Generate response here
                response = "AI is a field of computer science..."

                generation.update(
                    output=response,
                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
                    cost_details={"total_cost": 0.0023}
                )

                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
        ```
    """

    # Holder for API clients, tracer, and background workers; remains None
    # when the client is disabled (e.g. missing credentials).
    _resources: Optional[LangfuseResourceManager] = None
    # Optional masking function applied to trace data before export.
    _mask: Optional[MaskFunction] = None
    # OpenTelemetry tracer used for span creation; a NoOpTracer when disabled.
    _otel_tracer: otel_trace_api.Tracer
 226    def __init__(
 227        self,
 228        *,
 229        public_key: Optional[str] = None,
 230        secret_key: Optional[str] = None,
 231        base_url: Optional[str] = None,
 232        host: Optional[str] = None,
 233        timeout: Optional[int] = None,
 234        httpx_client: Optional[httpx.Client] = None,
 235        debug: bool = False,
 236        tracing_enabled: Optional[bool] = True,
 237        flush_at: Optional[int] = None,
 238        flush_interval: Optional[float] = None,
 239        environment: Optional[str] = None,
 240        release: Optional[str] = None,
 241        media_upload_thread_count: Optional[int] = None,
 242        sample_rate: Optional[float] = None,
 243        mask: Optional[MaskFunction] = None,
 244        blocked_instrumentation_scopes: Optional[List[str]] = None,
 245        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
 246        additional_headers: Optional[Dict[str, str]] = None,
 247        tracer_provider: Optional[TracerProvider] = None,
 248    ):
 249        self._base_url = (
 250            base_url
 251            or os.environ.get(LANGFUSE_BASE_URL)
 252            or host
 253            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
 254        )
 255        self._environment = environment or cast(
 256            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
 257        )
 258        self._release = (
 259            release
 260            or os.environ.get(LANGFUSE_RELEASE, None)
 261            or get_common_release_envs()
 262        )
 263        self._project_id: Optional[str] = None
 264        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
 265        if not 0.0 <= sample_rate <= 1.0:
 266            raise ValueError(
 267                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
 268            )
 269
 270        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
 271
 272        self._tracing_enabled = (
 273            tracing_enabled
 274            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
 275        )
 276        if not self._tracing_enabled:
 277            langfuse_logger.info(
 278                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
 279            )
 280
 281        debug = (
 282            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
 283        )
 284        if debug:
 285            logging.basicConfig(
 286                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 287            )
 288            langfuse_logger.setLevel(logging.DEBUG)
 289
 290        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
 291        if public_key is None:
 292            langfuse_logger.warning(
 293                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
 294                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
 295            )
 296            self._otel_tracer = otel_trace_api.NoOpTracer()
 297            return
 298
 299        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
 300        if secret_key is None:
 301            langfuse_logger.warning(
 302                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
 303                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
 304            )
 305            self._otel_tracer = otel_trace_api.NoOpTracer()
 306            return
 307
 308        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
 309            langfuse_logger.warning(
 310                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
 311            )
 312
 313        if blocked_instrumentation_scopes is not None:
 314            warnings.warn(
 315                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
 316                "Use `should_export_span` instead. Example: "
 317                "from langfuse.span_filter import is_default_export_span; "
 318                'blocked={"scope"}; should_export_span=lambda span: '
 319                "is_default_export_span(span) and (span.instrumentation_scope is None or "
 320                "span.instrumentation_scope.name not in blocked).",
 321                DeprecationWarning,
 322                stacklevel=2,
 323            )
 324
 325        # Initialize api and tracer if requirements are met
 326        self._resources = LangfuseResourceManager(
 327            public_key=public_key,
 328            secret_key=secret_key,
 329            base_url=self._base_url,
 330            timeout=timeout,
 331            environment=self._environment,
 332            release=release,
 333            flush_at=flush_at,
 334            flush_interval=flush_interval,
 335            httpx_client=httpx_client,
 336            media_upload_thread_count=media_upload_thread_count,
 337            sample_rate=sample_rate,
 338            mask=mask,
 339            tracing_enabled=self._tracing_enabled,
 340            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
 341            should_export_span=should_export_span,
 342            additional_headers=additional_headers,
 343            tracer_provider=tracer_provider,
 344        )
 345        self._mask = self._resources.mask
 346
 347        self._otel_tracer = (
 348            self._resources.tracer
 349            if self._tracing_enabled and self._resources.tracer is not None
 350            else otel_trace_api.NoOpTracer()
 351        )
 352        self.api = self._resources.api
 353        self.async_api = self._resources.async_api
 354
    # Typing overloads for start_observation: each `as_type` literal maps to
    # its concrete observation class so type checkers can narrow the return
    # type. Generation-like types ("generation", "embedding") additionally
    # accept model/usage/cost parameters; the runtime implementation follows
    # after the overloads.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseGeneration: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseSpan: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseAgent: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["tool"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseTool: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseChain: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseRetriever: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseEvaluator: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseEmbedding: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseGuardrail: ...
 502    def start_observation(
 503        self,
 504        *,
 505        trace_context: Optional[TraceContext] = None,
 506        name: str,
 507        as_type: ObservationTypeLiteralNoEvent = "span",
 508        input: Optional[Any] = None,
 509        output: Optional[Any] = None,
 510        metadata: Optional[Any] = None,
 511        version: Optional[str] = None,
 512        level: Optional[SpanLevel] = None,
 513        status_message: Optional[str] = None,
 514        completion_start_time: Optional[datetime] = None,
 515        model: Optional[str] = None,
 516        model_parameters: Optional[Dict[str, MapValue]] = None,
 517        usage_details: Optional[Dict[str, int]] = None,
 518        cost_details: Optional[Dict[str, float]] = None,
 519        prompt: Optional[PromptClient] = None,
 520    ) -> Union[
 521        LangfuseSpan,
 522        LangfuseGeneration,
 523        LangfuseAgent,
 524        LangfuseTool,
 525        LangfuseChain,
 526        LangfuseRetriever,
 527        LangfuseEvaluator,
 528        LangfuseEmbedding,
 529        LangfuseGuardrail,
 530    ]:
 531        """Create a new observation of the specified type.
 532
 533        This method creates a new observation but does not set it as the current span in the
 534        context. To create and use an observation within a context, use start_as_current_observation().
 535
 536        Args:
 537            trace_context: Optional context for connecting to an existing trace
 538            name: Name of the observation
 539            as_type: Type of observation to create (defaults to "span")
 540            input: Input data for the operation
 541            output: Output data from the operation
 542            metadata: Additional metadata to associate with the observation
 543            version: Version identifier for the code or component
 544            level: Importance level of the observation
 545            status_message: Optional status message for the observation
 546            completion_start_time: When the model started generating (for generation types)
 547            model: Name/identifier of the AI model used (for generation types)
 548            model_parameters: Parameters used for the model (for generation types)
 549            usage_details: Token usage information (for generation types)
 550            cost_details: Cost information (for generation types)
 551            prompt: Associated prompt template (for generation types)
 552
 553        Returns:
 554            An observation object of the appropriate type that must be ended with .end()
 555        """
 556        if trace_context:
 557            trace_id = trace_context.get("trace_id", None)
 558            parent_span_id = trace_context.get("parent_span_id", None)
 559
 560            if trace_id:
 561                remote_parent_span = self._create_remote_parent_span(
 562                    trace_id=trace_id, parent_span_id=parent_span_id
 563                )
 564
 565                with otel_trace_api.use_span(
 566                    cast(otel_trace_api.Span, remote_parent_span)
 567                ):
 568                    otel_span = self._otel_tracer.start_span(name=name)
 569                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
 570
 571                    return self._create_observation_from_otel_span(
 572                        otel_span=otel_span,
 573                        as_type=as_type,
 574                        input=input,
 575                        output=output,
 576                        metadata=metadata,
 577                        version=version,
 578                        level=level,
 579                        status_message=status_message,
 580                        completion_start_time=completion_start_time,
 581                        model=model,
 582                        model_parameters=model_parameters,
 583                        usage_details=usage_details,
 584                        cost_details=cost_details,
 585                        prompt=prompt,
 586                    )
 587
 588        otel_span = self._otel_tracer.start_span(name=name)
 589
 590        return self._create_observation_from_otel_span(
 591            otel_span=otel_span,
 592            as_type=as_type,
 593            input=input,
 594            output=output,
 595            metadata=metadata,
 596            version=version,
 597            level=level,
 598            status_message=status_message,
 599            completion_start_time=completion_start_time,
 600            model=model,
 601            model_parameters=model_parameters,
 602            usage_details=usage_details,
 603            cost_details=cost_details,
 604            prompt=prompt,
 605        )
 606
 607    def _create_observation_from_otel_span(
 608        self,
 609        *,
 610        otel_span: otel_trace_api.Span,
 611        as_type: ObservationTypeLiteralNoEvent,
 612        input: Optional[Any] = None,
 613        output: Optional[Any] = None,
 614        metadata: Optional[Any] = None,
 615        version: Optional[str] = None,
 616        level: Optional[SpanLevel] = None,
 617        status_message: Optional[str] = None,
 618        completion_start_time: Optional[datetime] = None,
 619        model: Optional[str] = None,
 620        model_parameters: Optional[Dict[str, MapValue]] = None,
 621        usage_details: Optional[Dict[str, int]] = None,
 622        cost_details: Optional[Dict[str, float]] = None,
 623        prompt: Optional[PromptClient] = None,
 624    ) -> Union[
 625        LangfuseSpan,
 626        LangfuseGeneration,
 627        LangfuseAgent,
 628        LangfuseTool,
 629        LangfuseChain,
 630        LangfuseRetriever,
 631        LangfuseEvaluator,
 632        LangfuseEmbedding,
 633        LangfuseGuardrail,
 634    ]:
 635        """Create the appropriate observation type from an OTEL span."""
 636        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 637            observation_class = self._get_span_class(as_type)
 638            # Type ignore to prevent overloads of internal _get_span_class function,
 639            # issue is that LangfuseEvent could be returned and that classes have diff. args
 640            return observation_class(  # type: ignore[return-value,call-arg]
 641                otel_span=otel_span,
 642                langfuse_client=self,
 643                environment=self._environment,
 644                release=self._release,
 645                input=input,
 646                output=output,
 647                metadata=metadata,
 648                version=version,
 649                level=level,
 650                status_message=status_message,
 651                completion_start_time=completion_start_time,
 652                model=model,
 653                model_parameters=model_parameters,
 654                usage_details=usage_details,
 655                cost_details=cost_details,
 656                prompt=prompt,
 657            )
 658        else:
 659            # For other types (e.g. span, guardrail), create appropriate class without generation properties
 660            observation_class = self._get_span_class(as_type)
 661            # Type ignore to prevent overloads of internal _get_span_class function,
 662            # issue is that LangfuseEvent could be returned and that classes have diff. args
 663            return observation_class(  # type: ignore[return-value,call-arg]
 664                otel_span=otel_span,
 665                langfuse_client=self,
 666                environment=self._environment,
 667                release=self._release,
 668                input=input,
 669                output=output,
 670                metadata=metadata,
 671                version=version,
 672                level=level,
 673                status_message=status_message,
 674            )
 675            # span._observation_type = as_type
 676            # span._otel_span.set_attribute("langfuse.observation.type", as_type)
 677            # return span
 678
 679    @overload
 680    def start_as_current_observation(
 681        self,
 682        *,
 683        trace_context: Optional[TraceContext] = None,
 684        name: str,
 685        as_type: Literal["generation"],
 686        input: Optional[Any] = None,
 687        output: Optional[Any] = None,
 688        metadata: Optional[Any] = None,
 689        version: Optional[str] = None,
 690        level: Optional[SpanLevel] = None,
 691        status_message: Optional[str] = None,
 692        completion_start_time: Optional[datetime] = None,
 693        model: Optional[str] = None,
 694        model_parameters: Optional[Dict[str, MapValue]] = None,
 695        usage_details: Optional[Dict[str, int]] = None,
 696        cost_details: Optional[Dict[str, float]] = None,
 697        prompt: Optional[PromptClient] = None,
 698        end_on_exit: Optional[bool] = None,
 699    ) -> _AgnosticContextManager[LangfuseGeneration]: ...
 700
 701    @overload
 702    def start_as_current_observation(
 703        self,
 704        *,
 705        trace_context: Optional[TraceContext] = None,
 706        name: str,
 707        as_type: Literal["span"] = "span",
 708        input: Optional[Any] = None,
 709        output: Optional[Any] = None,
 710        metadata: Optional[Any] = None,
 711        version: Optional[str] = None,
 712        level: Optional[SpanLevel] = None,
 713        status_message: Optional[str] = None,
 714        end_on_exit: Optional[bool] = None,
 715    ) -> _AgnosticContextManager[LangfuseSpan]: ...
 716
 717    @overload
 718    def start_as_current_observation(
 719        self,
 720        *,
 721        trace_context: Optional[TraceContext] = None,
 722        name: str,
 723        as_type: Literal["agent"],
 724        input: Optional[Any] = None,
 725        output: Optional[Any] = None,
 726        metadata: Optional[Any] = None,
 727        version: Optional[str] = None,
 728        level: Optional[SpanLevel] = None,
 729        status_message: Optional[str] = None,
 730        end_on_exit: Optional[bool] = None,
 731    ) -> _AgnosticContextManager[LangfuseAgent]: ...
 732
 733    @overload
 734    def start_as_current_observation(
 735        self,
 736        *,
 737        trace_context: Optional[TraceContext] = None,
 738        name: str,
 739        as_type: Literal["tool"],
 740        input: Optional[Any] = None,
 741        output: Optional[Any] = None,
 742        metadata: Optional[Any] = None,
 743        version: Optional[str] = None,
 744        level: Optional[SpanLevel] = None,
 745        status_message: Optional[str] = None,
 746        end_on_exit: Optional[bool] = None,
 747    ) -> _AgnosticContextManager[LangfuseTool]: ...
 748
 749    @overload
 750    def start_as_current_observation(
 751        self,
 752        *,
 753        trace_context: Optional[TraceContext] = None,
 754        name: str,
 755        as_type: Literal["chain"],
 756        input: Optional[Any] = None,
 757        output: Optional[Any] = None,
 758        metadata: Optional[Any] = None,
 759        version: Optional[str] = None,
 760        level: Optional[SpanLevel] = None,
 761        status_message: Optional[str] = None,
 762        end_on_exit: Optional[bool] = None,
 763    ) -> _AgnosticContextManager[LangfuseChain]: ...
 764
    # Overload: as_type="retriever" — the returned context manager yields a LangfuseRetriever.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseRetriever]: ...
 780
    # Overload: as_type="evaluator" — the returned context manager yields a LangfuseEvaluator.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEvaluator]: ...
 796
    # Overload: as_type="embedding" — generation-like, so it additionally accepts
    # model/usage/cost parameters; yields a LangfuseEmbedding.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEmbedding]: ...
 818
    # Overload: as_type="guardrail" — the returned context manager yields a LangfuseGuardrail.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGuardrail]: ...
 834
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: ObservationTypeLiteralNoEvent = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> Union[
        _AgnosticContextManager[LangfuseGeneration],
        _AgnosticContextManager[LangfuseSpan],
        _AgnosticContextManager[LangfuseAgent],
        _AgnosticContextManager[LangfuseTool],
        _AgnosticContextManager[LangfuseChain],
        _AgnosticContextManager[LangfuseRetriever],
        _AgnosticContextManager[LangfuseEvaluator],
        _AgnosticContextManager[LangfuseEmbedding],
        _AgnosticContextManager[LangfuseGuardrail],
    ]:
        """Create a new observation and set it as the current span in a context manager.

        This method creates a new observation of the specified type and sets it as the
        current span within a context manager. Use this method with a 'with' statement to
        automatically handle the observation lifecycle within a code block.

        The created observation will be the child of the current span in the context.

        Args:
            trace_context: Optional context for connecting to an existing trace
            name: Name of the observation (e.g., function or operation name)
            as_type: Type of observation to create (defaults to "span")
            input: Input data for the operation (can be any JSON-serializable object)
            output: Output data from the operation (can be any JSON-serializable object)
            metadata: Additional metadata to associate with the observation
            version: Version identifier for the code or component
            level: Importance level of the observation (info, warning, error)
            status_message: Optional status message for the observation
            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.

            The following parameters are available when as_type is: "generation" or "embedding".
            completion_start_time: When the model started generating the response
            model: Name/identifier of the AI model used (e.g., "gpt-4")
            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
            cost_details: Cost information for the model call
            prompt: Associated prompt template from Langfuse prompt management

        Returns:
            A context manager that yields the appropriate observation type based on as_type

        Example:
            ```python
            # Create a span
            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
                # Do work
                result = process_data()
                span.update(output=result)

                # Create a child span automatically
                with span.start_as_current_observation(name="sub-operation") as child_span:
                    # Do sub-operation work
                    child_span.update(output="sub-result")

            # Create a tool observation
            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
                # Do tool work
                results = search_web(query)
                tool.update(output=results)

            # Create a generation observation
            with langfuse.start_as_current_observation(
                name="answer-generation",
                as_type="generation",
                model="gpt-4"
            ) as generation:
                # Generate answer
                response = llm.generate(...)
                generation.update(output=response)
            ```
        """
        # Dispatch branch 1: generation-like types carry the model/usage/cost
        # parameters through to the created observation.
        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
            if trace_context:
                trace_id = trace_context.get("trace_id", None)
                parent_span_id = trace_context.get("parent_span_id", None)

                if trace_id:
                    # An explicit trace context links the new observation to an
                    # existing trace via a synthesized remote parent span.
                    remote_parent_span = self._create_remote_parent_span(
                        trace_id=trace_id, parent_span_id=parent_span_id
                    )

                    return cast(
                        Union[
                            _AgnosticContextManager[LangfuseGeneration],
                            _AgnosticContextManager[LangfuseEmbedding],
                        ],
                        self._create_span_with_parent_context(
                            as_type=as_type,
                            name=name,
                            remote_parent_span=remote_parent_span,
                            parent=None,
                            end_on_exit=end_on_exit,
                            input=input,
                            output=output,
                            metadata=metadata,
                            version=version,
                            level=level,
                            status_message=status_message,
                            completion_start_time=completion_start_time,
                            model=model,
                            model_parameters=model_parameters,
                            usage_details=usage_details,
                            cost_details=cost_details,
                            prompt=prompt,
                        ),
                    )

            # No usable trace context: start as child of the current context span.
            return cast(
                Union[
                    _AgnosticContextManager[LangfuseGeneration],
                    _AgnosticContextManager[LangfuseEmbedding],
                ],
                self._start_as_current_otel_span_with_processed_media(
                    as_type=as_type,
                    name=name,
                    end_on_exit=end_on_exit,
                    input=input,
                    output=output,
                    metadata=metadata,
                    version=version,
                    level=level,
                    status_message=status_message,
                    completion_start_time=completion_start_time,
                    model=model,
                    model_parameters=model_parameters,
                    usage_details=usage_details,
                    cost_details=cost_details,
                    prompt=prompt,
                ),
            )

        # Dispatch branch 2: span-like types (span, agent, tool, chain,
        # retriever, evaluator, guardrail) take no generation-specific fields,
        # so those parameters are silently dropped here.
        if as_type in get_observation_types_list(ObservationTypeSpanLike):
            if trace_context:
                trace_id = trace_context.get("trace_id", None)
                parent_span_id = trace_context.get("parent_span_id", None)

                if trace_id:
                    remote_parent_span = self._create_remote_parent_span(
                        trace_id=trace_id, parent_span_id=parent_span_id
                    )

                    return cast(
                        Union[
                            _AgnosticContextManager[LangfuseSpan],
                            _AgnosticContextManager[LangfuseAgent],
                            _AgnosticContextManager[LangfuseTool],
                            _AgnosticContextManager[LangfuseChain],
                            _AgnosticContextManager[LangfuseRetriever],
                            _AgnosticContextManager[LangfuseEvaluator],
                            _AgnosticContextManager[LangfuseGuardrail],
                        ],
                        self._create_span_with_parent_context(
                            as_type=as_type,
                            name=name,
                            remote_parent_span=remote_parent_span,
                            parent=None,
                            end_on_exit=end_on_exit,
                            input=input,
                            output=output,
                            metadata=metadata,
                            version=version,
                            level=level,
                            status_message=status_message,
                        ),
                    )

            return cast(
                Union[
                    _AgnosticContextManager[LangfuseSpan],
                    _AgnosticContextManager[LangfuseAgent],
                    _AgnosticContextManager[LangfuseTool],
                    _AgnosticContextManager[LangfuseChain],
                    _AgnosticContextManager[LangfuseRetriever],
                    _AgnosticContextManager[LangfuseEvaluator],
                    _AgnosticContextManager[LangfuseGuardrail],
                ],
                self._start_as_current_otel_span_with_processed_media(
                    as_type=as_type,
                    name=name,
                    end_on_exit=end_on_exit,
                    input=input,
                    output=output,
                    metadata=metadata,
                    version=version,
                    level=level,
                    status_message=status_message,
                ),
            )

        # This should never be reached since all valid types are handled above
        langfuse_logger.warning(
            f"Unknown observation type: {as_type}, falling back to span"
        )
        return self._start_as_current_otel_span_with_processed_media(
            as_type="span",
            name=name,
            end_on_exit=end_on_exit,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
        )
1059
1060    def _get_span_class(
1061        self,
1062        as_type: ObservationTypeLiteral,
1063    ) -> Union[
1064        Type[LangfuseAgent],
1065        Type[LangfuseTool],
1066        Type[LangfuseChain],
1067        Type[LangfuseRetriever],
1068        Type[LangfuseEvaluator],
1069        Type[LangfuseEmbedding],
1070        Type[LangfuseGuardrail],
1071        Type[LangfuseGeneration],
1072        Type[LangfuseEvent],
1073        Type[LangfuseSpan],
1074    ]:
1075        """Get the appropriate span class based on as_type."""
1076        normalized_type = as_type.lower()
1077
1078        if normalized_type == "agent":
1079            return LangfuseAgent
1080        elif normalized_type == "tool":
1081            return LangfuseTool
1082        elif normalized_type == "chain":
1083            return LangfuseChain
1084        elif normalized_type == "retriever":
1085            return LangfuseRetriever
1086        elif normalized_type == "evaluator":
1087            return LangfuseEvaluator
1088        elif normalized_type == "embedding":
1089            return LangfuseEmbedding
1090        elif normalized_type == "guardrail":
1091            return LangfuseGuardrail
1092        elif normalized_type == "generation":
1093            return LangfuseGeneration
1094        elif normalized_type == "event":
1095            return LangfuseEvent
1096        elif normalized_type == "span":
1097            return LangfuseSpan
1098        else:
1099            return LangfuseSpan
1100
    @_agnosticcontextmanager
    def _create_span_with_parent_context(
        self,
        *,
        name: str,
        parent: Optional[otel_trace_api.Span] = None,
        remote_parent_span: Optional[otel_trace_api.Span] = None,
        as_type: ObservationTypeLiteralNoEvent,
        end_on_exit: Optional[bool] = None,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> Any:
        """Start an observation under an explicit parent span context.

        Callers pass either a local ``parent`` span or a ``remote_parent_span``
        (built from a trace context); ``parent`` wins if both are given. The
        new observation is created inside that parent's context and yielded.
        When the parent is remote, the observation is flagged with AS_ROOT.
        """
        parent_span = parent or cast(otel_trace_api.Span, remote_parent_span)

        with otel_trace_api.use_span(parent_span):
            with self._start_as_current_otel_span_with_processed_media(
                name=name,
                as_type=as_type,
                end_on_exit=end_on_exit,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
                completion_start_time=completion_start_time,
                model=model,
                model_parameters=model_parameters,
                usage_details=usage_details,
                cost_details=cost_details,
                prompt=prompt,
            ) as langfuse_span:
                if remote_parent_span is not None:
                    # Mark the new span as the root observation of its trace;
                    # the remote parent only carries the trace/span IDs taken
                    # from the caller-supplied trace context.
                    langfuse_span._otel_span.set_attribute(
                        LangfuseOtelSpanAttributes.AS_ROOT, True
                    )

                yield langfuse_span
1149
1150    @_agnosticcontextmanager
1151    def _start_as_current_otel_span_with_processed_media(
1152        self,
1153        *,
1154        name: str,
1155        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
1156        end_on_exit: Optional[bool] = None,
1157        input: Optional[Any] = None,
1158        output: Optional[Any] = None,
1159        metadata: Optional[Any] = None,
1160        version: Optional[str] = None,
1161        level: Optional[SpanLevel] = None,
1162        status_message: Optional[str] = None,
1163        completion_start_time: Optional[datetime] = None,
1164        model: Optional[str] = None,
1165        model_parameters: Optional[Dict[str, MapValue]] = None,
1166        usage_details: Optional[Dict[str, int]] = None,
1167        cost_details: Optional[Dict[str, float]] = None,
1168        prompt: Optional[PromptClient] = None,
1169    ) -> Any:
1170        with self._otel_tracer.start_as_current_span(
1171            name=name,
1172            end_on_exit=end_on_exit if end_on_exit is not None else True,
1173        ) as otel_span:
1174            span_class = self._get_span_class(
1175                as_type or "generation"
1176            )  # default was "generation"
1177            common_args = {
1178                "otel_span": otel_span,
1179                "langfuse_client": self,
1180                "environment": self._environment,
1181                "release": self._release,
1182                "input": input,
1183                "output": output,
1184                "metadata": metadata,
1185                "version": version,
1186                "level": level,
1187                "status_message": status_message,
1188            }
1189
1190            if span_class in [
1191                LangfuseGeneration,
1192                LangfuseEmbedding,
1193            ]:
1194                common_args.update(
1195                    {
1196                        "completion_start_time": completion_start_time,
1197                        "model": model,
1198                        "model_parameters": model_parameters,
1199                        "usage_details": usage_details,
1200                        "cost_details": cost_details,
1201                        "prompt": prompt,
1202                    }
1203                )
1204            # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed
1205
1206            yield span_class(**common_args)  # type: ignore[arg-type]
1207
1208    def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]:
1209        current_span = otel_trace_api.get_current_span()
1210
1211        if current_span is otel_trace_api.INVALID_SPAN:
1212            langfuse_logger.warning(
1213                "Context error: No active span in current context. Operations that depend on an active span will be skipped. "
1214                "Ensure spans are created with start_as_current_observation() or that you're operating within an active span context."
1215            )
1216            return None
1217
1218        return current_span
1219
1220    def update_current_generation(
1221        self,
1222        *,
1223        name: Optional[str] = None,
1224        input: Optional[Any] = None,
1225        output: Optional[Any] = None,
1226        metadata: Optional[Any] = None,
1227        version: Optional[str] = None,
1228        level: Optional[SpanLevel] = None,
1229        status_message: Optional[str] = None,
1230        completion_start_time: Optional[datetime] = None,
1231        model: Optional[str] = None,
1232        model_parameters: Optional[Dict[str, MapValue]] = None,
1233        usage_details: Optional[Dict[str, int]] = None,
1234        cost_details: Optional[Dict[str, float]] = None,
1235        prompt: Optional[PromptClient] = None,
1236    ) -> None:
1237        """Update the current active generation span with new information.
1238
1239        This method updates the current generation span in the active context with
1240        additional information. It's useful for adding output, usage stats, or other
1241        details that become available during or after model generation.
1242
1243        Args:
1244            name: The generation name
1245            input: Updated input data for the model
1246            output: Output from the model (e.g., completions)
1247            metadata: Additional metadata to associate with the generation
1248            version: Version identifier for the model or component
1249            level: Importance level of the generation (info, warning, error)
1250            status_message: Optional status message for the generation
1251            completion_start_time: When the model started generating the response
1252            model: Name/identifier of the AI model used (e.g., "gpt-4")
1253            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1254            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1255            cost_details: Cost information for the model call
1256            prompt: Associated prompt template from Langfuse prompt management
1257
1258        Example:
1259            ```python
1260            with langfuse.start_as_current_generation(name="answer-query") as generation:
1261                # Initial setup and API call
1262                response = llm.generate(...)
1263
1264                # Update with results that weren't available at creation time
1265                langfuse.update_current_generation(
1266                    output=response.text,
1267                    usage_details={
1268                        "prompt_tokens": response.usage.prompt_tokens,
1269                        "completion_tokens": response.usage.completion_tokens
1270                    }
1271                )
1272            ```
1273        """
1274        if not self._tracing_enabled:
1275            langfuse_logger.debug(
1276                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1277            )
1278            return
1279
1280        current_otel_span = self._get_current_otel_span()
1281
1282        if current_otel_span is not None:
1283            generation = LangfuseGeneration(
1284                otel_span=current_otel_span, langfuse_client=self
1285            )
1286
1287            if name:
1288                current_otel_span.update_name(name)
1289
1290            generation.update(
1291                input=input,
1292                output=output,
1293                metadata=metadata,
1294                version=version,
1295                level=level,
1296                status_message=status_message,
1297                completion_start_time=completion_start_time,
1298                model=model,
1299                model_parameters=model_parameters,
1300                usage_details=usage_details,
1301                cost_details=cost_details,
1302                prompt=prompt,
1303            )
1304
1305    def update_current_span(
1306        self,
1307        *,
1308        name: Optional[str] = None,
1309        input: Optional[Any] = None,
1310        output: Optional[Any] = None,
1311        metadata: Optional[Any] = None,
1312        version: Optional[str] = None,
1313        level: Optional[SpanLevel] = None,
1314        status_message: Optional[str] = None,
1315    ) -> None:
1316        """Update the current active span with new information.
1317
1318        This method updates the current span in the active context with
1319        additional information. It's useful for adding outputs or metadata
1320        that become available during execution.
1321
1322        Args:
1323            name: The span name
1324            input: Updated input data for the operation
1325            output: Output data from the operation
1326            metadata: Additional metadata to associate with the span
1327            version: Version identifier for the code or component
1328            level: Importance level of the span (info, warning, error)
1329            status_message: Optional status message for the span
1330
1331        Example:
1332            ```python
1333            with langfuse.start_as_current_observation(name="process-data") as span:
1334                # Initial processing
1335                result = process_first_part()
1336
1337                # Update with intermediate results
1338                langfuse.update_current_span(metadata={"intermediate_result": result})
1339
1340                # Continue processing
1341                final_result = process_second_part(result)
1342
1343                # Final update
1344                langfuse.update_current_span(output=final_result)
1345            ```
1346        """
1347        if not self._tracing_enabled:
1348            langfuse_logger.debug(
1349                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1350            )
1351            return
1352
1353        current_otel_span = self._get_current_otel_span()
1354
1355        if current_otel_span is not None:
1356            span = LangfuseSpan(
1357                otel_span=current_otel_span,
1358                langfuse_client=self,
1359                environment=self._environment,
1360                release=self._release,
1361            )
1362
1363            if name:
1364                current_otel_span.update_name(name)
1365
1366            span.update(
1367                input=input,
1368                output=output,
1369                metadata=metadata,
1370                version=version,
1371                level=level,
1372                status_message=status_message,
1373            )
1374
1375    @deprecated(
1376        "Trace-level input/output is deprecated. "
1377        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
1378        "This method will be removed in a future major version."
1379    )
1380    def set_current_trace_io(
1381        self,
1382        *,
1383        input: Optional[Any] = None,
1384        output: Optional[Any] = None,
1385    ) -> None:
1386        """Set trace-level input and output for the current span's trace.
1387
1388        .. deprecated::
1389            This is a legacy method for backward compatibility with Langfuse platform
1390            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
1391            evaluators). It will be removed in a future major version.
1392
1393            For setting other trace attributes (user_id, session_id, metadata, tags, version),
1394            use :meth:`propagate_attributes` instead.
1395
1396        Args:
1397            input: Input data to associate with the trace.
1398            output: Output data to associate with the trace.
1399        """
1400        if not self._tracing_enabled:
1401            langfuse_logger.debug(
1402                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
1403            )
1404            return
1405
1406        current_otel_span = self._get_current_otel_span()
1407
1408        if current_otel_span is not None and current_otel_span.is_recording():
1409            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1410                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1411            )
1412            # We need to preserve the class to keep the correct observation type
1413            span_class = self._get_span_class(existing_observation_type)
1414            span = span_class(
1415                otel_span=current_otel_span,
1416                langfuse_client=self,
1417                environment=self._environment,
1418                release=self._release,
1419            )
1420
1421            span.set_trace_io(
1422                input=input,
1423                output=output,
1424            )
1425
1426    def set_current_trace_as_public(self) -> None:
1427        """Make the current trace publicly accessible via its URL.
1428
1429        When a trace is published, anyone with the trace link can view the full trace
1430        without needing to be logged in to Langfuse. This action cannot be undone
1431        programmatically - once published, the entire trace becomes public.
1432
1433        This is a convenience method that publishes the trace from the currently
1434        active span context. Use this when you want to make a trace public from
1435        within a traced function without needing direct access to the span object.
1436        """
1437        if not self._tracing_enabled:
1438            langfuse_logger.debug(
1439                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
1440            )
1441            return
1442
1443        current_otel_span = self._get_current_otel_span()
1444
1445        if current_otel_span is not None and current_otel_span.is_recording():
1446            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1447                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1448            )
1449            # We need to preserve the class to keep the correct observation type
1450            span_class = self._get_span_class(existing_observation_type)
1451            span = span_class(
1452                otel_span=current_otel_span,
1453                langfuse_client=self,
1454                environment=self._environment,
1455            )
1456
1457            span.set_trace_as_public()
1458
1459    def create_event(
1460        self,
1461        *,
1462        trace_context: Optional[TraceContext] = None,
1463        name: str,
1464        input: Optional[Any] = None,
1465        output: Optional[Any] = None,
1466        metadata: Optional[Any] = None,
1467        version: Optional[str] = None,
1468        level: Optional[SpanLevel] = None,
1469        status_message: Optional[str] = None,
1470    ) -> LangfuseEvent:
1471        """Create a new Langfuse observation of type 'EVENT'.
1472
1473        The created Langfuse Event observation will be the child of the current span in the context.
1474
1475        Args:
1476            trace_context: Optional context for connecting to an existing trace
1477            name: Name of the span (e.g., function or operation name)
1478            input: Input data for the operation (can be any JSON-serializable object)
1479            output: Output data from the operation (can be any JSON-serializable object)
1480            metadata: Additional metadata to associate with the span
1481            version: Version identifier for the code or component
1482            level: Importance level of the span (info, warning, error)
1483            status_message: Optional status message for the span
1484
1485        Returns:
1486            The Langfuse Event object
1487
1488        Example:
1489            ```python
1490            event = langfuse.create_event(name="process-event")
1491            ```
1492        """
1493        timestamp = time_ns()
1494
1495        if trace_context:
1496            trace_id = trace_context.get("trace_id", None)
1497            parent_span_id = trace_context.get("parent_span_id", None)
1498
1499            if trace_id:
1500                remote_parent_span = self._create_remote_parent_span(
1501                    trace_id=trace_id, parent_span_id=parent_span_id
1502                )
1503
1504                with otel_trace_api.use_span(
1505                    cast(otel_trace_api.Span, remote_parent_span)
1506                ):
1507                    otel_span = self._otel_tracer.start_span(
1508                        name=name, start_time=timestamp
1509                    )
1510                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1511
1512                    return cast(
1513                        LangfuseEvent,
1514                        LangfuseEvent(
1515                            otel_span=otel_span,
1516                            langfuse_client=self,
1517                            environment=self._environment,
1518                            release=self._release,
1519                            input=input,
1520                            output=output,
1521                            metadata=metadata,
1522                            version=version,
1523                            level=level,
1524                            status_message=status_message,
1525                        ).end(end_time=timestamp),
1526                    )
1527
1528        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1529
1530        return cast(
1531            LangfuseEvent,
1532            LangfuseEvent(
1533                otel_span=otel_span,
1534                langfuse_client=self,
1535                environment=self._environment,
1536                release=self._release,
1537                input=input,
1538                output=output,
1539                metadata=metadata,
1540                version=version,
1541                level=level,
1542                status_message=status_message,
1543            ).end(end_time=timestamp),
1544        )
1545
1546    def _create_remote_parent_span(
1547        self, *, trace_id: str, parent_span_id: Optional[str]
1548    ) -> Any:
1549        if not self._is_valid_trace_id(trace_id):
1550            langfuse_logger.warning(
1551                f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID."
1552            )
1553
1554        if parent_span_id and not self._is_valid_span_id(parent_span_id):
1555            langfuse_logger.warning(
1556                f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID."
1557            )
1558
1559        int_trace_id = int(trace_id, 16)
1560        int_parent_span_id = (
1561            int(parent_span_id, 16)
1562            if parent_span_id
1563            else RandomIdGenerator().generate_span_id()
1564        )
1565
1566        span_context = otel_trace_api.SpanContext(
1567            trace_id=int_trace_id,
1568            span_id=int_parent_span_id,
1569            trace_flags=otel_trace_api.TraceFlags(0x01),  # mark span as sampled
1570            is_remote=False,
1571        )
1572
1573        return otel_trace_api.NonRecordingSpan(span_context)
1574
1575    def _is_valid_trace_id(self, trace_id: str) -> bool:
1576        pattern = r"^[0-9a-f]{32}$"
1577
1578        return bool(re.match(pattern, trace_id))
1579
1580    def _is_valid_span_id(self, span_id: str) -> bool:
1581        pattern = r"^[0-9a-f]{16}$"
1582
1583        return bool(re.match(pattern, span_id))
1584
1585    def _create_observation_id(self, *, seed: Optional[str] = None) -> str:
1586        """Create a unique observation ID for use with Langfuse.
1587
1588        This method generates a unique observation ID (span ID in OpenTelemetry terms)
1589        for use with various Langfuse APIs. It can either generate a random ID or
1590        create a deterministic ID based on a seed string.
1591
1592        Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes.
1593        This method ensures the generated ID meets this requirement. If you need to
1594        correlate an external ID with a Langfuse observation ID, use the external ID as
1595        the seed to get a valid, deterministic observation ID.
1596
1597        Args:
1598            seed: Optional string to use as a seed for deterministic ID generation.
1599                 If provided, the same seed will always produce the same ID.
1600                 If not provided, a random ID will be generated.
1601
1602        Returns:
1603            A 16-character lowercase hexadecimal string representing the observation ID.
1604
1605        Example:
1606            ```python
1607            # Generate a random observation ID
1608            obs_id = langfuse.create_observation_id()
1609
1610            # Generate a deterministic ID based on a seed
1611            user_obs_id = langfuse.create_observation_id(seed="user-123-feedback")
1612
1613            # Correlate an external item ID with a Langfuse observation ID
1614            item_id = "item-789012"
1615            correlated_obs_id = langfuse.create_observation_id(seed=item_id)
1616
1617            # Use the ID with Langfuse APIs
1618            langfuse.create_score(
1619                name="relevance",
1620                value=0.95,
1621                trace_id=trace_id,
1622                observation_id=obs_id
1623            )
1624            ```
1625        """
1626        if not seed:
1627            span_id_int = RandomIdGenerator().generate_span_id()
1628
1629            return self._format_otel_span_id(span_id_int)
1630
1631        return sha256(seed.encode("utf-8")).digest()[:8].hex()
1632
1633    @staticmethod
1634    def create_trace_id(*, seed: Optional[str] = None) -> str:
1635        """Create a unique trace ID for use with Langfuse.
1636
1637        This method generates a unique trace ID for use with various Langfuse APIs.
1638        It can either generate a random ID or create a deterministic ID based on
1639        a seed string.
1640
1641        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1642        This method ensures the generated ID meets this requirement. If you need to
1643        correlate an external ID with a Langfuse trace ID, use the external ID as the
1644        seed to get a valid, deterministic Langfuse trace ID.
1645
1646        Args:
1647            seed: Optional string to use as a seed for deterministic ID generation.
1648                 If provided, the same seed will always produce the same ID.
1649                 If not provided, a random ID will be generated.
1650
1651        Returns:
1652            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1653
1654        Example:
1655            ```python
1656            # Generate a random trace ID
1657            trace_id = langfuse.create_trace_id()
1658
1659            # Generate a deterministic ID based on a seed
1660            session_trace_id = langfuse.create_trace_id(seed="session-456")
1661
1662            # Correlate an external ID with a Langfuse trace ID
1663            external_id = "external-system-123456"
1664            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1665
1666            # Use the ID with trace context
1667            with langfuse.start_as_current_observation(
1668                name="process-request",
1669                trace_context={"trace_id": trace_id}
1670            ) as span:
1671                # Operation will be part of the specific trace
1672                pass
1673            ```
1674        """
1675        if not seed:
1676            trace_id_int = RandomIdGenerator().generate_trace_id()
1677
1678            return Langfuse._format_otel_trace_id(trace_id_int)
1679
1680        return sha256(seed.encode("utf-8")).digest()[:16].hex()
1681
1682    def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str:
1683        span_context = otel_span.get_span_context()
1684
1685        return self._format_otel_trace_id(span_context.trace_id)
1686
1687    def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str:
1688        span_context = otel_span.get_span_context()
1689
1690        return self._format_otel_span_id(span_context.span_id)
1691
1692    @staticmethod
1693    def _format_otel_span_id(span_id_int: int) -> str:
1694        """Format an integer span ID to a 16-character lowercase hex string.
1695
1696        Internal method to convert an OpenTelemetry integer span ID to the standard
1697        W3C Trace Context format (16-character lowercase hex string).
1698
1699        Args:
1700            span_id_int: 64-bit integer representing a span ID
1701
1702        Returns:
1703            A 16-character lowercase hexadecimal string
1704        """
1705        return format(span_id_int, "016x")
1706
1707    @staticmethod
1708    def _format_otel_trace_id(trace_id_int: int) -> str:
1709        """Format an integer trace ID to a 32-character lowercase hex string.
1710
1711        Internal method to convert an OpenTelemetry integer trace ID to the standard
1712        W3C Trace Context format (32-character lowercase hex string).
1713
1714        Args:
1715            trace_id_int: 128-bit integer representing a trace ID
1716
1717        Returns:
1718            A 32-character lowercase hexadecimal string
1719        """
1720        return format(trace_id_int, "032x")
1721
1722    @overload
1723    def create_score(
1724        self,
1725        *,
1726        name: str,
1727        value: float,
1728        session_id: Optional[str] = None,
1729        dataset_run_id: Optional[str] = None,
1730        trace_id: Optional[str] = None,
1731        observation_id: Optional[str] = None,
1732        score_id: Optional[str] = None,
1733        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1734        comment: Optional[str] = None,
1735        config_id: Optional[str] = None,
1736        metadata: Optional[Any] = None,
1737        timestamp: Optional[datetime] = None,
1738    ) -> None: ...
1739
1740    @overload
1741    def create_score(
1742        self,
1743        *,
1744        name: str,
1745        value: str,
1746        session_id: Optional[str] = None,
1747        dataset_run_id: Optional[str] = None,
1748        trace_id: Optional[str] = None,
1749        score_id: Optional[str] = None,
1750        observation_id: Optional[str] = None,
1751        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
1752        comment: Optional[str] = None,
1753        config_id: Optional[str] = None,
1754        metadata: Optional[Any] = None,
1755        timestamp: Optional[datetime] = None,
1756    ) -> None: ...
1757
1758    def create_score(
1759        self,
1760        *,
1761        name: str,
1762        value: Union[float, str],
1763        session_id: Optional[str] = None,
1764        dataset_run_id: Optional[str] = None,
1765        trace_id: Optional[str] = None,
1766        observation_id: Optional[str] = None,
1767        score_id: Optional[str] = None,
1768        data_type: Optional[ScoreDataType] = None,
1769        comment: Optional[str] = None,
1770        config_id: Optional[str] = None,
1771        metadata: Optional[Any] = None,
1772        timestamp: Optional[datetime] = None,
1773    ) -> None:
1774        """Create a score for a specific trace or observation.
1775
1776        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
1777        used to track quality metrics, user feedback, or automated evaluations.
1778
1779        Args:
1780            name: Name of the score (e.g., "relevance", "accuracy")
1781            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
1782            session_id: ID of the Langfuse session to associate the score with
1783            dataset_run_id: ID of the Langfuse dataset run to associate the score with
1784            trace_id: ID of the Langfuse trace to associate the score with
1785            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
1786            score_id: Optional custom ID for the score (auto-generated if not provided)
1787            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
1788            comment: Optional comment or explanation for the score
1789            config_id: Optional ID of a score config defined in Langfuse
1790            metadata: Optional metadata to be attached to the score
1791            timestamp: Optional timestamp for the score (defaults to current UTC time)
1792
1793        Example:
1794            ```python
1795            # Create a numeric score for accuracy
1796            langfuse.create_score(
1797                name="accuracy",
1798                value=0.92,
1799                trace_id="abcdef1234567890abcdef1234567890",
1800                data_type="NUMERIC",
1801                comment="High accuracy with minor irrelevant details"
1802            )
1803
1804            # Create a categorical score for sentiment
1805            langfuse.create_score(
1806                name="sentiment",
1807                value="positive",
1808                trace_id="abcdef1234567890abcdef1234567890",
1809                observation_id="abcdef1234567890",
1810                data_type="CATEGORICAL"
1811            )
1812            ```
1813        """
1814        if not self._tracing_enabled:
1815            return
1816
1817        score_id = score_id or self._create_observation_id()
1818
1819        try:
1820            new_body = ScoreBody(
1821                id=score_id,
1822                session_id=session_id,
1823                datasetRunId=dataset_run_id,
1824                traceId=trace_id,
1825                observationId=observation_id,
1826                name=name,
1827                value=value,
1828                dataType=data_type,  # type: ignore
1829                comment=comment,
1830                configId=config_id,
1831                environment=self._environment,
1832                metadata=metadata,
1833            )
1834
1835            event = {
1836                "id": self.create_trace_id(),
1837                "type": "score-create",
1838                "timestamp": timestamp or _get_timestamp(),
1839                "body": new_body,
1840            }
1841
1842            if self._resources is not None:
1843                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
1844                force_sample = (
1845                    not self._is_valid_trace_id(trace_id) if trace_id else True
1846                )
1847
1848                self._resources.add_score_task(
1849                    event,
1850                    force_sample=force_sample,
1851                )
1852
1853        except Exception as e:
1854            langfuse_logger.exception(
1855                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
1856            )
1857
1858    def _create_trace_tags_via_ingestion(
1859        self,
1860        *,
1861        trace_id: str,
1862        tags: List[str],
1863    ) -> None:
1864        """Private helper to enqueue trace tag updates via ingestion API events."""
1865        if not self._tracing_enabled:
1866            return
1867
1868        if len(tags) == 0:
1869            return
1870
1871        try:
1872            new_body = TraceBody(
1873                id=trace_id,
1874                tags=tags,
1875            )
1876
1877            event = {
1878                "id": self.create_trace_id(),
1879                "type": "trace-create",
1880                "timestamp": _get_timestamp(),
1881                "body": new_body,
1882            }
1883
1884            if self._resources is not None:
1885                self._resources.add_trace_task(event)
1886        except Exception as e:
1887            langfuse_logger.exception(
1888                f"Error updating trace tags: Failed to process trace update event for trace_id={trace_id}. Error: {e}"
1889            )
1890
1891    @overload
1892    def score_current_span(
1893        self,
1894        *,
1895        name: str,
1896        value: float,
1897        score_id: Optional[str] = None,
1898        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1899        comment: Optional[str] = None,
1900        config_id: Optional[str] = None,
1901        metadata: Optional[Any] = None,
1902    ) -> None: ...
1903
1904    @overload
1905    def score_current_span(
1906        self,
1907        *,
1908        name: str,
1909        value: str,
1910        score_id: Optional[str] = None,
1911        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
1912        comment: Optional[str] = None,
1913        config_id: Optional[str] = None,
1914        metadata: Optional[Any] = None,
1915    ) -> None: ...
1916
1917    def score_current_span(
1918        self,
1919        *,
1920        name: str,
1921        value: Union[float, str],
1922        score_id: Optional[str] = None,
1923        data_type: Optional[ScoreDataType] = None,
1924        comment: Optional[str] = None,
1925        config_id: Optional[str] = None,
1926        metadata: Optional[Any] = None,
1927    ) -> None:
1928        """Create a score for the current active span.
1929
1930        This method scores the currently active span in the context. It's a convenient
1931        way to score the current operation without needing to know its trace and span IDs.
1932
1933        Args:
1934            name: Name of the score (e.g., "relevance", "accuracy")
1935            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
1936            score_id: Optional custom ID for the score (auto-generated if not provided)
1937            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
1938            comment: Optional comment or explanation for the score
1939            config_id: Optional ID of a score config defined in Langfuse
1940            metadata: Optional metadata to be attached to the score
1941
1942        Example:
1943            ```python
1944            with langfuse.start_as_current_generation(name="answer-query") as generation:
1945                # Generate answer
1946                response = generate_answer(...)
1947                generation.update(output=response)
1948
1949                # Score the generation
1950                langfuse.score_current_span(
1951                    name="relevance",
1952                    value=0.85,
1953                    data_type="NUMERIC",
1954                    comment="Mostly relevant but contains some tangential information",
1955                    metadata={"model": "gpt-4", "prompt_version": "v2"}
1956                )
1957            ```
1958        """
1959        current_span = self._get_current_otel_span()
1960
1961        if current_span is not None:
1962            trace_id = self._get_otel_trace_id(current_span)
1963            observation_id = self._get_otel_span_id(current_span)
1964
1965            langfuse_logger.info(
1966                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
1967            )
1968
1969            self.create_score(
1970                trace_id=trace_id,
1971                observation_id=observation_id,
1972                name=name,
1973                value=cast(str, value),
1974                score_id=score_id,
1975                data_type=cast(Literal["CATEGORICAL"], data_type),
1976                comment=comment,
1977                config_id=config_id,
1978                metadata=metadata,
1979            )
1980
1981    @overload
1982    def score_current_trace(
1983        self,
1984        *,
1985        name: str,
1986        value: float,
1987        score_id: Optional[str] = None,
1988        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1989        comment: Optional[str] = None,
1990        config_id: Optional[str] = None,
1991        metadata: Optional[Any] = None,
1992    ) -> None: ...
1993
1994    @overload
1995    def score_current_trace(
1996        self,
1997        *,
1998        name: str,
1999        value: str,
2000        score_id: Optional[str] = None,
2001        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
2002        comment: Optional[str] = None,
2003        config_id: Optional[str] = None,
2004        metadata: Optional[Any] = None,
2005    ) -> None: ...
2006
2007    def score_current_trace(
2008        self,
2009        *,
2010        name: str,
2011        value: Union[float, str],
2012        score_id: Optional[str] = None,
2013        data_type: Optional[ScoreDataType] = None,
2014        comment: Optional[str] = None,
2015        config_id: Optional[str] = None,
2016        metadata: Optional[Any] = None,
2017    ) -> None:
2018        """Create a score for the current trace.
2019
2020        This method scores the trace of the currently active span. Unlike score_current_span,
2021        this method associates the score with the entire trace rather than a specific span.
2022        It's useful for scoring overall performance or quality of the entire operation.
2023
2024        Args:
2025            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2026            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2027            score_id: Optional custom ID for the score (auto-generated if not provided)
2028            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2029            comment: Optional comment or explanation for the score
2030            config_id: Optional ID of a score config defined in Langfuse
2031            metadata: Optional metadata to be attached to the score
2032
2033        Example:
2034            ```python
2035            with langfuse.start_as_current_observation(name="process-user-request") as span:
2036                # Process request
2037                result = process_complete_request()
2038                span.update(output=result)
2039
2040                # Score the overall trace
2041                langfuse.score_current_trace(
2042                    name="overall_quality",
2043                    value=0.95,
2044                    data_type="NUMERIC",
2045                    comment="High quality end-to-end response",
2046                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2047                )
2048            ```
2049        """
2050        current_span = self._get_current_otel_span()
2051
2052        if current_span is not None:
2053            trace_id = self._get_otel_trace_id(current_span)
2054
2055            langfuse_logger.info(
2056                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2057            )
2058
2059            self.create_score(
2060                trace_id=trace_id,
2061                name=name,
2062                value=cast(str, value),
2063                score_id=score_id,
2064                data_type=cast(Literal["CATEGORICAL"], data_type),
2065                comment=comment,
2066                config_id=config_id,
2067                metadata=metadata,
2068            )
2069
2070    def flush(self) -> None:
2071        """Force flush all pending spans and events to the Langfuse API.
2072
2073        This method manually flushes any pending spans, scores, and other events to the
2074        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2075        before proceeding, without waiting for the automatic flush interval.
2076
2077        Example:
2078            ```python
2079            # Record some spans and scores
2080            with langfuse.start_as_current_observation(name="operation") as span:
2081                # Do work...
2082                pass
2083
2084            # Ensure all data is sent to Langfuse before proceeding
2085            langfuse.flush()
2086
2087            # Continue with other work
2088            ```
2089        """
2090        if self._resources is not None:
2091            self._resources.flush()
2092
2093    def shutdown(self) -> None:
2094        """Shut down the Langfuse client and flush all pending data.
2095
2096        This method cleanly shuts down the Langfuse client, ensuring all pending data
2097        is flushed to the API and all background threads are properly terminated.
2098
2099        It's important to call this method when your application is shutting down to
2100        prevent data loss and resource leaks. For most applications, using the client
2101        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2102
2103        Example:
2104            ```python
2105            # Initialize Langfuse
2106            langfuse = Langfuse(public_key="...", secret_key="...")
2107
2108            # Use Langfuse throughout your application
2109            # ...
2110
2111            # When application is shutting down
2112            langfuse.shutdown()
2113            ```
2114        """
2115        if self._resources is not None:
2116            self._resources.shutdown()
2117
2118    def get_current_trace_id(self) -> Optional[str]:
2119        """Get the trace ID of the current active span.
2120
2121        This method retrieves the trace ID from the currently active span in the context.
2122        It can be used to get the trace ID for referencing in logs, external systems,
2123        or for creating related operations.
2124
2125        Returns:
2126            The current trace ID as a 32-character lowercase hexadecimal string,
2127            or None if there is no active span.
2128
2129        Example:
2130            ```python
2131            with langfuse.start_as_current_observation(name="process-request") as span:
2132                # Get the current trace ID for reference
2133                trace_id = langfuse.get_current_trace_id()
2134
2135                # Use it for external correlation
2136                log.info(f"Processing request with trace_id: {trace_id}")
2137
2138                # Or pass to another system
2139                external_system.process(data, trace_id=trace_id)
2140            ```
2141        """
2142        if not self._tracing_enabled:
2143            langfuse_logger.debug(
2144                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2145            )
2146            return None
2147
2148        current_otel_span = self._get_current_otel_span()
2149
2150        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None
2151
2152    def get_current_observation_id(self) -> Optional[str]:
2153        """Get the observation ID (span ID) of the current active span.
2154
2155        This method retrieves the observation ID from the currently active span in the context.
2156        It can be used to get the observation ID for referencing in logs, external systems,
2157        or for creating scores or other related operations.
2158
2159        Returns:
2160            The current observation ID as a 16-character lowercase hexadecimal string,
2161            or None if there is no active span.
2162
2163        Example:
2164            ```python
2165            with langfuse.start_as_current_observation(name="process-user-query") as span:
2166                # Get the current observation ID
2167                observation_id = langfuse.get_current_observation_id()
2168
2169                # Store it for later reference
2170                cache.set(f"query_{query_id}_observation", observation_id)
2171
2172                # Process the query...
2173            ```
2174        """
2175        if not self._tracing_enabled:
2176            langfuse_logger.debug(
2177                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2178            )
2179            return None
2180
2181        current_otel_span = self._get_current_otel_span()
2182
2183        return self._get_otel_span_id(current_otel_span) if current_otel_span else None
2184
2185    def _get_project_id(self) -> Optional[str]:
2186        """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for api keys."""
2187        if not self._project_id:
2188            proj = self.api.projects.get()
2189            if not proj.data or not proj.data[0].id:
2190                return None
2191
2192            self._project_id = proj.data[0].id
2193
2194        return self._project_id
2195
2196    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2197        """Get the URL to view a trace in the Langfuse UI.
2198
2199        This method generates a URL that links directly to a trace in the Langfuse UI.
2200        It's useful for providing links in logs, notifications, or debugging tools.
2201
2202        Args:
2203            trace_id: Optional trace ID to generate a URL for. If not provided,
2204                     the trace ID of the current active span will be used.
2205
2206        Returns:
2207            A URL string pointing to the trace in the Langfuse UI,
2208            or None if the project ID couldn't be retrieved or no trace ID is available.
2209
2210        Example:
2211            ```python
2212            # Get URL for the current trace
2213            with langfuse.start_as_current_observation(name="process-request") as span:
2214                trace_url = langfuse.get_trace_url()
2215                log.info(f"Processing trace: {trace_url}")
2216
2217            # Get URL for a specific trace
2218            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2219            send_notification(f"Review needed for trace: {specific_trace_url}")
2220            ```
2221        """
2222        final_trace_id = trace_id or self.get_current_trace_id()
2223        if not final_trace_id:
2224            return None
2225
2226        project_id = self._get_project_id()
2227
2228        return (
2229            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2230            if project_id and final_trace_id
2231            else None
2232        )
2233
2234    def get_dataset(
2235        self,
2236        name: str,
2237        *,
2238        fetch_items_page_size: Optional[int] = 50,
2239        version: Optional[datetime] = None,
2240    ) -> "DatasetClient":
2241        """Fetch a dataset by its name.
2242
2243        Args:
2244            name (str): The name of the dataset to fetch.
2245            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2246            version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC).
2247                If provided, returns the state of items at the specified UTC timestamp.
2248                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
2249
2250        Returns:
2251            DatasetClient: The dataset with the given name.
2252        """
2253        try:
2254            langfuse_logger.debug(f"Getting datasets {name}")
2255            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2256
2257            dataset_items = []
2258            page = 1
2259
2260            while True:
2261                new_items = self.api.dataset_items.list(
2262                    dataset_name=self._url_encode(name, is_url_param=True),
2263                    page=page,
2264                    limit=fetch_items_page_size,
2265                    version=version,
2266                )
2267                dataset_items.extend(new_items.data)
2268
2269                if new_items.meta.total_pages <= page:
2270                    break
2271
2272                page += 1
2273
2274            return DatasetClient(
2275                dataset=dataset,
2276                items=dataset_items,
2277                version=version,
2278                langfuse_client=self,
2279            )
2280
2281        except Error as e:
2282            handle_fern_exception(e)
2283            raise e
2284
2285    def get_dataset_run(
2286        self, *, dataset_name: str, run_name: str
2287    ) -> DatasetRunWithItems:
2288        """Fetch a dataset run by dataset name and run name.
2289
2290        Args:
2291            dataset_name (str): The name of the dataset.
2292            run_name (str): The name of the run.
2293
2294        Returns:
2295            DatasetRunWithItems: The dataset run with its items.
2296        """
2297        try:
2298            return cast(
2299                DatasetRunWithItems,
2300                self.api.datasets.get_run(
2301                    dataset_name=self._url_encode(dataset_name),
2302                    run_name=self._url_encode(run_name),
2303                    request_options=None,
2304                ),
2305            )
2306        except Error as e:
2307            handle_fern_exception(e)
2308            raise e
2309
2310    def get_dataset_runs(
2311        self,
2312        *,
2313        dataset_name: str,
2314        page: Optional[int] = None,
2315        limit: Optional[int] = None,
2316    ) -> PaginatedDatasetRuns:
2317        """Fetch all runs for a dataset.
2318
2319        Args:
2320            dataset_name (str): The name of the dataset.
2321            page (Optional[int]): Page number, starts at 1.
2322            limit (Optional[int]): Limit of items per page.
2323
2324        Returns:
2325            PaginatedDatasetRuns: Paginated list of dataset runs.
2326        """
2327        try:
2328            return cast(
2329                PaginatedDatasetRuns,
2330                self.api.datasets.get_runs(
2331                    dataset_name=self._url_encode(dataset_name),
2332                    page=page,
2333                    limit=limit,
2334                    request_options=None,
2335                ),
2336            )
2337        except Error as e:
2338            handle_fern_exception(e)
2339            raise e
2340
2341    def delete_dataset_run(
2342        self, *, dataset_name: str, run_name: str
2343    ) -> DeleteDatasetRunResponse:
2344        """Delete a dataset run and all its run items. This action is irreversible.
2345
2346        Args:
2347            dataset_name (str): The name of the dataset.
2348            run_name (str): The name of the run.
2349
2350        Returns:
2351            DeleteDatasetRunResponse: Confirmation of deletion.
2352        """
2353        try:
2354            return cast(
2355                DeleteDatasetRunResponse,
2356                self.api.datasets.delete_run(
2357                    dataset_name=self._url_encode(dataset_name),
2358                    run_name=self._url_encode(run_name),
2359                    request_options=None,
2360                ),
2361            )
2362        except Error as e:
2363            handle_fern_exception(e)
2364            raise e
2365
2366    def run_experiment(
2367        self,
2368        *,
2369        name: str,
2370        run_name: Optional[str] = None,
2371        description: Optional[str] = None,
2372        data: ExperimentData,
2373        task: TaskFunction,
2374        evaluators: List[EvaluatorFunction] = [],
2375        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2376        run_evaluators: List[RunEvaluatorFunction] = [],
2377        max_concurrency: int = 50,
2378        metadata: Optional[Dict[str, str]] = None,
2379        _dataset_version: Optional[datetime] = None,
2380    ) -> ExperimentResult:
2381        """Run an experiment on a dataset with automatic tracing and evaluation.
2382
2383        This method executes a task function on each item in the provided dataset,
2384        automatically traces all executions with Langfuse for observability, runs
2385        item-level and run-level evaluators on the outputs, and returns comprehensive
2386        results with evaluation metrics.
2387
2388        The experiment system provides:
2389        - Automatic tracing of all task executions
2390        - Concurrent processing with configurable limits
2391        - Comprehensive error handling that isolates failures
2392        - Integration with Langfuse datasets for experiment tracking
2393        - Flexible evaluation framework supporting both sync and async evaluators
2394
2395        Args:
2396            name: Human-readable name for the experiment. Used for identification
2397                in the Langfuse UI.
2398            run_name: Optional exact name for the experiment run. If provided, this will be
2399                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2400                If not provided, this will default to the experiment name appended with an ISO timestamp.
2401            description: Optional description explaining the experiment's purpose,
2402                methodology, or expected outcomes.
2403            data: Array of data items to process. Can be either:
2404                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2405                - List of Langfuse DatasetItem objects from dataset.items
2406            task: Function that processes each data item and returns output.
2407                Must accept 'item' as keyword argument and can return sync or async results.
2408                The task function signature should be: task(*, item, **kwargs) -> Any
2409            evaluators: List of functions to evaluate each item's output individually.
2410                Each evaluator receives input, output, expected_output, and metadata.
2411                Can return single Evaluation dict or list of Evaluation dicts.
2412            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2413                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2414                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2415                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2416            run_evaluators: List of functions to evaluate the entire experiment run.
2417                Each run evaluator receives all item_results and can compute aggregate metrics.
2418                Useful for calculating averages, distributions, or cross-item comparisons.
2419            max_concurrency: Maximum number of concurrent task executions (default: 50).
2420                Controls the number of items processed simultaneously. Adjust based on
2421                API rate limits and system resources.
2422            metadata: Optional metadata dictionary to attach to all experiment traces.
2423                This metadata will be included in every trace created during the experiment.
2424                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2425
2426        Returns:
2427            ExperimentResult containing:
2428            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2429            - item_results: List of results for each processed item with outputs and evaluations
2430            - run_evaluations: List of aggregate evaluation results for the entire run
2431            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2432            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2433
2434        Raises:
2435            ValueError: If required parameters are missing or invalid
2436            Exception: If experiment setup fails (individual item failures are handled gracefully)
2437
2438        Examples:
2439            Basic experiment with local data:
2440            ```python
2441            def summarize_text(*, item, **kwargs):
2442                return f"Summary: {item['input'][:50]}..."
2443
2444            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2445                return {
2446                    "name": "output_length",
2447                    "value": len(output),
2448                    "comment": f"Output contains {len(output)} characters"
2449                }
2450
2451            result = langfuse.run_experiment(
2452                name="Text Summarization Test",
2453                description="Evaluate summarization quality and length",
2454                data=[
2455                    {"input": "Long article text...", "expected_output": "Expected summary"},
2456                    {"input": "Another article...", "expected_output": "Another summary"}
2457                ],
2458                task=summarize_text,
2459                evaluators=[length_evaluator]
2460            )
2461
2462            print(f"Processed {len(result.item_results)} items")
2463            for item_result in result.item_results:
2464                print(f"Input: {item_result.item['input']}")
2465                print(f"Output: {item_result.output}")
2466                print(f"Evaluations: {item_result.evaluations}")
2467            ```
2468
2469            Advanced experiment with async task and multiple evaluators:
2470            ```python
2471            async def llm_task(*, item, **kwargs):
2472                # Simulate async LLM call
2473                response = await openai_client.chat.completions.create(
2474                    model="gpt-4",
2475                    messages=[{"role": "user", "content": item["input"]}]
2476                )
2477                return response.choices[0].message.content
2478
2479            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2480                if expected_output and expected_output.lower() in output.lower():
2481                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2482                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2483
2484            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2485                # Simulate toxicity check
2486                toxicity_score = check_toxicity(output)  # Your toxicity checker
2487                return {
2488                    "name": "toxicity",
2489                    "value": toxicity_score,
2490                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2491                }
2492
2493            def average_accuracy(*, item_results, **kwargs):
2494                accuracies = [
2495                    eval.value for result in item_results
2496                    for eval in result.evaluations
2497                    if eval.name == "accuracy"
2498                ]
2499                return {
2500                    "name": "average_accuracy",
2501                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2502                    "comment": f"Average accuracy across {len(accuracies)} items"
2503                }
2504
2505            result = langfuse.run_experiment(
2506                name="LLM Safety and Accuracy Test",
2507                description="Evaluate model accuracy and safety across diverse prompts",
2508                data=test_dataset,  # Your dataset items
2509                task=llm_task,
2510                evaluators=[accuracy_evaluator, toxicity_evaluator],
2511                run_evaluators=[average_accuracy],
2512                max_concurrency=5,  # Limit concurrent API calls
2513                metadata={"model": "gpt-4", "temperature": 0.7}
2514            )
2515            ```
2516
2517            Using with Langfuse datasets:
2518            ```python
2519            # Get dataset from Langfuse
2520            dataset = langfuse.get_dataset("my-eval-dataset")
2521
2522            result = dataset.run_experiment(
2523                name="Production Model Evaluation",
2524                description="Monthly evaluation of production model performance",
2525                task=my_production_task,
2526                evaluators=[accuracy_evaluator, latency_evaluator]
2527            )
2528
2529            # Results automatically linked to dataset in Langfuse UI
2530            print(f"View results: {result['dataset_run_url']}")
2531            ```
2532
2533        Note:
2534            - Task and evaluator functions can be either synchronous or asynchronous
2535            - Individual item failures are logged but don't stop the experiment
2536            - All executions are automatically traced and visible in Langfuse UI
2537            - When using Langfuse datasets, results are automatically linked for easy comparison
2538            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2539            - Async execution is handled automatically with smart event loop detection
2540        """
2541        return cast(
2542            ExperimentResult,
2543            run_async_safely(
2544                self._run_experiment_async(
2545                    name=name,
2546                    run_name=self._create_experiment_run_name(
2547                        name=name, run_name=run_name
2548                    ),
2549                    description=description,
2550                    data=data,
2551                    task=task,
2552                    evaluators=evaluators or [],
2553                    composite_evaluator=composite_evaluator,
2554                    run_evaluators=run_evaluators or [],
2555                    max_concurrency=max_concurrency,
2556                    metadata=metadata,
2557                    dataset_version=_dataset_version,
2558                ),
2559            ),
2560        )
2561
2562    async def _run_experiment_async(
2563        self,
2564        *,
2565        name: str,
2566        run_name: str,
2567        description: Optional[str],
2568        data: ExperimentData,
2569        task: TaskFunction,
2570        evaluators: List[EvaluatorFunction],
2571        composite_evaluator: Optional[CompositeEvaluatorFunction],
2572        run_evaluators: List[RunEvaluatorFunction],
2573        max_concurrency: int,
2574        metadata: Optional[Dict[str, Any]] = None,
2575        dataset_version: Optional[datetime] = None,
2576    ) -> ExperimentResult:
2577        langfuse_logger.debug(
2578            f"Starting experiment '{name}' run '{run_name}' with {len(data)} items"
2579        )
2580
2581        # Set up concurrency control
2582        semaphore = asyncio.Semaphore(max_concurrency)
2583
2584        # Process all items
2585        async def process_item(item: ExperimentItem) -> ExperimentItemResult:
2586            async with semaphore:
2587                return await self._process_experiment_item(
2588                    item,
2589                    task,
2590                    evaluators,
2591                    composite_evaluator,
2592                    name,
2593                    run_name,
2594                    description,
2595                    metadata,
2596                    dataset_version,
2597                )
2598
2599        # Run all items concurrently
2600        tasks = [process_item(item) for item in data]
2601        item_results = await asyncio.gather(*tasks, return_exceptions=True)
2602
2603        # Filter out any exceptions and log errors
2604        valid_results: List[ExperimentItemResult] = []
2605        for i, result in enumerate(item_results):
2606            if isinstance(result, Exception):
2607                langfuse_logger.error(f"Item {i} failed: {result}")
2608            elif isinstance(result, ExperimentItemResult):
2609                valid_results.append(result)  # type: ignore
2610
2611        # Run experiment-level evaluators
2612        run_evaluations: List[Evaluation] = []
2613        for run_evaluator in run_evaluators:
2614            try:
2615                evaluations = await _run_evaluator(
2616                    run_evaluator, item_results=valid_results
2617                )
2618                run_evaluations.extend(evaluations)
2619            except Exception as e:
2620                langfuse_logger.error(f"Run evaluator failed: {e}")
2621
2622        # Generate dataset run URL if applicable
2623        dataset_run_id = valid_results[0].dataset_run_id if valid_results else None
2624        dataset_run_url = None
2625        if dataset_run_id and data:
2626            try:
2627                # Check if the first item has dataset_id (for DatasetItem objects)
2628                first_item = data[0]
2629                dataset_id = None
2630
2631                if hasattr(first_item, "dataset_id"):
2632                    dataset_id = getattr(first_item, "dataset_id", None)
2633
2634                if dataset_id:
2635                    project_id = self._get_project_id()
2636
2637                    if project_id:
2638                        dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}"
2639
2640            except Exception:
2641                pass  # URL generation is optional
2642
2643        # Store run-level evaluations as scores
2644        for evaluation in run_evaluations:
2645            try:
2646                if dataset_run_id:
2647                    self.create_score(
2648                        dataset_run_id=dataset_run_id,
2649                        name=evaluation.name or "<unknown>",
2650                        value=evaluation.value,  # type: ignore
2651                        comment=evaluation.comment,
2652                        metadata=evaluation.metadata,
2653                        data_type=evaluation.data_type,  # type: ignore
2654                        config_id=evaluation.config_id,
2655                    )
2656
2657            except Exception as e:
2658                langfuse_logger.error(f"Failed to store run evaluation: {e}")
2659
2660        # Flush scores and traces
2661        self.flush()
2662
2663        return ExperimentResult(
2664            name=name,
2665            run_name=run_name,
2666            description=description,
2667            item_results=valid_results,
2668            run_evaluations=run_evaluations,
2669            dataset_run_id=dataset_run_id,
2670            dataset_run_url=dataset_run_url,
2671        )
2672
    async def _process_experiment_item(
        self,
        item: ExperimentItem,
        task: Callable,
        evaluators: List[Callable],
        composite_evaluator: Optional[CompositeEvaluatorFunction],
        experiment_name: str,
        experiment_run_name: str,
        experiment_description: Optional[str],
        experiment_metadata: Optional[Dict[str, Any]] = None,
        dataset_version: Optional[datetime] = None,
    ) -> ExperimentItemResult:
        """Process a single experiment item: run the task, link it to a dataset run, and evaluate.

        Executes inside an "experiment-item-run" observation so the task and all
        evaluator activity share one trace. When the item looks like a Langfuse
        DatasetItem (has both `id` and `dataset_id` attributes), the trace is
        linked to a dataset run via the dataset_run_items API. Item-level
        evaluators and the optional composite evaluator run afterwards; each
        resulting evaluation is stored as a score attached to this observation.

        Args:
            item: Experiment item to process (dict-like or DatasetItem-like).
            task: Task callable invoked with the item; may be sync or async.
            evaluators: Item-level evaluator callables.
            composite_evaluator: Optional evaluator combining item-level results.
            experiment_name: Name of the parent experiment.
            experiment_run_name: Run name (also used as the dataset run name).
            experiment_description: Optional description for the dataset run.
            experiment_metadata: Metadata merged into the observation metadata
                and attached to the dataset run.
            dataset_version: Optional dataset version timestamp forwarded to the API.

        Returns:
            ExperimentItemResult with the task output, evaluations, trace id and
            dataset run id (if linked).

        Raises:
            ValueError: If the item has no input.
            Exception: Re-raises any task failure after recording it on the span.
        """
        span_name = "experiment-item-run"

        with self.start_as_current_observation(name=span_name) as span:
            try:
                # Items may be plain dicts or objects (e.g. DatasetItem);
                # support both access styles throughout.
                input_data = (
                    item.get("input")
                    if isinstance(item, dict)
                    else getattr(item, "input", None)
                )

                if input_data is None:
                    raise ValueError("Experiment Item is missing input. Skipping item.")

                expected_output = (
                    item.get("expected_output")
                    if isinstance(item, dict)
                    else getattr(item, "expected_output", None)
                )

                item_metadata = (
                    item.get("metadata")
                    if isinstance(item, dict)
                    else getattr(item, "metadata", None)
                )

                # Experiment-level metadata is merged first; item metadata is
                # merged later and may override these keys.
                final_observation_metadata = {
                    "experiment_name": experiment_name,
                    "experiment_run_name": experiment_run_name,
                    **(experiment_metadata or {}),
                }

                trace_id = span.trace_id
                dataset_id = None
                dataset_item_id = None
                dataset_run_id = None

                # Link to dataset run if this is a dataset item
                if hasattr(item, "id") and hasattr(item, "dataset_id"):
                    try:
                        # Use sync API to avoid event loop issues when run_async_safely
                        # creates multiple event loops across different threads
                        dataset_run_item = await asyncio.to_thread(
                            self.api.dataset_run_items.create,
                            run_name=experiment_run_name,
                            run_description=experiment_description,
                            metadata=experiment_metadata,
                            dataset_item_id=item.id,  # type: ignore
                            trace_id=trace_id,
                            observation_id=span.id,
                            dataset_version=dataset_version,
                        )

                        dataset_run_id = dataset_run_item.dataset_run_id

                    except Exception as e:
                        # Linking is best-effort; the item is still processed.
                        langfuse_logger.error(f"Failed to create dataset run item: {e}")

                if (
                    not isinstance(item, dict)
                    and hasattr(item, "dataset_id")
                    and hasattr(item, "id")
                ):
                    dataset_id = item.dataset_id
                    dataset_item_id = item.id

                    final_observation_metadata.update(
                        {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id}
                    )

                if isinstance(item_metadata, dict):
                    final_observation_metadata.update(item_metadata)

                # Fallback ids for non-dataset items: a fresh observation id for
                # the experiment and a content-derived hash for the item.
                experiment_id = dataset_run_id or self._create_observation_id()
                experiment_item_id = (
                    dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16]
                )
                # Only non-None attributes are set on the underlying OTel span.
                span._otel_span.set_attributes(
                    {
                        k: v
                        for k, v in {
                            LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT,
                            LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description,
                            LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize(
                                expected_output
                            ),
                        }.items()
                        if v is not None
                    }
                )

                # Attributes propagated to every span created while the task
                # (and evaluators below) run.
                propagated_experiment_attributes = PropagatedExperimentAttributes(
                    experiment_id=experiment_id,
                    experiment_name=experiment_run_name,
                    experiment_metadata=_serialize(experiment_metadata),
                    experiment_dataset_id=dataset_id,
                    experiment_item_id=experiment_item_id,
                    experiment_item_metadata=_serialize(item_metadata),
                    experiment_item_root_observation_id=span.id,
                )

                with _propagate_attributes(experiment=propagated_experiment_attributes):
                    output = await _run_task(task, item)

                span.update(
                    input=input_data,
                    output=output,
                    metadata=final_observation_metadata,
                )

            except Exception as e:
                # Record the failure on the span, then propagate so the caller
                # (asyncio.gather with return_exceptions) can count it as failed.
                span.update(
                    output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
                )
                raise e

            # Run evaluators
            evaluations = []

            for evaluator in evaluators:
                try:
                    eval_metadata: Optional[Dict[str, Any]] = None

                    if isinstance(item, dict):
                        eval_metadata = item.get("metadata")
                    elif hasattr(item, "metadata"):
                        eval_metadata = item.metadata

                    with _propagate_attributes(
                        experiment=propagated_experiment_attributes
                    ):
                        eval_results = await _run_evaluator(
                            evaluator,
                            input=input_data,
                            output=output,
                            expected_output=expected_output,
                            metadata=eval_metadata,
                        )
                        evaluations.extend(eval_results)

                        # Store evaluations as scores
                        for evaluation in eval_results:
                            self.create_score(
                                trace_id=trace_id,
                                observation_id=span.id,
                                name=evaluation.name,
                                value=evaluation.value,  # type: ignore
                                comment=evaluation.comment,
                                metadata=evaluation.metadata,
                                config_id=evaluation.config_id,
                                data_type=evaluation.data_type,  # type: ignore
                            )

                except Exception as e:
                    # Evaluator failures are isolated; remaining evaluators run.
                    langfuse_logger.error(f"Evaluator failed: {e}")

            # Run composite evaluator if provided and we have evaluations
            if composite_evaluator and evaluations:
                try:
                    composite_eval_metadata: Optional[Dict[str, Any]] = None
                    if isinstance(item, dict):
                        composite_eval_metadata = item.get("metadata")
                    elif hasattr(item, "metadata"):
                        composite_eval_metadata = item.metadata

                    with _propagate_attributes(
                        experiment=propagated_experiment_attributes
                    ):
                        result = composite_evaluator(
                            input=input_data,
                            output=output,
                            expected_output=expected_output,
                            metadata=composite_eval_metadata,
                            evaluations=evaluations,
                        )

                        # Handle async composite evaluators
                        if asyncio.iscoroutine(result):
                            result = await result

                        # Normalize to list
                        composite_evals: List[Evaluation] = []
                        if isinstance(result, (dict, Evaluation)):
                            composite_evals = [result]  # type: ignore
                        elif isinstance(result, list):
                            composite_evals = result  # type: ignore

                        # Store composite evaluations as scores and add to evaluations list
                        for composite_evaluation in composite_evals:
                            self.create_score(
                                trace_id=trace_id,
                                observation_id=span.id,
                                name=composite_evaluation.name,
                                value=composite_evaluation.value,  # type: ignore
                                comment=composite_evaluation.comment,
                                metadata=composite_evaluation.metadata,
                                config_id=composite_evaluation.config_id,
                                data_type=composite_evaluation.data_type,  # type: ignore
                            )
                            evaluations.append(composite_evaluation)

                except Exception as e:
                    langfuse_logger.error(f"Composite evaluator failed: {e}")

            return ExperimentItemResult(
                item=item,
                output=output,
                evaluations=evaluations,
                trace_id=trace_id,
                dataset_run_id=dataset_run_id,
            )
2895
2896    def _create_experiment_run_name(
2897        self, *, name: Optional[str] = None, run_name: Optional[str] = None
2898    ) -> str:
2899        if run_name:
2900            return run_name
2901
2902        iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z")
2903
2904        return f"{name} - {iso_timestamp}"
2905
2906    def run_batched_evaluation(
2907        self,
2908        *,
2909        scope: Literal["traces", "observations"],
2910        mapper: MapperFunction,
2911        filter: Optional[str] = None,
2912        fetch_batch_size: int = 50,
2913        fetch_trace_fields: Optional[str] = None,
2914        max_items: Optional[int] = None,
2915        max_retries: int = 3,
2916        evaluators: List[EvaluatorFunction],
2917        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2918        max_concurrency: int = 5,
2919        metadata: Optional[Dict[str, Any]] = None,
2920        _add_observation_scores_to_trace: bool = False,
2921        _additional_trace_tags: Optional[List[str]] = None,
2922        resume_from: Optional[BatchEvaluationResumeToken] = None,
2923        verbose: bool = False,
2924    ) -> BatchEvaluationResult:
2925        """Fetch traces or observations and run evaluations on each item.
2926
2927        This method provides a powerful way to evaluate existing data in Langfuse at scale.
2928        It fetches items based on filters, transforms them using a mapper function, runs
2929        evaluators on each item, and creates scores that are linked back to the original
2930        entities. This is ideal for:
2931
2932        - Running evaluations on production traces after deployment
2933        - Backtesting new evaluation metrics on historical data
2934        - Batch scoring of observations for quality monitoring
2935        - Periodic evaluation runs on recent data
2936
2937        The method uses a streaming/pipeline approach to process items in batches, making
2938        it memory-efficient for large datasets. It includes comprehensive error handling,
2939        retry logic, and resume capability for long-running evaluations.
2940
2941        Args:
2942            scope: The type of items to evaluate. Must be one of:
2943                - "traces": Evaluate complete traces with all their observations
2944                - "observations": Evaluate individual observations (spans, generations, events)
2945            mapper: Function that transforms API response objects into evaluator inputs.
2946                Receives a trace/observation object and returns an EvaluatorInputs
2947                instance with input, output, expected_output, and metadata fields.
2948                Can be sync or async.
2949            evaluators: List of evaluation functions to run on each item. Each evaluator
2950                receives the mapped inputs and returns Evaluation object(s). Evaluator
2951                failures are logged but don't stop the batch evaluation.
2952            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
2953                - '{"tags": ["production"]}'
2954                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
2955                Default: None (fetches all items).
2956            fetch_batch_size: Number of items to fetch per API call and hold in memory.
2957                Larger values may be faster but use more memory. Default: 50.
2958            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
2959            max_items: Maximum total number of items to process. If None, processes all
2960                items matching the filter. Useful for testing or limiting evaluation runs.
2961                Default: None (process all).
2962            max_concurrency: Maximum number of items to evaluate concurrently. Controls
2963                parallelism and resource usage. Default: 5.
2964            composite_evaluator: Optional function that creates a composite score from
2965                item-level evaluations. Receives the original item and its evaluations,
2966                returns a single Evaluation. Useful for weighted averages or combined metrics.
2967                Default: None.
2968            metadata: Optional metadata dict to add to all created scores. Useful for
2969                tracking evaluation runs, versions, or other context. Default: None.
2970            max_retries: Maximum number of retry attempts for failed batch fetches.
2971                Uses exponential backoff (1s, 2s, 4s). Default: 3.
2972            verbose: If True, logs progress information to console. Useful for monitoring
2973                long-running evaluations. Default: False.
2974            resume_from: Optional resume token from a previous incomplete run. Allows
2975                continuing evaluation after interruption or failure. Default: None.
2976
2977
2978        Returns:
2979            BatchEvaluationResult containing:
2980                - total_items_fetched: Number of items fetched from API
2981                - total_items_processed: Number of items successfully evaluated
2982                - total_items_failed: Number of items that failed evaluation
2983                - total_scores_created: Scores created by item-level evaluators
2984                - total_composite_scores_created: Scores created by composite evaluator
2985                - total_evaluations_failed: Individual evaluator failures
2986                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
2987                - resume_token: Token for resuming if incomplete (None if completed)
2988                - completed: True if all items processed
2989                - duration_seconds: Total execution time
2990                - failed_item_ids: IDs of items that failed
2991                - error_summary: Error types and counts
2992                - has_more_items: True if max_items reached but more exist
2993
2994        Raises:
2995            ValueError: If invalid scope is provided.
2996
2997        Examples:
2998            Basic trace evaluation:
2999            ```python
3000            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3001
3002            client = Langfuse()
3003
3004            # Define mapper to extract fields from traces
3005            def trace_mapper(trace):
3006                return EvaluatorInputs(
3007                    input=trace.input,
3008                    output=trace.output,
3009                    expected_output=None,
3010                    metadata={"trace_id": trace.id}
3011                )
3012
3013            # Define evaluator
3014            def length_evaluator(*, input, output, expected_output, metadata):
3015                return Evaluation(
3016                    name="output_length",
3017                    value=len(output) if output else 0
3018                )
3019
3020            # Run batch evaluation
3021            result = client.run_batched_evaluation(
3022                scope="traces",
3023                mapper=trace_mapper,
3024                evaluators=[length_evaluator],
3025                filter='{"tags": ["production"]}',
3026                max_items=1000,
3027                verbose=True
3028            )
3029
3030            print(f"Processed {result.total_items_processed} traces")
3031            print(f"Created {result.total_scores_created} scores")
3032            ```
3033
3034            Evaluation with composite scorer:
3035            ```python
3036            def accuracy_evaluator(*, input, output, expected_output, metadata):
3037                # ... evaluation logic
3038                return Evaluation(name="accuracy", value=0.85)
3039
3040            def relevance_evaluator(*, input, output, expected_output, metadata):
3041                # ... evaluation logic
3042                return Evaluation(name="relevance", value=0.92)
3043
3044            def composite_evaluator(*, item, evaluations):
3045                # Weighted average of evaluations
3046                weights = {"accuracy": 0.6, "relevance": 0.4}
3047                total = sum(
3048                    e.value * weights.get(e.name, 0)
3049                    for e in evaluations
3050                    if isinstance(e.value, (int, float))
3051                )
3052                return Evaluation(
3053                    name="composite_score",
3054                    value=total,
3055                    comment=f"Weighted average of {len(evaluations)} metrics"
3056                )
3057
3058            result = client.run_batched_evaluation(
3059                scope="traces",
3060                mapper=trace_mapper,
3061                evaluators=[accuracy_evaluator, relevance_evaluator],
3062                composite_evaluator=composite_evaluator,
3063                filter='{"user_id": "important_user"}',
3064                verbose=True
3065            )
3066            ```
3067
3068            Handling incomplete runs with resume:
3069            ```python
3070            # Initial run that may fail or timeout
3071            result = client.run_batched_evaluation(
3072                scope="observations",
3073                mapper=obs_mapper,
3074                evaluators=[my_evaluator],
3075                max_items=10000,
3076                verbose=True
3077            )
3078
3079            # Check if incomplete
3080            if not result.completed and result.resume_token:
3081                print(f"Processed {result.resume_token.items_processed} items before interruption")
3082
3083                # Resume from where it left off
3084                result = client.run_batched_evaluation(
3085                    scope="observations",
3086                    mapper=obs_mapper,
3087                    evaluators=[my_evaluator],
3088                    resume_from=result.resume_token,
3089                    verbose=True
3090                )
3091
3092            print(f"Total items processed: {result.total_items_processed}")
3093            ```
3094
3095            Monitoring evaluator performance:
3096            ```python
3097            result = client.run_batched_evaluation(...)
3098
3099            for stats in result.evaluator_stats:
3100                success_rate = stats.successful_runs / stats.total_runs
3101                print(f"{stats.name}:")
3102                print(f"  Success rate: {success_rate:.1%}")
3103                print(f"  Scores created: {stats.total_scores_created}")
3104
3105                if stats.failed_runs > 0:
3106                    print(f"  âš ī¸  Failed {stats.failed_runs} times")
3107            ```
3108
3109        Note:
3110            - Evaluator failures are logged but don't stop the batch evaluation
3111            - Individual item failures are tracked but don't stop processing
3112            - Fetch failures are retried with exponential backoff
3113            - All scores are automatically flushed to Langfuse at the end
3114            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3115        """
3116        runner = BatchEvaluationRunner(self)
3117
3118        return cast(
3119            BatchEvaluationResult,
3120            run_async_safely(
3121                runner.run_async(
3122                    scope=scope,
3123                    mapper=mapper,
3124                    evaluators=evaluators,
3125                    filter=filter,
3126                    fetch_batch_size=fetch_batch_size,
3127                    fetch_trace_fields=fetch_trace_fields,
3128                    max_items=max_items,
3129                    max_concurrency=max_concurrency,
3130                    composite_evaluator=composite_evaluator,
3131                    metadata=metadata,
3132                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
3133                    _additional_trace_tags=_additional_trace_tags,
3134                    max_retries=max_retries,
3135                    verbose=verbose,
3136                    resume_from=resume_from,
3137                )
3138            ),
3139        )
3140
3141    def auth_check(self) -> bool:
3142        """Check if the provided credentials (public and secret key) are valid.
3143
3144        Raises:
3145            Exception: If no projects were found for the provided credentials.
3146
3147        Note:
3148            This method is blocking. It is discouraged to use it in production code.
3149        """
3150        try:
3151            projects = self.api.projects.get()
3152            langfuse_logger.debug(
3153                f"Auth check successful, found {len(projects.data)} projects"
3154            )
3155            if len(projects.data) == 0:
3156                raise Exception(
3157                    "Auth check failed, no project found for the keys provided."
3158                )
3159            return True
3160
3161        except AttributeError as e:
3162            langfuse_logger.warning(
3163                f"Auth check failed: Client not properly initialized. Error: {e}"
3164            )
3165            return False
3166
3167        except Error as e:
3168            handle_fern_exception(e)
3169            raise e
3170
3171    def create_dataset(
3172        self,
3173        *,
3174        name: str,
3175        description: Optional[str] = None,
3176        metadata: Optional[Any] = None,
3177        input_schema: Optional[Any] = None,
3178        expected_output_schema: Optional[Any] = None,
3179    ) -> Dataset:
3180        """Create a dataset with the given name on Langfuse.
3181
3182        Args:
3183            name: Name of the dataset to create.
3184            description: Description of the dataset. Defaults to None.
3185            metadata: Additional metadata. Defaults to None.
3186            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3187            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3188
3189        Returns:
3190            Dataset: The created dataset as returned by the Langfuse API.
3191        """
3192        try:
3193            langfuse_logger.debug(f"Creating datasets {name}")
3194
3195            result = self.api.datasets.create(
3196                name=name,
3197                description=description,
3198                metadata=metadata,
3199                input_schema=input_schema,
3200                expected_output_schema=expected_output_schema,
3201            )
3202
3203            return cast(Dataset, result)
3204
3205        except Error as e:
3206            handle_fern_exception(e)
3207            raise e
3208
3209    def create_dataset_item(
3210        self,
3211        *,
3212        dataset_name: str,
3213        input: Optional[Any] = None,
3214        expected_output: Optional[Any] = None,
3215        metadata: Optional[Any] = None,
3216        source_trace_id: Optional[str] = None,
3217        source_observation_id: Optional[str] = None,
3218        status: Optional[DatasetStatus] = None,
3219        id: Optional[str] = None,
3220    ) -> DatasetItem:
3221        """Create a dataset item.
3222
3223        Upserts if an item with id already exists.
3224
3225        Args:
3226            dataset_name: Name of the dataset in which the dataset item should be created.
3227            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3228            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3229            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3230            source_trace_id: Id of the source trace. Defaults to None.
3231            source_observation_id: Id of the source observation. Defaults to None.
3232            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3233            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3234
3235        Returns:
3236            DatasetItem: The created dataset item as returned by the Langfuse API.
3237
3238        Example:
3239            ```python
3240            from langfuse import Langfuse
3241
3242            langfuse = Langfuse()
3243
3244            # Uploading items to the Langfuse dataset named "capital_cities"
3245            langfuse.create_dataset_item(
3246                dataset_name="capital_cities",
3247                input={"input": {"country": "Italy"}},
3248                expected_output={"expected_output": "Rome"},
3249                metadata={"foo": "bar"}
3250            )
3251            ```
3252        """
3253        try:
3254            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
3255
3256            result = self.api.dataset_items.create(
3257                dataset_name=dataset_name,
3258                input=input,
3259                expected_output=expected_output,
3260                metadata=metadata,
3261                source_trace_id=source_trace_id,
3262                source_observation_id=source_observation_id,
3263                status=status,
3264                id=id,
3265            )
3266
3267            return cast(DatasetItem, result)
3268        except Error as e:
3269            handle_fern_exception(e)
3270            raise e
3271
3272    def resolve_media_references(
3273        self,
3274        *,
3275        obj: Any,
3276        resolve_with: Literal["base64_data_uri"],
3277        max_depth: int = 10,
3278        content_fetch_timeout_seconds: int = 5,
3279    ) -> Any:
3280        """Replace media reference strings in an object with base64 data URIs.
3281
3282        This method recursively traverses an object (up to max_depth) looking for media reference strings
3283        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3284        the provided Langfuse client and replaces the reference string with a base64 data URI.
3285
3286        If fetching media content fails for a reference string, a warning is logged and the reference
3287        string is left unchanged.
3288
3289        Args:
3290            obj: The object to process. Can be a primitive value, array, or nested object.
3291                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3292            resolve_with: The representation of the media content to replace the media reference string with.
3293                Currently only "base64_data_uri" is supported.
3294            max_depth: int: The maximum depth to traverse the object. Default is 10.
3295            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3296
3297        Returns:
3298            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3299            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3300
3301        Example:
3302            obj = {
3303                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3304                "nested": {
3305                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3306                }
3307            }
3308
3309            result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
3310
3311            # Result:
3312            # {
3313            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3314            #     "nested": {
3315            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3316            #     }
3317            # }
3318        """
3319        return LangfuseMedia.resolve_media_references(
3320            langfuse_client=self,
3321            obj=obj,
3322            resolve_with=resolve_with,
3323            max_depth=max_depth,
3324            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3325        )
3326
    # Overload: type="chat" narrows the return type to ChatPromptClient and
    # restricts fallback to a list of chat messages.
    @overload
    def get_prompt(
        self,
        name: str,
        *,
        version: Optional[int] = None,
        label: Optional[str] = None,
        type: Literal["chat"],
        cache_ttl_seconds: Optional[int] = None,
        fallback: Optional[List[ChatMessageDict]] = None,
        max_retries: Optional[int] = None,
        fetch_timeout_seconds: Optional[int] = None,
    ) -> ChatPromptClient: ...

    # Overload: type="text" (the default) narrows the return type to
    # TextPromptClient and restricts fallback to a string.
    @overload
    def get_prompt(
        self,
        name: str,
        *,
        version: Optional[int] = None,
        label: Optional[str] = None,
        type: Literal["text"] = "text",
        cache_ttl_seconds: Optional[int] = None,
        fallback: Optional[str] = None,
        max_retries: Optional[int] = None,
        fetch_timeout_seconds: Optional[int] = None,
    ) -> TextPromptClient: ...
3354
    def get_prompt(
        self,
        name: str,
        *,
        version: Optional[int] = None,
        label: Optional[str] = None,
        type: Literal["chat", "text"] = "text",
        cache_ttl_seconds: Optional[int] = None,
        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
        max_retries: Optional[int] = None,
        fetch_timeout_seconds: Optional[int] = None,
    ) -> PromptClient:
        """Get a prompt.

        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
        return the expired prompt as a fallback.

        Args:
            name (str): The name of the prompt to retrieve.

        Keyword Args:
            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
            fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.

        Returns:
            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
            - TextPromptClient, if type argument is 'text'.
            - ChatPromptClient, if type argument is 'chat'.

        Raises:
            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
        """
        if self._resources is None:
            raise Error(
                "SDK is not correctly initialized. Check the init logs for more details."
            )
        if version is not None and label is not None:
            raise ValueError("Cannot specify both version and label at the same time.")

        if not name:
            raise ValueError("Prompt name cannot be empty.")

        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
        # Clamp the user-supplied retry count to [0, 4], defaulting to 2.
        bounded_max_retries = self._get_bounded_max_retries(
            max_retries, default_max_retries=2, max_retries_upper_bound=4
        )

        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
        cached_prompt = self._resources.prompt_cache.get(cache_key)

        # Cache miss, or caching explicitly disabled via cache_ttl_seconds=0:
        # fetch synchronously from the server.
        if cached_prompt is None or cache_ttl_seconds == 0:
            langfuse_logger.debug(
                f"Prompt '{cache_key}' not found in cache or caching disabled."
            )
            try:
                return self._fetch_prompt_and_update_cache(
                    name,
                    version=version,
                    label=label,
                    ttl_seconds=cache_ttl_seconds,
                    max_retries=bounded_max_retries,
                    fetch_timeout_seconds=fetch_timeout_seconds,
                )
            except Exception as e:
                # Fetch failed and nothing usable is cached: fall back to the
                # caller-provided prompt content if one was given.
                if fallback:
                    langfuse_logger.warning(
                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
                    )

                    fallback_client_args: Dict[str, Any] = {
                        "name": name,
                        "prompt": fallback,
                        "type": type,
                        "version": version or 0,
                        "config": {},
                        "labels": [label] if label else [],
                        "tags": [],
                    }

                    if type == "text":
                        return TextPromptClient(
                            prompt=Prompt_Text(**fallback_client_args),
                            is_fallback=True,
                        )

                    if type == "chat":
                        return ChatPromptClient(
                            prompt=Prompt_Chat(**fallback_client_args),
                            is_fallback=True,
                        )

                raise e

        # Stale hit: serve the cached value immediately and refresh in the
        # background so callers never block on the refetch.
        if cached_prompt.is_expired():
            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
            try:
                # refresh prompt in background thread, refresh_prompt deduplicates tasks
                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")

                def refresh_task() -> None:
                    self._fetch_prompt_and_update_cache(
                        name,
                        version=version,
                        label=label,
                        ttl_seconds=cache_ttl_seconds,
                        max_retries=bounded_max_retries,
                        fetch_timeout_seconds=fetch_timeout_seconds,
                    )

                self._resources.prompt_cache.add_refresh_prompt_task(
                    cache_key,
                    refresh_task,
                )
                langfuse_logger.debug(
                    f"Returning stale prompt '{cache_key}' from cache."
                )
                # return stale prompt
                return cached_prompt.value

            except Exception as e:
                langfuse_logger.warning(
                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
                )
                # creation of refresh prompt task failed, return stale prompt
                return cached_prompt.value

        # Fresh cache hit.
        return cached_prompt.value
3491
3492    def _fetch_prompt_and_update_cache(
3493        self,
3494        name: str,
3495        *,
3496        version: Optional[int] = None,
3497        label: Optional[str] = None,
3498        ttl_seconds: Optional[int] = None,
3499        max_retries: int,
3500        fetch_timeout_seconds: Optional[int],
3501    ) -> PromptClient:
3502        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3503        langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...")
3504
3505        try:
3506
3507            @backoff.on_exception(
3508                backoff.constant, Exception, max_tries=max_retries + 1, logger=None
3509            )
3510            def fetch_prompts() -> Any:
3511                return self.api.prompts.get(
3512                    self._url_encode(name),
3513                    version=version,
3514                    label=label,
3515                    request_options={
3516                        "timeout_in_seconds": fetch_timeout_seconds,
3517                    }
3518                    if fetch_timeout_seconds is not None
3519                    else None,
3520                )
3521
3522            prompt_response = fetch_prompts()
3523
3524            prompt: PromptClient
3525            if prompt_response.type == "chat":
3526                prompt = ChatPromptClient(prompt_response)
3527            else:
3528                prompt = TextPromptClient(prompt_response)
3529
3530            if self._resources is not None:
3531                self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds)
3532
3533            return prompt
3534
3535        except NotFoundError as not_found_error:
3536            langfuse_logger.warning(
3537                f"Prompt '{cache_key}' not found during refresh, evicting from cache."
3538            )
3539            if self._resources is not None:
3540                self._resources.prompt_cache.delete(cache_key)
3541            raise not_found_error
3542
3543        except Exception as e:
3544            langfuse_logger.error(
3545                f"Error while fetching prompt '{cache_key}': {str(e)}"
3546            )
3547            raise e
3548
3549    def _get_bounded_max_retries(
3550        self,
3551        max_retries: Optional[int],
3552        *,
3553        default_max_retries: int = 2,
3554        max_retries_upper_bound: int = 4,
3555    ) -> int:
3556        if max_retries is None:
3557            return default_max_retries
3558
3559        bounded_max_retries = min(
3560            max(max_retries, 0),
3561            max_retries_upper_bound,
3562        )
3563
3564        return bounded_max_retries
3565
    # Overload: type="chat" requires a list of chat messages and returns a
    # ChatPromptClient.
    @overload
    def create_prompt(
        self,
        *,
        name: str,
        prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]],
        labels: List[str] = [],
        tags: Optional[List[str]] = None,
        type: Optional[Literal["chat"]],
        config: Optional[Any] = None,
        commit_message: Optional[str] = None,
    ) -> ChatPromptClient: ...

    # Overload: type="text" (the default) requires a string prompt and returns
    # a TextPromptClient.
    @overload
    def create_prompt(
        self,
        *,
        name: str,
        prompt: str,
        labels: List[str] = [],
        tags: Optional[List[str]] = None,
        type: Optional[Literal["text"]] = "text",
        config: Optional[Any] = None,
        commit_message: Optional[str] = None,
    ) -> TextPromptClient: ...
3591
def create_prompt(
    self,
    *,
    name: str,
    prompt: Union[
        str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
    ],
    labels: Optional[List[str]] = None,
    tags: Optional[List[str]] = None,
    type: Optional[Literal["chat", "text"]] = "text",
    config: Optional[Any] = None,
    commit_message: Optional[str] = None,
) -> PromptClient:
    """Create a new prompt in Langfuse.

    Keyword Args:
        name: The name of the prompt to be created.
        prompt: The content of the prompt to be created: a string for 'text'
            prompts, a list of chat messages for 'chat' prompts.
        labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
        tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
        config: Additional structured data to be saved with the prompt. Defaults to None.
        type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
        commit_message: Optional string describing the change.

    Returns:
        TextPromptClient: The prompt if type argument is 'text'.
        ChatPromptClient: The prompt if type argument is 'chat'.

    Raises:
        ValueError: If the shape of `prompt` does not match the requested `type`.
    """
    # `labels` used to be a mutable `[]` default; normalize a None default
    # instead so one shared list object is never reused across calls.
    if labels is None:
        labels = []

    try:
        langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")

        if type == "chat":
            if not isinstance(prompt, list):
                raise ValueError(
                    "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
                )
            request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
                CreateChatPromptRequest(
                    name=name,
                    prompt=cast(Any, prompt),
                    labels=labels,
                    tags=tags,
                    config=config or {},
                    commit_message=commit_message,
                    type=CreateChatPromptType.CHAT,
                )
            )
            server_prompt = self.api.prompts.create(request=request)

            # Any cached versions of this prompt are now stale.
            if self._resources is not None:
                self._resources.prompt_cache.invalidate(name)

            return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))

        if not isinstance(prompt, str):
            raise ValueError("For 'text' type, 'prompt' must be a string.")

        request = CreateTextPromptRequest(
            name=name,
            prompt=prompt,
            labels=labels,
            tags=tags,
            config=config or {},
            commit_message=commit_message,
        )

        server_prompt = self.api.prompts.create(request=request)

        # Any cached versions of this prompt are now stale.
        if self._resources is not None:
            self._resources.prompt_cache.invalidate(name)

        return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))

    except Error as e:
        handle_fern_exception(e)
        raise e
3669
def update_prompt(
    self,
    *,
    name: str,
    version: int,
    new_labels: Optional[List[str]] = None,
) -> Any:
    """Update an existing prompt version in Langfuse.

    The SDK prompt cache is invalidated for all prompts with the specified name.

    Args:
        name (str): The name of the prompt to update.
        version (int): The version number of the prompt to update.
        new_labels (Optional[List[str]]): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to an empty list.

    Returns:
        Prompt: The updated prompt from the Langfuse API.
    """
    updated_prompt = self.api.prompt_version.update(
        # Name may contain "/" (prompt folders), so it must be URL-encoded.
        name=self._url_encode(name),
        version=version,
        # Normalized here instead of using a mutable `[]` signature default.
        new_labels=new_labels if new_labels is not None else [],
    )

    # Drop cached copies so subsequent reads see the updated labels.
    if self._resources is not None:
        self._resources.prompt_cache.invalidate(name)

    return updated_prompt
3698
3699    def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str:
3700        # httpx â‰Ĩ 0.28 does its own WHATWG-compliant quoting (eg. encodes bare
3701        # “%”, “?”, “#”, “|”, â€Ļ in query/path parts).  Re-quoting here would
3702        # double-encode, so we skip when the value is about to be sent straight
3703        # to httpx (`is_url_param=True`) and the installed version is â‰Ĩ 0.28.
3704        if is_url_param and Version(httpx.__version__) >= Version("0.28.0"):
3705            return url
3706
3707        # urllib.parse.quote does not escape slashes "/" by default; we need to add safe="" to force escaping
3708        # we need add safe="" to force escaping of slashes
3709        # This is necessary for prompts in prompt folders
3710        return urllib.parse.quote(url, safe="")
3711
def clear_prompt_cache(self) -> None:
    """Remove every cached prompt from the prompt cache.

    Useful to force a complete refresh of all cached prompts — for example
    after major updates, or when the latest versions must be fetched from
    the server.
    """
    resources = self._resources
    if resources is None:
        # No resource manager available; nothing is cached.
        return
    resources.prompt_cache.clear()

Main client for Langfuse tracing and platform features.

This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.

The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.

Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.

Attributes:
  • api: Synchronous API client for Langfuse backend communication
  • async_api: Asynchronous API client for Langfuse backend communication
  • _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
  • public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
  • secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
  • base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
  • host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
  • timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
  • httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
  • debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
  • tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
  • flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
  • flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
  • environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
  • release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
  • media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
  • sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
  • mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
  • blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use should_export_span instead. Equivalent behavior:

    from langfuse.span_filter import is_default_export_span
    blocked = {"sqlite", "requests"}
    
    should_export_span = lambda span: (
        is_default_export_span(span)
        and (
            span.instrumentation_scope is None
            or span.instrumentation_scope.name not in blocked
        )
    )
    
  • should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with gen_ai.* attributes, and known LLM instrumentation scopes).

  • additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well.
  • tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. Setting this can be useful for keeping Langfuse tracing disconnected from other OpenTelemetry-span-emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
Example:
from langfuse import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    base_url="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_observation(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
Langfuse( *, public_key: Optional[str] = None, secret_key: Optional[str] = None, base_url: Optional[str] = None, host: Optional[str] = None, timeout: Optional[int] = None, httpx_client: Optional[httpx.Client] = None, debug: bool = False, tracing_enabled: Optional[bool] = True, flush_at: Optional[int] = None, flush_interval: Optional[float] = None, environment: Optional[str] = None, release: Optional[str] = None, media_upload_thread_count: Optional[int] = None, sample_rate: Optional[float] = None, mask: Optional[langfuse.types.MaskFunction] = None, blocked_instrumentation_scopes: Optional[List[str]] = None, should_export_span: Optional[Callable[[opentelemetry.sdk.trace.ReadableSpan], bool]] = None, additional_headers: Optional[Dict[str, str]] = None, tracer_provider: Optional[opentelemetry.sdk.trace.TracerProvider] = None)
226    def __init__(
227        self,
228        *,
229        public_key: Optional[str] = None,
230        secret_key: Optional[str] = None,
231        base_url: Optional[str] = None,
232        host: Optional[str] = None,
233        timeout: Optional[int] = None,
234        httpx_client: Optional[httpx.Client] = None,
235        debug: bool = False,
236        tracing_enabled: Optional[bool] = True,
237        flush_at: Optional[int] = None,
238        flush_interval: Optional[float] = None,
239        environment: Optional[str] = None,
240        release: Optional[str] = None,
241        media_upload_thread_count: Optional[int] = None,
242        sample_rate: Optional[float] = None,
243        mask: Optional[MaskFunction] = None,
244        blocked_instrumentation_scopes: Optional[List[str]] = None,
245        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
246        additional_headers: Optional[Dict[str, str]] = None,
247        tracer_provider: Optional[TracerProvider] = None,
248    ):
           # Base URL precedence: explicit `base_url` arg > LANGFUSE_BASE_URL env
           # > deprecated `host` arg > LANGFUSE_HOST env > cloud default.
249        self._base_url = (
250            base_url
251            or os.environ.get(LANGFUSE_BASE_URL)
252            or host
253            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
254        )
255        self._environment = environment or cast(
256            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
257        )
258        self._release = (
259            release
260            or os.environ.get(LANGFUSE_RELEASE, None)
261            or get_common_release_envs()
262        )
           # Resolved lazily elsewhere; not fetched during construction.
263        self._project_id: Optional[str] = None
           # NOTE(review): an explicit sample_rate of 0.0 is falsy and falls back
           # to the env var / 1.0 default here — confirm 0.0 should not be honored.
264        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
265        if not 0.0 <= sample_rate <= 1.0:
266            raise ValueError(
267                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
268            )
269
270        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
271
           # Tracing stays enabled only if the argument is truthy AND the env var
           # is not explicitly set to "false".
272        self._tracing_enabled = (
273            tracing_enabled
274            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
275        )
276        if not self._tracing_enabled:
277            langfuse_logger.info(
278                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
279            )
280
281        debug = (
282            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
283        )
284        if debug:
285            logging.basicConfig(
286                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
287            )
288            langfuse_logger.setLevel(logging.DEBUG)
289
           # Missing credentials disable the client with a no-op tracer instead of
           # raising, so the host application keeps running without tracing.
290        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
291        if public_key is None:
292            langfuse_logger.warning(
293                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
294                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
295            )
296            self._otel_tracer = otel_trace_api.NoOpTracer()
297            return
298
299        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
300        if secret_key is None:
301            langfuse_logger.warning(
302                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
303                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
304            )
305            self._otel_tracer = otel_trace_api.NoOpTracer()
306            return
307
           # Warn only: the global OpenTelemetry kill switch would silently
           # suppress all spans otherwise.
308        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
309            langfuse_logger.warning(
310                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
311            )
312
313        if blocked_instrumentation_scopes is not None:
314            warnings.warn(
315                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
316                "Use `should_export_span` instead. Example: "
317                "from langfuse.span_filter import is_default_export_span; "
318                'blocked={"scope"}; should_export_span=lambda span: '
319                "is_default_export_span(span) and (span.instrumentation_scope is None or "
320                "span.instrumentation_scope.name not in blocked).",
321                DeprecationWarning,
322                stacklevel=2,
323            )
324
325        # Initialize api and tracer if requirements are met
           # NOTE(review): the resource manager receives the raw `release` argument
           # rather than self._release (which also applies env-var fallbacks) —
           # confirm this is intended.
326        self._resources = LangfuseResourceManager(
327            public_key=public_key,
328            secret_key=secret_key,
329            base_url=self._base_url,
330            timeout=timeout,
331            environment=self._environment,
332            release=release,
333            flush_at=flush_at,
334            flush_interval=flush_interval,
335            httpx_client=httpx_client,
336            media_upload_thread_count=media_upload_thread_count,
337            sample_rate=sample_rate,
338            mask=mask,
339            tracing_enabled=self._tracing_enabled,
340            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
341            should_export_span=should_export_span,
342            additional_headers=additional_headers,
343            tracer_provider=tracer_provider,
344        )
345        self._mask = self._resources.mask
346
           # Fall back to a no-op tracer whenever tracing is disabled or the
           # resource manager produced no tracer.
347        self._otel_tracer = (
348            self._resources.tracer
349            if self._tracing_enabled and self._resources.tracer is not None
350            else otel_trace_api.NoOpTracer()
351        )
352        self.api = self._resources.api
353        self.async_api = self._resources.async_api
api
async_api
def start_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> Union[LangfuseSpan, LangfuseGeneration, LangfuseAgent, LangfuseTool, LangfuseChain, LangfuseRetriever, LangfuseEvaluator, LangfuseEmbedding, LangfuseGuardrail]:
def start_observation(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    as_type: ObservationTypeLiteralNoEvent = "span",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> Union[
    LangfuseSpan,
    LangfuseGeneration,
    LangfuseAgent,
    LangfuseTool,
    LangfuseChain,
    LangfuseRetriever,
    LangfuseEvaluator,
    LangfuseEmbedding,
    LangfuseGuardrail,
]:
    """Create a new observation of the specified type.

    The observation is created but NOT set as the current span in the
    context. To create and use an observation within a context, use
    start_as_current_observation().

    Args:
        trace_context: Optional context for connecting to an existing trace
        name: Name of the observation
        as_type: Type of observation to create (defaults to "span")
        input: Input data for the operation
        output: Output data from the operation
        metadata: Additional metadata to associate with the observation
        version: Version identifier for the code or component
        level: Importance level of the observation
        status_message: Optional status message for the observation
        completion_start_time: When the model started generating (for generation types)
        model: Name/identifier of the AI model used (for generation types)
        model_parameters: Parameters used for the model (for generation types)
        usage_details: Token usage information (for generation types)
        cost_details: Cost information (for generation types)
        prompt: Associated prompt template (for generation types)

    Returns:
        An observation object of the appropriate type that must be ended with .end()
    """
    # All attributes forwarded unchanged to the observation factory,
    # collected once so both code paths below stay in sync.
    observation_kwargs: Dict[str, Any] = dict(
        as_type=as_type,
        input=input,
        output=output,
        metadata=metadata,
        version=version,
        level=level,
        status_message=status_message,
        completion_start_time=completion_start_time,
        model=model,
        model_parameters=model_parameters,
        usage_details=usage_details,
        cost_details=cost_details,
        prompt=prompt,
    )

    if trace_context:
        trace_id = trace_context.get("trace_id", None)
        parent_span_id = trace_context.get("parent_span_id", None)

        if trace_id:
            # Attach to the caller-provided remote trace instead of the
            # ambient OTel context.
            remote_parent_span = self._create_remote_parent_span(
                trace_id=trace_id, parent_span_id=parent_span_id
            )

            with otel_trace_api.use_span(
                cast(otel_trace_api.Span, remote_parent_span)
            ):
                otel_span = self._otel_tracer.start_span(name=name)
                otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                return self._create_observation_from_otel_span(
                    otel_span=otel_span, **observation_kwargs
                )

    # No usable trace context: the span becomes a child of whatever span is
    # currently active (or a root span if none is).
    otel_span = self._otel_tracer.start_span(name=name)

    return self._create_observation_from_otel_span(
        otel_span=otel_span, **observation_kwargs
    )

Create a new observation of the specified type.

This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation
  • status_message: Optional status message for the observation
  • completion_start_time: When the model started generating (for generation types)
  • model: Name/identifier of the AI model used (for generation types)
  • model_parameters: Parameters used for the model (for generation types)
  • usage_details: Token usage information (for generation types)
  • cost_details: Cost information (for generation types)
  • prompt: Associated prompt template (for generation types)
Returns:

An observation object of the appropriate type that must be ended with .end()

def start_as_current_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, end_on_exit: Optional[bool] = None) -> Union[opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration], opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan], opentelemetry.util._decorator._AgnosticContextManager[LangfuseAgent], opentelemetry.util._decorator._AgnosticContextManager[LangfuseTool], opentelemetry.util._decorator._AgnosticContextManager[LangfuseChain], opentelemetry.util._decorator._AgnosticContextManager[LangfuseRetriever], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEvaluator], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEmbedding], opentelemetry.util._decorator._AgnosticContextManager[LangfuseGuardrail]]:
 835    def start_as_current_observation(
 836        self,
 837        *,
 838        trace_context: Optional[TraceContext] = None,
 839        name: str,
 840        as_type: ObservationTypeLiteralNoEvent = "span",
 841        input: Optional[Any] = None,
 842        output: Optional[Any] = None,
 843        metadata: Optional[Any] = None,
 844        version: Optional[str] = None,
 845        level: Optional[SpanLevel] = None,
 846        status_message: Optional[str] = None,
 847        completion_start_time: Optional[datetime] = None,
 848        model: Optional[str] = None,
 849        model_parameters: Optional[Dict[str, MapValue]] = None,
 850        usage_details: Optional[Dict[str, int]] = None,
 851        cost_details: Optional[Dict[str, float]] = None,
 852        prompt: Optional[PromptClient] = None,
 853        end_on_exit: Optional[bool] = None,
 854    ) -> Union[
 855        _AgnosticContextManager[LangfuseGeneration],
 856        _AgnosticContextManager[LangfuseSpan],
 857        _AgnosticContextManager[LangfuseAgent],
 858        _AgnosticContextManager[LangfuseTool],
 859        _AgnosticContextManager[LangfuseChain],
 860        _AgnosticContextManager[LangfuseRetriever],
 861        _AgnosticContextManager[LangfuseEvaluator],
 862        _AgnosticContextManager[LangfuseEmbedding],
 863        _AgnosticContextManager[LangfuseGuardrail],
 864    ]:
 865        """Create a new observation and set it as the current span in a context manager.
 866
 867        This method creates a new observation of the specified type and sets it as the
 868        current span within a context manager. Use this method with a 'with' statement to
 869        automatically handle the observation lifecycle within a code block.
 870
 871        The created observation will be the child of the current span in the context.
 872
 873        Args:
 874            trace_context: Optional context for connecting to an existing trace
 875            name: Name of the observation (e.g., function or operation name)
 876            as_type: Type of observation to create (defaults to "span")
 877            input: Input data for the operation (can be any JSON-serializable object)
 878            output: Output data from the operation (can be any JSON-serializable object)
 879            metadata: Additional metadata to associate with the observation
 880            version: Version identifier for the code or component
 881            level: Importance level of the observation (info, warning, error)
 882            status_message: Optional status message for the observation
 883            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 884
 885            The following parameters are available when as_type is: "generation" or "embedding".
 886            completion_start_time: When the model started generating the response
 887            model: Name/identifier of the AI model used (e.g., "gpt-4")
 888            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 889            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 890            cost_details: Cost information for the model call
 891            prompt: Associated prompt template from Langfuse prompt management
 892
 893        Returns:
 894            A context manager that yields the appropriate observation type based on as_type
 895
 896        Example:
 897            ```python
 898            # Create a span
 899            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
 900                # Do work
 901                result = process_data()
 902                span.update(output=result)
 903
 904                # Create a child span automatically
 905                with span.start_as_current_observation(name="sub-operation") as child_span:
 906                    # Do sub-operation work
 907                    child_span.update(output="sub-result")
 908
 909            # Create a tool observation
 910            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
 911                # Do tool work
 912                results = search_web(query)
 913                tool.update(output=results)
 914
 915            # Create a generation observation
 916            with langfuse.start_as_current_observation(
 917                name="answer-generation",
 918                as_type="generation",
 919                model="gpt-4"
 920            ) as generation:
 921                # Generate answer
 922                response = llm.generate(...)
 923                generation.update(output=response)
 924            ```
 925        """
 926        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 927            if trace_context:
 928                trace_id = trace_context.get("trace_id", None)
 929                parent_span_id = trace_context.get("parent_span_id", None)
 930
 931                if trace_id:
 932                    remote_parent_span = self._create_remote_parent_span(
 933                        trace_id=trace_id, parent_span_id=parent_span_id
 934                    )
 935
 936                    return cast(
 937                        Union[
 938                            _AgnosticContextManager[LangfuseGeneration],
 939                            _AgnosticContextManager[LangfuseEmbedding],
 940                        ],
 941                        self._create_span_with_parent_context(
 942                            as_type=as_type,
 943                            name=name,
 944                            remote_parent_span=remote_parent_span,
 945                            parent=None,
 946                            end_on_exit=end_on_exit,
 947                            input=input,
 948                            output=output,
 949                            metadata=metadata,
 950                            version=version,
 951                            level=level,
 952                            status_message=status_message,
 953                            completion_start_time=completion_start_time,
 954                            model=model,
 955                            model_parameters=model_parameters,
 956                            usage_details=usage_details,
 957                            cost_details=cost_details,
 958                            prompt=prompt,
 959                        ),
 960                    )
 961
 962            return cast(
 963                Union[
 964                    _AgnosticContextManager[LangfuseGeneration],
 965                    _AgnosticContextManager[LangfuseEmbedding],
 966                ],
 967                self._start_as_current_otel_span_with_processed_media(
 968                    as_type=as_type,
 969                    name=name,
 970                    end_on_exit=end_on_exit,
 971                    input=input,
 972                    output=output,
 973                    metadata=metadata,
 974                    version=version,
 975                    level=level,
 976                    status_message=status_message,
 977                    completion_start_time=completion_start_time,
 978                    model=model,
 979                    model_parameters=model_parameters,
 980                    usage_details=usage_details,
 981                    cost_details=cost_details,
 982                    prompt=prompt,
 983                ),
 984            )
 985
 986        if as_type in get_observation_types_list(ObservationTypeSpanLike):
 987            if trace_context:
 988                trace_id = trace_context.get("trace_id", None)
 989                parent_span_id = trace_context.get("parent_span_id", None)
 990
 991                if trace_id:
 992                    remote_parent_span = self._create_remote_parent_span(
 993                        trace_id=trace_id, parent_span_id=parent_span_id
 994                    )
 995
 996                    return cast(
 997                        Union[
 998                            _AgnosticContextManager[LangfuseSpan],
 999                            _AgnosticContextManager[LangfuseAgent],
1000                            _AgnosticContextManager[LangfuseTool],
1001                            _AgnosticContextManager[LangfuseChain],
1002                            _AgnosticContextManager[LangfuseRetriever],
1003                            _AgnosticContextManager[LangfuseEvaluator],
1004                            _AgnosticContextManager[LangfuseGuardrail],
1005                        ],
1006                        self._create_span_with_parent_context(
1007                            as_type=as_type,
1008                            name=name,
1009                            remote_parent_span=remote_parent_span,
1010                            parent=None,
1011                            end_on_exit=end_on_exit,
1012                            input=input,
1013                            output=output,
1014                            metadata=metadata,
1015                            version=version,
1016                            level=level,
1017                            status_message=status_message,
1018                        ),
1019                    )
1020
1021            return cast(
1022                Union[
1023                    _AgnosticContextManager[LangfuseSpan],
1024                    _AgnosticContextManager[LangfuseAgent],
1025                    _AgnosticContextManager[LangfuseTool],
1026                    _AgnosticContextManager[LangfuseChain],
1027                    _AgnosticContextManager[LangfuseRetriever],
1028                    _AgnosticContextManager[LangfuseEvaluator],
1029                    _AgnosticContextManager[LangfuseGuardrail],
1030                ],
1031                self._start_as_current_otel_span_with_processed_media(
1032                    as_type=as_type,
1033                    name=name,
1034                    end_on_exit=end_on_exit,
1035                    input=input,
1036                    output=output,
1037                    metadata=metadata,
1038                    version=version,
1039                    level=level,
1040                    status_message=status_message,
1041                ),
1042            )
1043
1044        # This should never be reached since all valid types are handled above
1045        langfuse_logger.warning(
1046            f"Unknown observation type: {as_type}, falling back to span"
1047        )
1048        return self._start_as_current_otel_span_with_processed_media(
1049            as_type="span",
1050            name=name,
1051            end_on_exit=end_on_exit,
1052            input=input,
1053            output=output,
1054            metadata=metadata,
1055            version=version,
1056            level=level,
1057            status_message=status_message,
1058        )

Create a new observation and set it as the current span in a context manager.

This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.

The created observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation (e.g., function or operation name)
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the observation
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
  • The following parameters are only available when as_type is "generation" or "embedding":
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A context manager that yields the appropriate observation type based on as_type

Example:
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_observation(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
def update_current_generation( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> None:
1220    def update_current_generation(
1221        self,
1222        *,
1223        name: Optional[str] = None,
1224        input: Optional[Any] = None,
1225        output: Optional[Any] = None,
1226        metadata: Optional[Any] = None,
1227        version: Optional[str] = None,
1228        level: Optional[SpanLevel] = None,
1229        status_message: Optional[str] = None,
1230        completion_start_time: Optional[datetime] = None,
1231        model: Optional[str] = None,
1232        model_parameters: Optional[Dict[str, MapValue]] = None,
1233        usage_details: Optional[Dict[str, int]] = None,
1234        cost_details: Optional[Dict[str, float]] = None,
1235        prompt: Optional[PromptClient] = None,
1236    ) -> None:
1237        """Update the current active generation span with new information.
1238
1239        This method updates the current generation span in the active context with
1240        additional information. It's useful for adding output, usage stats, or other
1241        details that become available during or after model generation.
1242
1243        Args:
1244            name: The generation name
1245            input: Updated input data for the model
1246            output: Output from the model (e.g., completions)
1247            metadata: Additional metadata to associate with the generation
1248            version: Version identifier for the model or component
1249            level: Importance level of the generation (info, warning, error)
1250            status_message: Optional status message for the generation
1251            completion_start_time: When the model started generating the response
1252            model: Name/identifier of the AI model used (e.g., "gpt-4")
1253            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1254            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1255            cost_details: Cost information for the model call
1256            prompt: Associated prompt template from Langfuse prompt management
1257
1258        Example:
1259            ```python
1260            with langfuse.start_as_current_generation(name="answer-query") as generation:
1261                # Initial setup and API call
1262                response = llm.generate(...)
1263
1264                # Update with results that weren't available at creation time
1265                langfuse.update_current_generation(
1266                    output=response.text,
1267                    usage_details={
1268                        "prompt_tokens": response.usage.prompt_tokens,
1269                        "completion_tokens": response.usage.completion_tokens
1270                    }
1271                )
1272            ```
1273        """
1274        if not self._tracing_enabled:
1275            langfuse_logger.debug(
1276                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1277            )
1278            return
1279
1280        current_otel_span = self._get_current_otel_span()
1281
1282        if current_otel_span is not None:
1283            generation = LangfuseGeneration(
1284                otel_span=current_otel_span, langfuse_client=self
1285            )
1286
1287            if name:
1288                current_otel_span.update_name(name)
1289
1290            generation.update(
1291                input=input,
1292                output=output,
1293                metadata=metadata,
1294                version=version,
1295                level=level,
1296                status_message=status_message,
1297                completion_start_time=completion_start_time,
1298                model=model,
1299                model_parameters=model_parameters,
1300                usage_details=usage_details,
1301                cost_details=cost_details,
1302                prompt=prompt,
1303            )

Update the current active generation span with new information.

This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.

Arguments:
  • name: The generation name
  • input: Updated input data for the model
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (info, warning, error)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
def update_current_span( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> None:
1305    def update_current_span(
1306        self,
1307        *,
1308        name: Optional[str] = None,
1309        input: Optional[Any] = None,
1310        output: Optional[Any] = None,
1311        metadata: Optional[Any] = None,
1312        version: Optional[str] = None,
1313        level: Optional[SpanLevel] = None,
1314        status_message: Optional[str] = None,
1315    ) -> None:
1316        """Update the current active span with new information.
1317
1318        This method updates the current span in the active context with
1319        additional information. It's useful for adding outputs or metadata
1320        that become available during execution.
1321
1322        Args:
1323            name: The span name
1324            input: Updated input data for the operation
1325            output: Output data from the operation
1326            metadata: Additional metadata to associate with the span
1327            version: Version identifier for the code or component
1328            level: Importance level of the span (info, warning, error)
1329            status_message: Optional status message for the span
1330
1331        Example:
1332            ```python
1333            with langfuse.start_as_current_observation(name="process-data") as span:
1334                # Initial processing
1335                result = process_first_part()
1336
1337                # Update with intermediate results
1338                langfuse.update_current_span(metadata={"intermediate_result": result})
1339
1340                # Continue processing
1341                final_result = process_second_part(result)
1342
1343                # Final update
1344                langfuse.update_current_span(output=final_result)
1345            ```
1346        """
1347        if not self._tracing_enabled:
1348            langfuse_logger.debug(
1349                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1350            )
1351            return
1352
1353        current_otel_span = self._get_current_otel_span()
1354
1355        if current_otel_span is not None:
1356            span = LangfuseSpan(
1357                otel_span=current_otel_span,
1358                langfuse_client=self,
1359                environment=self._environment,
1360                release=self._release,
1361            )
1362
1363            if name:
1364                current_otel_span.update_name(name)
1365
1366            span.update(
1367                input=input,
1368                output=output,
1369                metadata=metadata,
1370                version=version,
1371                level=level,
1372                status_message=status_message,
1373            )

Update the current active span with new information.

This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.

Arguments:
  • name: The span name
  • input: Updated input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
Example:
with langfuse.start_as_current_observation(name="process-data") as span:
    # Initial processing
    result = process_first_part()

    # Update with intermediate results
    langfuse.update_current_span(metadata={"intermediate_result": result})

    # Continue processing
    final_result = process_second_part(result)

    # Final update
    langfuse.update_current_span(output=final_result)
@deprecated('Trace-level input/output is deprecated. For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. This method will be removed in a future major version.')
def set_current_trace_io( self, *, input: Optional[Any] = None, output: Optional[Any] = None) -> None:
1375    @deprecated(
1376        "Trace-level input/output is deprecated. "
1377        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
1378        "This method will be removed in a future major version."
1379    )
1380    def set_current_trace_io(
1381        self,
1382        *,
1383        input: Optional[Any] = None,
1384        output: Optional[Any] = None,
1385    ) -> None:
1386        """Set trace-level input and output for the current span's trace.
1387
1388        .. deprecated::
1389            This is a legacy method for backward compatibility with Langfuse platform
1390            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
1391            evaluators). It will be removed in a future major version.
1392
1393            For setting other trace attributes (user_id, session_id, metadata, tags, version),
1394            use :meth:`propagate_attributes` instead.
1395
1396        Args:
1397            input: Input data to associate with the trace.
1398            output: Output data to associate with the trace.
1399        """
1400        if not self._tracing_enabled:
1401            langfuse_logger.debug(
1402                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
1403            )
1404            return
1405
1406        current_otel_span = self._get_current_otel_span()
1407
1408        if current_otel_span is not None and current_otel_span.is_recording():
1409            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1410                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1411            )
1412            # We need to preserve the class to keep the correct observation type
1413            span_class = self._get_span_class(existing_observation_type)
1414            span = span_class(
1415                otel_span=current_otel_span,
1416                langfuse_client=self,
1417                environment=self._environment,
1418                release=self._release,
1419            )
1420
1421            span.set_trace_io(
1422                input=input,
1423                output=output,
1424            )

Set trace-level input and output for the current span's trace.

Deprecated: This is a legacy method for backward compatibility with Langfuse platform features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future major version.

For setting other trace attributes (user_id, session_id, metadata, tags, version), use propagate_attributes() instead.

Arguments:
  • input: Input data to associate with the trace.
  • output: Output data to associate with the trace.
def set_current_trace_as_public(self) -> None:
1426    def set_current_trace_as_public(self) -> None:
1427        """Make the current trace publicly accessible via its URL.
1428
1429        When a trace is published, anyone with the trace link can view the full trace
1430        without needing to be logged in to Langfuse. This action cannot be undone
1431        programmatically - once published, the entire trace becomes public.
1432
1433        This is a convenience method that publishes the trace from the currently
1434        active span context. Use this when you want to make a trace public from
1435        within a traced function without needing direct access to the span object.
1436        """
1437        if not self._tracing_enabled:
1438            langfuse_logger.debug(
1439                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
1440            )
1441            return
1442
1443        current_otel_span = self._get_current_otel_span()
1444
1445        if current_otel_span is not None and current_otel_span.is_recording():
1446            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1447                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1448            )
1449            # We need to preserve the class to keep the correct observation type
1450            span_class = self._get_span_class(existing_observation_type)
1451            span = span_class(
1452                otel_span=current_otel_span,
1453                langfuse_client=self,
1454                environment=self._environment,
1455            )
1456
1457            span.set_trace_as_public()

Make the current trace publicly accessible via its URL.

When a trace is published, anyone with the trace link can view the full trace without needing to be logged in to Langfuse. This action cannot be undone programmatically - once published, the entire trace becomes public.

This is a convenience method that publishes the trace from the currently active span context. Use this when you want to make a trace public from within a traced function without needing direct access to the span object.

def create_event( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseEvent:
1459    def create_event(
1460        self,
1461        *,
1462        trace_context: Optional[TraceContext] = None,
1463        name: str,
1464        input: Optional[Any] = None,
1465        output: Optional[Any] = None,
1466        metadata: Optional[Any] = None,
1467        version: Optional[str] = None,
1468        level: Optional[SpanLevel] = None,
1469        status_message: Optional[str] = None,
1470    ) -> LangfuseEvent:
1471        """Create a new Langfuse observation of type 'EVENT'.
1472
1473        The created Langfuse Event observation will be the child of the current span in the context.
1474
1475        Args:
1476            trace_context: Optional context for connecting to an existing trace
1477            name: Name of the span (e.g., function or operation name)
1478            input: Input data for the operation (can be any JSON-serializable object)
1479            output: Output data from the operation (can be any JSON-serializable object)
1480            metadata: Additional metadata to associate with the span
1481            version: Version identifier for the code or component
1482            level: Importance level of the span (info, warning, error)
1483            status_message: Optional status message for the span
1484
1485        Returns:
1486            The Langfuse Event object
1487
1488        Example:
1489            ```python
1490            event = langfuse.create_event(name="process-event")
1491            ```
1492        """
1493        timestamp = time_ns()
1494
1495        if trace_context:
1496            trace_id = trace_context.get("trace_id", None)
1497            parent_span_id = trace_context.get("parent_span_id", None)
1498
1499            if trace_id:
1500                remote_parent_span = self._create_remote_parent_span(
1501                    trace_id=trace_id, parent_span_id=parent_span_id
1502                )
1503
1504                with otel_trace_api.use_span(
1505                    cast(otel_trace_api.Span, remote_parent_span)
1506                ):
1507                    otel_span = self._otel_tracer.start_span(
1508                        name=name, start_time=timestamp
1509                    )
1510                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1511
1512                    return cast(
1513                        LangfuseEvent,
1514                        LangfuseEvent(
1515                            otel_span=otel_span,
1516                            langfuse_client=self,
1517                            environment=self._environment,
1518                            release=self._release,
1519                            input=input,
1520                            output=output,
1521                            metadata=metadata,
1522                            version=version,
1523                            level=level,
1524                            status_message=status_message,
1525                        ).end(end_time=timestamp),
1526                    )
1527
1528        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1529
1530        return cast(
1531            LangfuseEvent,
1532            LangfuseEvent(
1533                otel_span=otel_span,
1534                langfuse_client=self,
1535                environment=self._environment,
1536                release=self._release,
1537                input=input,
1538                output=output,
1539                metadata=metadata,
1540                version=version,
1541                level=level,
1542                status_message=status_message,
1543            ).end(end_time=timestamp),
1544        )

Create a new Langfuse observation of type 'EVENT'.

The created Langfuse Event observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
Returns:

The Langfuse Event object

Example:
event = langfuse.create_event(name="process-event")
@staticmethod
def create_trace_id(*, seed: Optional[str] = None) -> str:
1633    @staticmethod
1634    def create_trace_id(*, seed: Optional[str] = None) -> str:
1635        """Create a unique trace ID for use with Langfuse.
1636
1637        This method generates a unique trace ID for use with various Langfuse APIs.
1638        It can either generate a random ID or create a deterministic ID based on
1639        a seed string.
1640
1641        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1642        This method ensures the generated ID meets this requirement. If you need to
1643        correlate an external ID with a Langfuse trace ID, use the external ID as the
1644        seed to get a valid, deterministic Langfuse trace ID.
1645
1646        Args:
1647            seed: Optional string to use as a seed for deterministic ID generation.
1648                 If provided, the same seed will always produce the same ID.
1649                 If not provided, a random ID will be generated.
1650
1651        Returns:
1652            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1653
1654        Example:
1655            ```python
1656            # Generate a random trace ID
1657            trace_id = langfuse.create_trace_id()
1658
1659            # Generate a deterministic ID based on a seed
1660            session_trace_id = langfuse.create_trace_id(seed="session-456")
1661
1662            # Correlate an external ID with a Langfuse trace ID
1663            external_id = "external-system-123456"
1664            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1665
1666            # Use the ID with trace context
1667            with langfuse.start_as_current_observation(
1668                name="process-request",
1669                trace_context={"trace_id": trace_id}
1670            ) as span:
1671                # Operation will be part of the specific trace
1672                pass
1673            ```
1674        """
1675        if not seed:
1676            trace_id_int = RandomIdGenerator().generate_trace_id()
1677
1678            return Langfuse._format_otel_trace_id(trace_id_int)
1679
1680        return sha256(seed.encode("utf-8")).digest()[:16].hex()

Create a unique trace ID for use with Langfuse.

This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.

Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.

Arguments:
  • seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:

A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

Example:
# Generate a random trace ID
trace_id = langfuse.create_trace_id()

# Generate a deterministic ID based on a seed
session_trace_id = langfuse.create_trace_id(seed="session-456")

# Correlate an external ID with a Langfuse trace ID
external_id = "external-system-123456"
correlated_trace_id = langfuse.create_trace_id(seed=external_id)

# Use the ID with trace context
with langfuse.start_as_current_observation(
    name="process-request",
    trace_context={"trace_id": trace_id}
) as span:
    # Operation will be part of the specific trace
    pass
def create_score( self, *, name: str, value: Union[float, str], session_id: Optional[str] = None, dataset_run_id: Optional[str] = None, trace_id: Optional[str] = None, observation_id: Optional[str] = None, score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None, timestamp: Optional[datetime.datetime] = None) -> None:
1758    def create_score(
1759        self,
1760        *,
1761        name: str,
1762        value: Union[float, str],
1763        session_id: Optional[str] = None,
1764        dataset_run_id: Optional[str] = None,
1765        trace_id: Optional[str] = None,
1766        observation_id: Optional[str] = None,
1767        score_id: Optional[str] = None,
1768        data_type: Optional[ScoreDataType] = None,
1769        comment: Optional[str] = None,
1770        config_id: Optional[str] = None,
1771        metadata: Optional[Any] = None,
1772        timestamp: Optional[datetime] = None,
1773    ) -> None:
1774        """Create a score for a specific trace or observation.
1775
1776        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
1777        used to track quality metrics, user feedback, or automated evaluations.
1778
1779        Args:
1780            name: Name of the score (e.g., "relevance", "accuracy")
1781            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
1782            session_id: ID of the Langfuse session to associate the score with
1783            dataset_run_id: ID of the Langfuse dataset run to associate the score with
1784            trace_id: ID of the Langfuse trace to associate the score with
1785            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
1786            score_id: Optional custom ID for the score (auto-generated if not provided)
1787            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
1788            comment: Optional comment or explanation for the score
1789            config_id: Optional ID of a score config defined in Langfuse
1790            metadata: Optional metadata to be attached to the score
1791            timestamp: Optional timestamp for the score (defaults to current UTC time)
1792
1793        Example:
1794            ```python
1795            # Create a numeric score for accuracy
1796            langfuse.create_score(
1797                name="accuracy",
1798                value=0.92,
1799                trace_id="abcdef1234567890abcdef1234567890",
1800                data_type="NUMERIC",
1801                comment="High accuracy with minor irrelevant details"
1802            )
1803
1804            # Create a categorical score for sentiment
1805            langfuse.create_score(
1806                name="sentiment",
1807                value="positive",
1808                trace_id="abcdef1234567890abcdef1234567890",
1809                observation_id="abcdef1234567890",
1810                data_type="CATEGORICAL"
1811            )
1812            ```
1813        """
1814        if not self._tracing_enabled:
1815            return
1816
1817        score_id = score_id or self._create_observation_id()
1818
1819        try:
1820            new_body = ScoreBody(
1821                id=score_id,
1822                session_id=session_id,
1823                datasetRunId=dataset_run_id,
1824                traceId=trace_id,
1825                observationId=observation_id,
1826                name=name,
1827                value=value,
1828                dataType=data_type,  # type: ignore
1829                comment=comment,
1830                configId=config_id,
1831                environment=self._environment,
1832                metadata=metadata,
1833            )
1834
1835            event = {
1836                "id": self.create_trace_id(),
1837                "type": "score-create",
1838                "timestamp": timestamp or _get_timestamp(),
1839                "body": new_body,
1840            }
1841
1842            if self._resources is not None:
1843                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
1844                force_sample = (
1845                    not self._is_valid_trace_id(trace_id) if trace_id else True
1846                )
1847
1848                self._resources.add_score_task(
1849                    event,
1850                    force_sample=force_sample,
1851                )
1852
1853        except Exception as e:
1854            langfuse_logger.exception(
1855                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
1856            )

Create a score for a specific trace or observation.

This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • session_id: ID of the Langfuse session to associate the score with
  • dataset_run_id: ID of the Langfuse dataset run to associate the score with
  • trace_id: ID of the Langfuse trace to associate the score with
  • observation_id: Optional ID of the specific observation to score. If provided, trace_id must also be provided.
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
  • timestamp: Optional timestamp for the score (defaults to current UTC time)
Example:
# Create a numeric score for accuracy
langfuse.create_score(
    name="accuracy",
    value=0.92,
    trace_id="abcdef1234567890abcdef1234567890",
    data_type="NUMERIC",
    comment="High accuracy with minor irrelevant details"
)

# Create a categorical score for sentiment
langfuse.create_score(
    name="sentiment",
    value="positive",
    trace_id="abcdef1234567890abcdef1234567890",
    observation_id="abcdef1234567890",
    data_type="CATEGORICAL"
)
def score_current_span( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
1917    def score_current_span(
1918        self,
1919        *,
1920        name: str,
1921        value: Union[float, str],
1922        score_id: Optional[str] = None,
1923        data_type: Optional[ScoreDataType] = None,
1924        comment: Optional[str] = None,
1925        config_id: Optional[str] = None,
1926        metadata: Optional[Any] = None,
1927    ) -> None:
1928        """Create a score for the current active span.
1929
1930        This method scores the currently active span in the context. It's a convenient
1931        way to score the current operation without needing to know its trace and span IDs.
1932
1933        Args:
1934            name: Name of the score (e.g., "relevance", "accuracy")
1935            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
1936            score_id: Optional custom ID for the score (auto-generated if not provided)
1937            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
1938            comment: Optional comment or explanation for the score
1939            config_id: Optional ID of a score config defined in Langfuse
1940            metadata: Optional metadata to be attached to the score
1941
1942        Example:
1943            ```python
1944            with langfuse.start_as_current_generation(name="answer-query") as generation:
1945                # Generate answer
1946                response = generate_answer(...)
1947                generation.update(output=response)
1948
1949                # Score the generation
1950                langfuse.score_current_span(
1951                    name="relevance",
1952                    value=0.85,
1953                    data_type="NUMERIC",
1954                    comment="Mostly relevant but contains some tangential information",
1955                    metadata={"model": "gpt-4", "prompt_version": "v2"}
1956                )
1957            ```
1958        """
1959        current_span = self._get_current_otel_span()
1960
1961        if current_span is not None:
1962            trace_id = self._get_otel_trace_id(current_span)
1963            observation_id = self._get_otel_span_id(current_span)
1964
1965            langfuse_logger.info(
1966                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
1967            )
1968
1969            self.create_score(
1970                trace_id=trace_id,
1971                observation_id=observation_id,
1972                name=name,
1973                value=cast(str, value),
1974                score_id=score_id,
1975                data_type=cast(Literal["CATEGORICAL"], data_type),
1976                comment=comment,
1977                config_id=config_id,
1978                metadata=metadata,
1979            )

Create a score for the current active span.

This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Generate answer
    response = generate_answer(...)
    generation.update(output=response)

    # Score the generation
    langfuse.score_current_span(
        name="relevance",
        value=0.85,
        data_type="NUMERIC",
        comment="Mostly relevant but contains some tangential information",
        metadata={"model": "gpt-4", "prompt_version": "v2"}
    )
def score_current_trace( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
2007    def score_current_trace(
2008        self,
2009        *,
2010        name: str,
2011        value: Union[float, str],
2012        score_id: Optional[str] = None,
2013        data_type: Optional[ScoreDataType] = None,
2014        comment: Optional[str] = None,
2015        config_id: Optional[str] = None,
2016        metadata: Optional[Any] = None,
2017    ) -> None:
2018        """Create a score for the current trace.
2019
2020        This method scores the trace of the currently active span. Unlike score_current_span,
2021        this method associates the score with the entire trace rather than a specific span.
2022        It's useful for scoring overall performance or quality of the entire operation.
2023
2024        Args:
2025            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2026            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2027            score_id: Optional custom ID for the score (auto-generated if not provided)
2028            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2029            comment: Optional comment or explanation for the score
2030            config_id: Optional ID of a score config defined in Langfuse
2031            metadata: Optional metadata to be attached to the score
2032
2033        Example:
2034            ```python
2035            with langfuse.start_as_current_observation(name="process-user-request") as span:
2036                # Process request
2037                result = process_complete_request()
2038                span.update(output=result)
2039
2040                # Score the overall trace
2041                langfuse.score_current_trace(
2042                    name="overall_quality",
2043                    value=0.95,
2044                    data_type="NUMERIC",
2045                    comment="High quality end-to-end response",
2046                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2047                )
2048            ```
2049        """
2050        current_span = self._get_current_otel_span()
2051
2052        if current_span is not None:
2053            trace_id = self._get_otel_trace_id(current_span)
2054
2055            langfuse_logger.info(
2056                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2057            )
2058
2059            self.create_score(
2060                trace_id=trace_id,
2061                name=name,
2062                value=cast(str, value),
2063                score_id=score_id,
2064                data_type=cast(Literal["CATEGORICAL"], data_type),
2065                comment=comment,
2066                config_id=config_id,
2067                metadata=metadata,
2068            )

Create a score for the current trace.

This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation.

Arguments:
  • name: Name of the score (e.g., "user_satisfaction", "overall_quality")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_observation(name="process-user-request") as span:
    # Process request
    result = process_complete_request()
    span.update(output=result)

    # Score the overall trace
    langfuse.score_current_trace(
        name="overall_quality",
        value=0.95,
        data_type="NUMERIC",
        comment="High quality end-to-end response",
        metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
    )
def flush(self) -> None:
2070    def flush(self) -> None:
2071        """Force flush all pending spans and events to the Langfuse API.
2072
2073        This method manually flushes any pending spans, scores, and other events to the
2074        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2075        before proceeding, without waiting for the automatic flush interval.
2076
2077        Example:
2078            ```python
2079            # Record some spans and scores
2080            with langfuse.start_as_current_observation(name="operation") as span:
2081                # Do work...
2082                pass
2083
2084            # Ensure all data is sent to Langfuse before proceeding
2085            langfuse.flush()
2086
2087            # Continue with other work
2088            ```
2089        """
2090        if self._resources is not None:
2091            self._resources.flush()

Force flush all pending spans and events to the Langfuse API.

This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.

Example:
# Record some spans and scores
with langfuse.start_as_current_observation(name="operation") as span:
    # Do work...
    pass

# Ensure all data is sent to Langfuse before proceeding
langfuse.flush()

# Continue with other work
def shutdown(self) -> None:
2093    def shutdown(self) -> None:
2094        """Shut down the Langfuse client and flush all pending data.
2095
2096        This method cleanly shuts down the Langfuse client, ensuring all pending data
2097        is flushed to the API and all background threads are properly terminated.
2098
2099        It's important to call this method when your application is shutting down to
2100        prevent data loss and resource leaks. For most applications, using the client
2101        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2102
2103        Example:
2104            ```python
2105            # Initialize Langfuse
2106            langfuse = Langfuse(public_key="...", secret_key="...")
2107
2108            # Use Langfuse throughout your application
2109            # ...
2110
2111            # When application is shutting down
2112            langfuse.shutdown()
2113            ```
2114        """
2115        if self._resources is not None:
2116            self._resources.shutdown()

Shut down the Langfuse client and flush all pending data.

This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.

It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.

Example:
# Initialize Langfuse
langfuse = Langfuse(public_key="...", secret_key="...")

# Use Langfuse throughout your application
# ...

# When application is shutting down
langfuse.shutdown()
def get_current_trace_id(self) -> Optional[str]:
2118    def get_current_trace_id(self) -> Optional[str]:
2119        """Get the trace ID of the current active span.
2120
2121        This method retrieves the trace ID from the currently active span in the context.
2122        It can be used to get the trace ID for referencing in logs, external systems,
2123        or for creating related operations.
2124
2125        Returns:
2126            The current trace ID as a 32-character lowercase hexadecimal string,
2127            or None if there is no active span.
2128
2129        Example:
2130            ```python
2131            with langfuse.start_as_current_observation(name="process-request") as span:
2132                # Get the current trace ID for reference
2133                trace_id = langfuse.get_current_trace_id()
2134
2135                # Use it for external correlation
2136                log.info(f"Processing request with trace_id: {trace_id}")
2137
2138                # Or pass to another system
2139                external_system.process(data, trace_id=trace_id)
2140            ```
2141        """
2142        if not self._tracing_enabled:
2143            langfuse_logger.debug(
2144                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2145            )
2146            return None
2147
2148        current_otel_span = self._get_current_otel_span()
2149
2150        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None

Get the trace ID of the current active span.

This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.

Returns:

The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_observation(name="process-request") as span:
    # Get the current trace ID for reference
    trace_id = langfuse.get_current_trace_id()

    # Use it for external correlation
    log.info(f"Processing request with trace_id: {trace_id}")

    # Or pass to another system
    external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
2152    def get_current_observation_id(self) -> Optional[str]:
2153        """Get the observation ID (span ID) of the current active span.
2154
2155        This method retrieves the observation ID from the currently active span in the context.
2156        It can be used to get the observation ID for referencing in logs, external systems,
2157        or for creating scores or other related operations.
2158
2159        Returns:
2160            The current observation ID as a 16-character lowercase hexadecimal string,
2161            or None if there is no active span.
2162
2163        Example:
2164            ```python
2165            with langfuse.start_as_current_observation(name="process-user-query") as span:
2166                # Get the current observation ID
2167                observation_id = langfuse.get_current_observation_id()
2168
2169                # Store it for later reference
2170                cache.set(f"query_{query_id}_observation", observation_id)
2171
2172                # Process the query...
2173            ```
2174        """
2175        if not self._tracing_enabled:
2176            langfuse_logger.debug(
2177                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2178            )
2179            return None
2180
2181        current_otel_span = self._get_current_otel_span()
2182
2183        return self._get_otel_span_id(current_otel_span) if current_otel_span else None

Get the observation ID (span ID) of the current active span.

This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.

Returns:

The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_observation(name="process-user-query") as span:
    # Get the current observation ID
    observation_id = langfuse.get_current_observation_id()

    # Store it for later reference
    cache.set(f"query_{query_id}_observation", observation_id)

    # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2196    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2197        """Get the URL to view a trace in the Langfuse UI.
2198
2199        This method generates a URL that links directly to a trace in the Langfuse UI.
2200        It's useful for providing links in logs, notifications, or debugging tools.
2201
2202        Args:
2203            trace_id: Optional trace ID to generate a URL for. If not provided,
2204                     the trace ID of the current active span will be used.
2205
2206        Returns:
2207            A URL string pointing to the trace in the Langfuse UI,
2208            or None if the project ID couldn't be retrieved or no trace ID is available.
2209
2210        Example:
2211            ```python
2212            # Get URL for the current trace
2213            with langfuse.start_as_current_observation(name="process-request") as span:
2214                trace_url = langfuse.get_trace_url()
2215                log.info(f"Processing trace: {trace_url}")
2216
2217            # Get URL for a specific trace
2218            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2219            send_notification(f"Review needed for trace: {specific_trace_url}")
2220            ```
2221        """
2222        final_trace_id = trace_id or self.get_current_trace_id()
2223        if not final_trace_id:
2224            return None
2225
2226        project_id = self._get_project_id()
2227
2228        return (
2229            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2230            if project_id and final_trace_id
2231            else None
2232        )

Get the URL to view a trace in the Langfuse UI.

This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.

Arguments:
  • trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:

A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.

Example:
# Get URL for the current trace
with langfuse.start_as_current_observation(name="process-request") as span:
    trace_url = langfuse.get_trace_url()
    log.info(f"Processing trace: {trace_url}")

# Get URL for a specific trace
specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
send_notification(f"Review needed for trace: {specific_trace_url}")
def get_dataset( self, name: str, *, fetch_items_page_size: Optional[int] = 50, version: Optional[datetime.datetime] = None) -> langfuse._client.datasets.DatasetClient:
2234    def get_dataset(
2235        self,
2236        name: str,
2237        *,
2238        fetch_items_page_size: Optional[int] = 50,
2239        version: Optional[datetime] = None,
2240    ) -> "DatasetClient":
2241        """Fetch a dataset by its name.
2242
2243        Args:
2244            name (str): The name of the dataset to fetch.
2245            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2246            version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC).
2247                If provided, returns the state of items at the specified UTC timestamp.
2248                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
2249
2250        Returns:
2251            DatasetClient: The dataset with the given name.
2252        """
2253        try:
2254            langfuse_logger.debug(f"Getting datasets {name}")
2255            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2256
2257            dataset_items = []
2258            page = 1
2259
2260            while True:
2261                new_items = self.api.dataset_items.list(
2262                    dataset_name=self._url_encode(name, is_url_param=True),
2263                    page=page,
2264                    limit=fetch_items_page_size,
2265                    version=version,
2266                )
2267                dataset_items.extend(new_items.data)
2268
2269                if new_items.meta.total_pages <= page:
2270                    break
2271
2272                page += 1
2273
2274            return DatasetClient(
2275                dataset=dataset,
2276                items=dataset_items,
2277                version=version,
2278                langfuse_client=self,
2279            )
2280
2281        except Error as e:
2282            handle_fern_exception(e)
2283            raise e

Fetch a dataset by its name.

Arguments:
  • name (str): The name of the dataset to fetch.
  • fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
  • version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC). If provided, returns the state of items at the specified UTC timestamp. If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
Returns:

DatasetClient: The dataset with the given name, including all of its items (fetched page by page using fetch_items_page_size).

def get_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DatasetRunWithItems:
2285    def get_dataset_run(
2286        self, *, dataset_name: str, run_name: str
2287    ) -> DatasetRunWithItems:
2288        """Fetch a dataset run by dataset name and run name.
2289
2290        Args:
2291            dataset_name (str): The name of the dataset.
2292            run_name (str): The name of the run.
2293
2294        Returns:
2295            DatasetRunWithItems: The dataset run with its items.
2296        """
2297        try:
2298            return cast(
2299                DatasetRunWithItems,
2300                self.api.datasets.get_run(
2301                    dataset_name=self._url_encode(dataset_name),
2302                    run_name=self._url_encode(run_name),
2303                    request_options=None,
2304                ),
2305            )
2306        except Error as e:
2307            handle_fern_exception(e)
2308            raise e

Fetch a dataset run by dataset name and run name.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DatasetRunWithItems: The dataset run with its items.

def get_dataset_runs( self, *, dataset_name: str, page: Optional[int] = None, limit: Optional[int] = None) -> langfuse.api.PaginatedDatasetRuns:
2310    def get_dataset_runs(
2311        self,
2312        *,
2313        dataset_name: str,
2314        page: Optional[int] = None,
2315        limit: Optional[int] = None,
2316    ) -> PaginatedDatasetRuns:
2317        """Fetch all runs for a dataset.
2318
2319        Args:
2320            dataset_name (str): The name of the dataset.
2321            page (Optional[int]): Page number, starts at 1.
2322            limit (Optional[int]): Limit of items per page.
2323
2324        Returns:
2325            PaginatedDatasetRuns: Paginated list of dataset runs.
2326        """
2327        try:
2328            return cast(
2329                PaginatedDatasetRuns,
2330                self.api.datasets.get_runs(
2331                    dataset_name=self._url_encode(dataset_name),
2332                    page=page,
2333                    limit=limit,
2334                    request_options=None,
2335                ),
2336            )
2337        except Error as e:
2338            handle_fern_exception(e)
2339            raise e

Fetch all runs for a dataset.

Arguments:
  • dataset_name (str): The name of the dataset.
  • page (Optional[int]): Page number, starts at 1.
  • limit (Optional[int]): Limit of items per page.
Returns:

PaginatedDatasetRuns: Paginated list of dataset runs.

def delete_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DeleteDatasetRunResponse:
2341    def delete_dataset_run(
2342        self, *, dataset_name: str, run_name: str
2343    ) -> DeleteDatasetRunResponse:
2344        """Delete a dataset run and all its run items. This action is irreversible.
2345
2346        Args:
2347            dataset_name (str): The name of the dataset.
2348            run_name (str): The name of the run.
2349
2350        Returns:
2351            DeleteDatasetRunResponse: Confirmation of deletion.
2352        """
2353        try:
2354            return cast(
2355                DeleteDatasetRunResponse,
2356                self.api.datasets.delete_run(
2357                    dataset_name=self._url_encode(dataset_name),
2358                    run_name=self._url_encode(run_name),
2359                    request_options=None,
2360                ),
2361            )
2362        except Error as e:
2363            handle_fern_exception(e)
2364            raise e

Delete a dataset run and all its run items. This action is irreversible.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DeleteDatasetRunResponse: Confirmation of deletion.

def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem]], task: langfuse.experiment.TaskFunction, evaluators: List[langfuse.experiment.EvaluatorFunction] = [], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, run_evaluators: List[langfuse.experiment.RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None, _dataset_version: Optional[datetime.datetime] = None) -> langfuse.experiment.ExperimentResult:
def run_experiment(
    self,
    *,
    name: str,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    data: ExperimentData,
    task: TaskFunction,
    evaluators: Optional[List[EvaluatorFunction]] = None,
    composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
    run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
    max_concurrency: int = 50,
    metadata: Optional[Dict[str, str]] = None,
    _dataset_version: Optional[datetime] = None,
) -> ExperimentResult:
    """Run an experiment on a dataset with automatic tracing and evaluation.

    Executes `task` on every item in `data`, traces each execution in
    Langfuse, runs item-level evaluators (and an optional composite
    evaluator) on each output, runs run-level evaluators over the whole
    result set, and returns the aggregated results.

    The experiment system provides:
    - Automatic tracing of all task executions
    - Concurrent processing with configurable limits
    - Comprehensive error handling that isolates failures
    - Integration with Langfuse datasets for experiment tracking
    - Flexible evaluation framework supporting both sync and async evaluators

    Args:
        name: Human-readable name for the experiment, used for
            identification in the Langfuse UI.
        run_name: Optional exact name for the experiment run. Used as the
            exact dataset run name when `data` contains Langfuse dataset
            items; defaults to the experiment name appended with an ISO
            timestamp.
        description: Optional description of the experiment's purpose,
            methodology, or expected outcomes.
        data: Items to process. Either a list of dict-like items with
            'input', 'expected_output', 'metadata' keys, or a list of
            Langfuse DatasetItem objects from dataset.items.
        task: Function that processes each data item and returns output.
            Must accept 'item' as keyword argument; may be sync or async.
            Signature: task(*, item, **kwargs) -> Any.
        evaluators: Item-level evaluator functions. Each receives input,
            output, expected_output, and metadata, and returns a single
            Evaluation dict or a list of them. Defaults to no evaluators.
        composite_evaluator: Optional function creating a composite score
            from item-level evaluations. Receives the same inputs as
            item-level evaluators plus their evaluations. Useful for
            weighted averages or pass/fail decisions over multiple metrics.
        run_evaluators: Run-level evaluator functions. Each receives all
            item_results and can compute aggregate metrics (averages,
            distributions, cross-item comparisons). Defaults to none.
        max_concurrency: Maximum number of concurrent task executions
            (default: 50). Adjust for API rate limits and resources.
        metadata: Optional metadata attached to every experiment trace
            and, when `data` are Langfuse dataset items, to the dataset
            run as well.

    Returns:
        ExperimentResult containing:
        - run_name: The experiment run name (equals the dataset run name
          when run on a Langfuse dataset)
        - item_results: Per-item results with outputs and evaluations
        - run_evaluations: Aggregate evaluation results for the run
        - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
        - dataset_run_url: Direct URL to the results in the Langfuse UI
          (if applicable)

    Raises:
        ValueError: If required parameters are missing or invalid.
        Exception: If experiment setup fails (individual item failures are
            handled gracefully and do not abort the run).

    Examples:
        Basic experiment with local data:
        ```python
        def summarize_text(*, item, **kwargs):
            return f"Summary: {item['input'][:50]}..."

        def length_evaluator(*, input, output, expected_output=None, **kwargs):
            return {
                "name": "output_length",
                "value": len(output),
                "comment": f"Output contains {len(output)} characters"
            }

        result = langfuse.run_experiment(
            name="Text Summarization Test",
            description="Evaluate summarization quality and length",
            data=[
                {"input": "Long article text...", "expected_output": "Expected summary"},
            ],
            task=summarize_text,
            evaluators=[length_evaluator]
        )

        for item_result in result.item_results:
            print(item_result.output, item_result.evaluations)
        ```

        Using with Langfuse datasets:
        ```python
        dataset = langfuse.get_dataset("my-eval-dataset")

        result = dataset.run_experiment(
            name="Production Model Evaluation",
            task=my_production_task,
            evaluators=[accuracy_evaluator]
        )
        # Results are automatically linked to the dataset in the Langfuse UI
        ```

    Note:
        - Task and evaluator functions can be synchronous or asynchronous
        - Individual item failures are logged but don't stop the experiment
        - All executions are automatically traced and visible in Langfuse UI
        - Works in both sync and async contexts (Jupyter, web apps, etc.);
          async execution is handled automatically with event loop detection
    """
    # `None` defaults avoid the shared-mutable-default-argument pitfall;
    # passing explicit lists remains fully supported, and `or []` below
    # normalizes None exactly as it previously normalized empty lists.
    return cast(
        ExperimentResult,
        run_async_safely(
            self._run_experiment_async(
                name=name,
                run_name=self._create_experiment_run_name(
                    name=name, run_name=run_name
                ),
                description=description,
                data=data,
                task=task,
                evaluators=evaluators or [],
                composite_evaluator=composite_evaluator,
                run_evaluators=run_evaluators or [],
                max_concurrency=max_concurrency,
                metadata=metadata,
                dataset_version=_dataset_version,
            ),
        ),
    )

Run an experiment on a dataset with automatic tracing and evaluation.

This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.

The experiment system provides:

  • Automatic tracing of all task executions
  • Concurrent processing with configurable limits
  • Comprehensive error handling that isolates failures
  • Integration with Langfuse datasets for experiment tracking
  • Flexible evaluation framework supporting both sync and async evaluators
Arguments:
  • name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
  • run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the data contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
  • description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
  • data: Array of data items to process. Can be either:
    • List of dict-like items with 'input', 'expected_output', 'metadata' keys
    • List of Langfuse DatasetItem objects from dataset.items
  • task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
  • evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
  • composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
  • run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
  • max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
  • metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If data are Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:

ExperimentResult containing:

  • run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
  • item_results: List of results for each processed item with outputs and evaluations
  • run_evaluations: List of aggregate evaluation results for the entire run
  • dataset_run_id: ID of the dataset run (if using Langfuse datasets)
  • dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
  • ValueError: If required parameters are missing or invalid
  • Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:

Basic experiment with local data:

def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")

Advanced experiment with async task and multiple evaluators:

async def llm_task(*, item, **kwargs):
    # Simulate async LLM call
    response = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": item["input"]}]
    )
    return response.choices[0].message.content

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output and expected_output.lower() in output.lower():
        return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
    return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
    # Simulate toxicity check
    toxicity_score = check_toxicity(output)  # Your toxicity checker
    return {
        "name": "toxicity",
        "value": toxicity_score,
        "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
    }

def average_accuracy(*, item_results, **kwargs):
    accuracies = [
        eval.value for result in item_results
        for eval in result.evaluations
        if eval.name == "accuracy"
    ]
    return {
        "name": "average_accuracy",
        "value": sum(accuracies) / len(accuracies) if accuracies else 0,
        "comment": f"Average accuracy across {len(accuracies)} items"
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    description="Evaluate model accuracy and safety across diverse prompts",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    run_evaluators=[average_accuracy],
    max_concurrency=5,  # Limit concurrent API calls
    metadata={"model": "gpt-4", "temperature": 0.7}
)

Using with Langfuse datasets:

# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result['dataset_run_url']}")
Note:
  • Task and evaluator functions can be either synchronous or asynchronous
  • Individual item failures are logged but don't stop the experiment
  • All executions are automatically traced and visible in Langfuse UI
  • When using Langfuse datasets, results are automatically linked for easy comparison
  • This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
  • Async execution is handled automatically with smart event loop detection
def run_batched_evaluation( self, *, scope: Literal['traces', 'observations'], mapper: MapperFunction, filter: Optional[str] = None, fetch_batch_size: int = 50, fetch_trace_fields: Optional[str] = None, max_items: Optional[int] = None, max_retries: int = 3, evaluators: List[langfuse.experiment.EvaluatorFunction], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, max_concurrency: int = 5, metadata: Optional[Dict[str, Any]] = None, _add_observation_scores_to_trace: bool = False, _additional_trace_tags: Optional[List[str]] = None, resume_from: Optional[BatchEvaluationResumeToken] = None, verbose: bool = False) -> BatchEvaluationResult:
def run_batched_evaluation(
    self,
    *,
    scope: Literal["traces", "observations"],
    mapper: MapperFunction,
    filter: Optional[str] = None,
    fetch_batch_size: int = 50,
    fetch_trace_fields: Optional[str] = None,
    max_items: Optional[int] = None,
    max_retries: int = 3,
    evaluators: List[EvaluatorFunction],
    composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
    max_concurrency: int = 5,
    metadata: Optional[Dict[str, Any]] = None,
    _add_observation_scores_to_trace: bool = False,
    _additional_trace_tags: Optional[List[str]] = None,
    resume_from: Optional[BatchEvaluationResumeToken] = None,
    verbose: bool = False,
) -> BatchEvaluationResult:
    """Fetch traces or observations and run evaluations on each item.

    Fetches items from the Langfuse API in batches (a streaming pipeline,
    so memory use stays bounded for large datasets), maps each item to
    evaluator inputs via `mapper`, runs every evaluator on it, and creates
    scores linked back to the original entity. Ideal for:

    - Running evaluations on production traces after deployment
    - Backtesting new evaluation metrics on historical data
    - Batch scoring of observations for quality monitoring
    - Periodic evaluation runs on recent data

    Includes comprehensive error handling, retry logic, and resume
    capability for long-running evaluations.

    Args:
        scope: The type of items to evaluate — "traces" (complete traces
            with all their observations) or "observations" (individual
            spans, generations, events).
        mapper: Sync or async function transforming an API trace/observation
            object into an EvaluatorInputs instance (input, output,
            expected_output, metadata).
        evaluators: Evaluation functions run on each item. Each receives the
            mapped inputs and returns Evaluation object(s). Evaluator
            failures are logged but don't stop the batch evaluation.
        filter: Optional JSON filter string for querying items (same format
            as the Langfuse API), e.g. '{"tags": ["production"]}' or
            '{"user_id": "user123", "timestamp": {"operator": ">", "value":
            "2024-01-01"}}'. Default: None (fetches all items).
        fetch_batch_size: Number of items fetched per API call and held in
            memory. Larger values may be faster but use more memory.
            Default: 50.
        fetch_trace_fields: Comma-separated list of fields to include when
            fetching traces. Field groups: 'core' (always included), 'io'
            (input, output, metadata), 'scores', 'observations', 'metrics'.
            If not specified, all fields are returned. Excluded
            'observations' or 'scores' return empty arrays; excluded
            'metrics' returns -1 for 'totalCost' and 'latency'. Only
            relevant when scope is 'traces'.
        max_items: Maximum total number of items to process. None processes
            all items matching the filter. Default: None.
        max_retries: Maximum retry attempts for failed batch fetches, with
            exponential backoff (1s, 2s, 4s). Default: 3.
        composite_evaluator: Optional function creating a composite score
            from item-level evaluations; receives the original item and its
            evaluations and returns a single Evaluation. Default: None.
        max_concurrency: Maximum number of items evaluated concurrently.
            Default: 5.
        metadata: Optional metadata dict added to all created scores,
            e.g. for tracking evaluation runs or versions. Default: None.
        resume_from: Optional resume token from a previous incomplete run,
            allowing continuation after interruption. Default: None.
        verbose: If True, logs progress information to console. Default: False.

    Returns:
        BatchEvaluationResult containing fetch/process/failure counts
        (total_items_fetched, total_items_processed, total_items_failed),
        score counts (total_scores_created, total_composite_scores_created,
        total_evaluations_failed), per-evaluator statistics
        (evaluator_stats), a resume_token when incomplete, the `completed`
        flag, duration_seconds, failed_item_ids, error_summary, and
        has_more_items (True when max_items was reached but more exist).

    Raises:
        ValueError: If invalid scope is provided.

    Examples:
        Basic trace evaluation:
        ```python
        from langfuse import Langfuse, EvaluatorInputs, Evaluation

        client = Langfuse()

        def trace_mapper(trace):
            return EvaluatorInputs(
                input=trace.input,
                output=trace.output,
                expected_output=None,
                metadata={"trace_id": trace.id}
            )

        def length_evaluator(*, input, output, expected_output, metadata):
            return Evaluation(
                name="output_length",
                value=len(output) if output else 0
            )

        result = client.run_batched_evaluation(
            scope="traces",
            mapper=trace_mapper,
            evaluators=[length_evaluator],
            filter='{"tags": ["production"]}',
            max_items=1000,
            verbose=True
        )
        print(f"Created {result.total_scores_created} scores")
        ```

        Handling incomplete runs with resume:
        ```python
        result = client.run_batched_evaluation(
            scope="observations",
            mapper=obs_mapper,
            evaluators=[my_evaluator],
            max_items=10000,
        )
        if not result.completed and result.resume_token:
            result = client.run_batched_evaluation(
                scope="observations",
                mapper=obs_mapper,
                evaluators=[my_evaluator],
                resume_from=result.resume_token,
            )
        ```

        Monitoring evaluator performance:
        ```python
        for stats in result.evaluator_stats:
            print(stats.name, stats.successful_runs / stats.total_runs)
            if stats.failed_runs > 0:
                print(f"  ⚠ī¸  Failed {stats.failed_runs} times")
        ```

    Note:
        - Evaluator failures are logged but don't stop the batch evaluation
        - Individual item failures are tracked but don't stop processing
        - Fetch failures are retried with exponential backoff
        - All scores are automatically flushed to Langfuse at the end
        - The resume mechanism uses timestamp-based filtering to avoid duplicates
    """
    runner = BatchEvaluationRunner(self)

    # Build the evaluation coroutine, then drive it with the smart
    # sync/async bridge so this works inside or outside event loops.
    evaluation = runner.run_async(
        scope=scope,
        mapper=mapper,
        evaluators=evaluators,
        filter=filter,
        fetch_batch_size=fetch_batch_size,
        fetch_trace_fields=fetch_trace_fields,
        max_items=max_items,
        max_concurrency=max_concurrency,
        composite_evaluator=composite_evaluator,
        metadata=metadata,
        _add_observation_scores_to_trace=_add_observation_scores_to_trace,
        _additional_trace_tags=_additional_trace_tags,
        max_retries=max_retries,
        verbose=verbose,
        resume_from=resume_from,
    )
    return cast(BatchEvaluationResult, run_async_safely(evaluation))

Fetch traces or observations and run evaluations on each item.

This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:

  • Running evaluations on production traces after deployment
  • Backtesting new evaluation metrics on historical data
  • Batch scoring of observations for quality monitoring
  • Periodic evaluation runs on recent data

The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.

Arguments:
  • scope: The type of items to evaluate. Must be one of:
    • "traces": Evaluate complete traces with all their observations
    • "observations": Evaluate individual observations (spans, generations, events)
  • mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
  • evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
  • filter: Optional JSON filter string for querying items (same format as Langfuse API). Default: None (fetches all items). Examples:
    • '{"tags": ["production"]}'
    • '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
  • fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
  • fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
  • max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
  • max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 5.
  • composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
  • metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
  • max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
  • verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
  • resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:

BatchEvaluationResult containing:

  • total_items_fetched: Number of items fetched from API
  • total_items_processed: Number of items successfully evaluated
  • total_items_failed: Number of items that failed evaluation
  • total_scores_created: Scores created by item-level evaluators
  • total_composite_scores_created: Scores created by composite evaluator
  • total_evaluations_failed: Individual evaluator failures
  • evaluator_stats: Per-evaluator statistics (success rate, scores created)
  • resume_token: Token for resuming if incomplete (None if completed)
  • completed: True if all items processed
  • duration_seconds: Total execution time
  • failed_item_ids: IDs of items that failed
  • error_summary: Error types and counts
  • has_more_items: True if max_items reached but more exist

Raises:
  • ValueError: If invalid scope is provided.
Examples:

Basic trace evaluation:

from langfuse import Langfuse, EvaluatorInputs, Evaluation

client = Langfuse()

# Define mapper to extract fields from traces
def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,
        metadata={"trace_id": trace.id}
    )

# Define evaluator
def length_evaluator(*, input, output, expected_output, metadata):
    return Evaluation(
        name="output_length",
        value=len(output) if output else 0
    )

# Run batch evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    filter='{"tags": ["production"]}',
    max_items=1000,
    verbose=True
)

print(f"Processed {result.total_items_processed} traces")
print(f"Created {result.total_scores_created} scores")

Evaluation with composite scorer:

def accuracy_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="accuracy", value=0.85)

def relevance_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="relevance", value=0.92)

def composite_evaluator(*, item, evaluations):
    # Weighted average of evaluations
    weights = {"accuracy": 0.6, "relevance": 0.4}
    total = sum(
        e.value * weights.get(e.name, 0)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(
        name="composite_score",
        value=total,
        comment=f"Weighted average of {len(evaluations)} metrics"
    )

result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[accuracy_evaluator, relevance_evaluator],
    composite_evaluator=composite_evaluator,
    filter='{"user_id": "important_user"}',
    verbose=True
)

Handling incomplete runs with resume:

# Initial run that may fail or timeout
result = client.run_batched_evaluation(
    scope="observations",
    mapper=obs_mapper,
    evaluators=[my_evaluator],
    max_items=10000,
    verbose=True
)

# Check if incomplete
if not result.completed and result.resume_token:
    print(f"Processed {result.resume_token.items_processed} items before interruption")

    # Resume from where it left off
    result = client.run_batched_evaluation(
        scope="observations",
        mapper=obs_mapper,
        evaluators=[my_evaluator],
        resume_from=result.resume_token,
        verbose=True
    )

print(f"Total items processed: {result.total_items_processed}")

Monitoring evaluator performance:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs
    print(f"{stats.name}:")
    print(f"  Success rate: {success_rate:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠ī¸  Failed {stats.failed_runs} times")
Note:
  • Evaluator failures are logged but don't stop the batch evaluation
  • Individual item failures are tracked but don't stop processing
  • Fetch failures are retried with exponential backoff
  • All scores are automatically flushed to Langfuse at the end
  • The resume mechanism uses timestamp-based filtering to avoid duplicates
def auth_check(self) -> bool:
3141    def auth_check(self) -> bool:
3142        """Check if the provided credentials (public and secret key) are valid.
3143
3144        Raises:
3145            Exception: If no projects were found for the provided credentials.
3146
3147        Note:
3148            This method is blocking. It is discouraged to use it in production code.
3149        """
3150        try:
3151            projects = self.api.projects.get()
3152            langfuse_logger.debug(
3153                f"Auth check successful, found {len(projects.data)} projects"
3154            )
3155            if len(projects.data) == 0:
3156                raise Exception(
3157                    "Auth check failed, no project found for the keys provided."
3158                )
3159            return True
3160
3161        except AttributeError as e:
3162            langfuse_logger.warning(
3163                f"Auth check failed: Client not properly initialized. Error: {e}"
3164            )
3165            return False
3166
3167        except Error as e:
3168            handle_fern_exception(e)
3169            raise e

Check if the provided credentials (public and secret key) are valid.

Raises:
  • Exception: If no projects were found for the provided credentials.
Note:

This method is blocking. It is discouraged to use it in production code.

def create_dataset( self, *, name: str, description: Optional[str] = None, metadata: Optional[Any] = None, input_schema: Optional[Any] = None, expected_output_schema: Optional[Any] = None) -> langfuse.api.Dataset:
3171    def create_dataset(
3172        self,
3173        *,
3174        name: str,
3175        description: Optional[str] = None,
3176        metadata: Optional[Any] = None,
3177        input_schema: Optional[Any] = None,
3178        expected_output_schema: Optional[Any] = None,
3179    ) -> Dataset:
3180        """Create a dataset with the given name on Langfuse.
3181
3182        Args:
3183            name: Name of the dataset to create.
3184            description: Description of the dataset. Defaults to None.
3185            metadata: Additional metadata. Defaults to None.
3186            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3187            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3188
3189        Returns:
3190            Dataset: The created dataset as returned by the Langfuse API.
3191        """
3192        try:
3193            langfuse_logger.debug(f"Creating datasets {name}")
3194
3195            result = self.api.datasets.create(
3196                name=name,
3197                description=description,
3198                metadata=metadata,
3199                input_schema=input_schema,
3200                expected_output_schema=expected_output_schema,
3201            )
3202
3203            return cast(Dataset, result)
3204
3205        except Error as e:
3206            handle_fern_exception(e)
3207            raise e

Create a dataset with the given name on Langfuse.

Arguments:
  • name: Name of the dataset to create.
  • description: Description of the dataset. Defaults to None.
  • metadata: Additional metadata. Defaults to None.
  • input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
  • expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:

Dataset: The created dataset as returned by the Langfuse API.

def create_dataset_item( self, *, dataset_name: str, input: Optional[Any] = None, expected_output: Optional[Any] = None, metadata: Optional[Any] = None, source_trace_id: Optional[str] = None, source_observation_id: Optional[str] = None, status: Optional[langfuse.api.DatasetStatus] = None, id: Optional[str] = None) -> langfuse.api.DatasetItem:
3209    def create_dataset_item(
3210        self,
3211        *,
3212        dataset_name: str,
3213        input: Optional[Any] = None,
3214        expected_output: Optional[Any] = None,
3215        metadata: Optional[Any] = None,
3216        source_trace_id: Optional[str] = None,
3217        source_observation_id: Optional[str] = None,
3218        status: Optional[DatasetStatus] = None,
3219        id: Optional[str] = None,
3220    ) -> DatasetItem:
3221        """Create a dataset item.
3222
3223        Upserts if an item with id already exists.
3224
3225        Args:
3226            dataset_name: Name of the dataset in which the dataset item should be created.
3227            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3228            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3229            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3230            source_trace_id: Id of the source trace. Defaults to None.
3231            source_observation_id: Id of the source observation. Defaults to None.
3232            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3233            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3234
3235        Returns:
3236            DatasetItem: The created dataset item as returned by the Langfuse API.
3237
3238        Example:
3239            ```python
3240            from langfuse import Langfuse
3241
3242            langfuse = Langfuse()
3243
3244            # Uploading items to the Langfuse dataset named "capital_cities"
3245            langfuse.create_dataset_item(
3246                dataset_name="capital_cities",
3247                input={"input": {"country": "Italy"}},
3248                expected_output={"expected_output": "Rome"},
3249                metadata={"foo": "bar"}
3250            )
3251            ```
3252        """
3253        try:
3254            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
3255
3256            result = self.api.dataset_items.create(
3257                dataset_name=dataset_name,
3258                input=input,
3259                expected_output=expected_output,
3260                metadata=metadata,
3261                source_trace_id=source_trace_id,
3262                source_observation_id=source_observation_id,
3263                status=status,
3264                id=id,
3265            )
3266
3267            return cast(DatasetItem, result)
3268        except Error as e:
3269            handle_fern_exception(e)
3270            raise e

Create a dataset item.

Upserts if an item with id already exists.

Arguments:
  • dataset_name: Name of the dataset in which the dataset item should be created.
  • input: Input data. Defaults to None. Can contain any dict, list or scalar.
  • expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
  • metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
  • source_trace_id: Id of the source trace. Defaults to None.
  • source_observation_id: Id of the source observation. Defaults to None.
  • status: Status of the dataset item. Defaults to ACTIVE for newly created items.
  • id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:

DatasetItem: The created dataset item as returned by the Langfuse API.

Example:
from langfuse import Langfuse

langfuse = Langfuse()

# Uploading items to the Langfuse dataset named "capital_cities"
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"input": {"country": "Italy"}},
    expected_output={"expected_output": "Rome"},
    metadata={"foo": "bar"}
)
def resolve_media_references( self, *, obj: Any, resolve_with: Literal['base64_data_uri'], max_depth: int = 10, content_fetch_timeout_seconds: int = 5) -> Any:
3272    def resolve_media_references(
3273        self,
3274        *,
3275        obj: Any,
3276        resolve_with: Literal["base64_data_uri"],
3277        max_depth: int = 10,
3278        content_fetch_timeout_seconds: int = 5,
3279    ) -> Any:
3280        """Replace media reference strings in an object with base64 data URIs.
3281
3282        This method recursively traverses an object (up to max_depth) looking for media reference strings
3283        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3284        the provided Langfuse client and replaces the reference string with a base64 data URI.
3285
3286        If fetching media content fails for a reference string, a warning is logged and the reference
3287        string is left unchanged.
3288
3289        Args:
3290            obj: The object to process. Can be a primitive value, array, or nested object.
3291                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3292            resolve_with: The representation of the media content to replace the media reference string with.
3293                Currently only "base64_data_uri" is supported.
3294            max_depth: int: The maximum depth to traverse the object. Default is 10.
3295            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3296
3297        Returns:
3298            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3299            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3300
3301        Example:
3302            obj = {
3303                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3304                "nested": {
3305                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3306                }
3307            }
3308
3310            result = LangfuseMedia.resolve_media_references(obj, langfuse_client)
3310
3311            # Result:
3312            # {
3313            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3314            #     "nested": {
3315            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3316            #     }
3317            # }
3318        """
3319        return LangfuseMedia.resolve_media_references(
3320            langfuse_client=self,
3321            obj=obj,
3322            resolve_with=resolve_with,
3323            max_depth=max_depth,
3324            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3325        )

Replace media reference strings in an object with base64 data URIs.

This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.

If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.

Arguments:
  • obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
  • resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
  • max_depth: int: The maximum depth to traverse the object. Default is 10.
  • content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:

A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

Example:

obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }

result = LangfuseMedia.resolve_media_references(obj, langfuse_client)

Result:

{

"image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",

"nested": {

"pdf": "data:application/pdf;base64,JVBERi0xLjcK..."

}

}

def get_prompt( self, name: str, *, version: Optional[int] = None, label: Optional[str] = None, type: Literal['chat', 'text'] = 'text', cache_ttl_seconds: Optional[int] = None, fallback: Union[List[langfuse.model.ChatMessageDict], NoneType, str] = None, max_retries: Optional[int] = None, fetch_timeout_seconds: Optional[int] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3355    def get_prompt(
3356        self,
3357        name: str,
3358        *,
3359        version: Optional[int] = None,
3360        label: Optional[str] = None,
3361        type: Literal["chat", "text"] = "text",
3362        cache_ttl_seconds: Optional[int] = None,
3363        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3364        max_retries: Optional[int] = None,
3365        fetch_timeout_seconds: Optional[int] = None,
3366    ) -> PromptClient:
3367        """Get a prompt.
3368
3369        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3370        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3371        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3372        return the expired prompt as a fallback.
3373
3374        Args:
3375            name (str): The name of the prompt to retrieve.
3376
3377        Keyword Args:
3378            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3379            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3380            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3381            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3382            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3383            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3384            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3385            fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
3386
3387        Returns:
3388            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
3389            - TextPromptClient, if type argument is 'text'.
3390            - ChatPromptClient, if type argument is 'chat'.
3391
3392        Raises:
3393            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3394            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3395        """
3396        if self._resources is None:
3397            raise Error(
3398                "SDK is not correctly initialized. Check the init logs for more details."
3399            )
3400        if version is not None and label is not None:
3401            raise ValueError("Cannot specify both version and label at the same time.")
3402
3403        if not name:
3404            raise ValueError("Prompt name cannot be empty.")
3405
3406        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3407        bounded_max_retries = self._get_bounded_max_retries(
3408            max_retries, default_max_retries=2, max_retries_upper_bound=4
3409        )
3410
3411        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3412        cached_prompt = self._resources.prompt_cache.get(cache_key)
3413
3414        if cached_prompt is None or cache_ttl_seconds == 0:
3415            langfuse_logger.debug(
3416                f"Prompt '{cache_key}' not found in cache or caching disabled."
3417            )
3418            try:
3419                return self._fetch_prompt_and_update_cache(
3420                    name,
3421                    version=version,
3422                    label=label,
3423                    ttl_seconds=cache_ttl_seconds,
3424                    max_retries=bounded_max_retries,
3425                    fetch_timeout_seconds=fetch_timeout_seconds,
3426                )
3427            except Exception as e:
3428                if fallback:
3429                    langfuse_logger.warning(
3430                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3431                    )
3432
3433                    fallback_client_args: Dict[str, Any] = {
3434                        "name": name,
3435                        "prompt": fallback,
3436                        "type": type,
3437                        "version": version or 0,
3438                        "config": {},
3439                        "labels": [label] if label else [],
3440                        "tags": [],
3441                    }
3442
3443                    if type == "text":
3444                        return TextPromptClient(
3445                            prompt=Prompt_Text(**fallback_client_args),
3446                            is_fallback=True,
3447                        )
3448
3449                    if type == "chat":
3450                        return ChatPromptClient(
3451                            prompt=Prompt_Chat(**fallback_client_args),
3452                            is_fallback=True,
3453                        )
3454
3455                raise e
3456
3457        if cached_prompt.is_expired():
3458            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3459            try:
3460                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3461                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3462
3463                def refresh_task() -> None:
3464                    self._fetch_prompt_and_update_cache(
3465                        name,
3466                        version=version,
3467                        label=label,
3468                        ttl_seconds=cache_ttl_seconds,
3469                        max_retries=bounded_max_retries,
3470                        fetch_timeout_seconds=fetch_timeout_seconds,
3471                    )
3472
3473                self._resources.prompt_cache.add_refresh_prompt_task(
3474                    cache_key,
3475                    refresh_task,
3476                )
3477                langfuse_logger.debug(
3478                    f"Returning stale prompt '{cache_key}' from cache."
3479                )
3480                # return stale prompt
3481                return cached_prompt.value
3482
3483            except Exception as e:
3484                langfuse_logger.warning(
3485                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3486                )
3487                # creation of refresh prompt task failed, return stale prompt
3488                return cached_prompt.value
3489
3490        return cached_prompt.value

Get a prompt.

This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.

Arguments:
  • name (str): The name of the prompt to retrieve.
Keyword Args:

  • version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • label (Optional[str]): The label of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • cache_ttl_seconds (Optional[int]): Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
  • type (Literal["chat", "text"]): The type of the prompt to retrieve. Defaults to "text".
  • fallback (Union[Optional[List[ChatMessageDict]], Optional[str]]): The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
  • max_retries (Optional[int]): The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
  • fetch_timeout_seconds (Optional[int]): The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds by default.

Returns:

The prompt object retrieved from the cache or directly fetched if not cached or expired of type

  • TextPromptClient, if type argument is 'text'.
  • ChatPromptClient, if type argument is 'chat'.
Raises:
  • Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
def create_prompt( self, *, name: str, prompt: Union[str, List[Union[langfuse.model.ChatMessageDict, langfuse.model.ChatMessageWithPlaceholdersDict_Message, langfuse.model.ChatMessageWithPlaceholdersDict_Placeholder]]], labels: List[str] = [], tags: Optional[List[str]] = None, type: Optional[Literal['chat', 'text']] = 'text', config: Optional[Any] = None, commit_message: Optional[str] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3592    def create_prompt(
3593        self,
3594        *,
3595        name: str,
3596        prompt: Union[
3597            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3598        ],
3599        labels: List[str] = [],
3600        tags: Optional[List[str]] = None,
3601        type: Optional[Literal["chat", "text"]] = "text",
3602        config: Optional[Any] = None,
3603        commit_message: Optional[str] = None,
3604    ) -> PromptClient:
3605        """Create a new prompt in Langfuse.
3606
3607        Keyword Args:
3608            name : The name of the prompt to be created.
3609            prompt : The content of the prompt to be created.
3610            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3611            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3612            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3613            config: Additional structured data to be saved with the prompt. Defaults to None.
3614            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3615            commit_message: Optional string describing the change.
3616
3617        Returns:
3618            TextPromptClient: The prompt if type argument is 'text'.
3619            ChatPromptClient: The prompt if type argument is 'chat'.
3620        """
3621        try:
3622            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3623
3624            if type == "chat":
3625                if not isinstance(prompt, list):
3626                    raise ValueError(
3627                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3628                    )
3629                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
3630                    CreateChatPromptRequest(
3631                        name=name,
3632                        prompt=cast(Any, prompt),
3633                        labels=labels,
3634                        tags=tags,
3635                        config=config or {},
3636                        commit_message=commit_message,
3637                        type=CreateChatPromptType.CHAT,
3638                    )
3639                )
3640                server_prompt = self.api.prompts.create(request=request)
3641
3642                if self._resources is not None:
3643                    self._resources.prompt_cache.invalidate(name)
3644
3645                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3646
3647            if not isinstance(prompt, str):
3648                raise ValueError("For 'text' type, 'prompt' must be a string.")
3649
3650            request = CreateTextPromptRequest(
3651                name=name,
3652                prompt=prompt,
3653                labels=labels,
3654                tags=tags,
3655                config=config or {},
3656                commit_message=commit_message,
3657            )
3658
3659            server_prompt = self.api.prompts.create(request=request)
3660
3661            if self._resources is not None:
3662                self._resources.prompt_cache.invalidate(name)
3663
3664            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3665
3666        except Error as e:
3667            handle_fern_exception(e)
3668            raise e

Create a new prompt in Langfuse.

Keyword Args:

  • name: The name of the prompt to be created.
  • prompt: The content of the prompt to be created.
  • is_active [DEPRECATED]: A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
  • labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
  • tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
  • config: Additional structured data to be saved with the prompt. Defaults to None.
  • type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
  • commit_message: Optional string describing the change.

Returns:

  • TextPromptClient: The prompt if type argument is 'text'.
  • ChatPromptClient: The prompt if type argument is 'chat'.

def update_prompt(self, *, name: str, version: int, new_labels: List[str] = []) -> Any:
3670    def update_prompt(
3671        self,
3672        *,
3673        name: str,
3674        version: int,
3675        new_labels: List[str] = [],
3676    ) -> Any:
3677        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
3678
3679        Args:
3680            name (str): The name of the prompt to update.
3681            version (int): The version number of the prompt to update.
3682            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3683
3684        Returns:
3685            Prompt: The updated prompt from the Langfuse API.
3686
3687        """
3688        updated_prompt = self.api.prompt_version.update(
3689            name=self._url_encode(name),
3690            version=version,
3691            new_labels=new_labels,
3692        )
3693
3694        if self._resources is not None:
3695            self._resources.prompt_cache.invalidate(name)
3696
3697        return updated_prompt

Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.

Arguments:
  • name (str): The name of the prompt to update.
  • version (int): The version number of the prompt to update.
  • new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:

Prompt: The updated prompt from the Langfuse API.

def clear_prompt_cache(self) -> None:
3712    def clear_prompt_cache(self) -> None:
3713        """Clear the entire prompt cache, removing all cached prompts.
3714
3715        This method is useful when you want to force a complete refresh of all
3716        cached prompts, for example after major updates or when you need to
3717        ensure the latest versions are fetched from the server.
3718        """
3719        if self._resources is not None:
3720            self._resources.prompt_cache.clear()

Clear the entire prompt cache, removing all cached prompts.

This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.

def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 62def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 63    """Get or create a Langfuse client instance.
 64
 65    Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups,
 66    providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
 67
 68    Behavior:
 69    - Single project: Returns existing client or creates new one
 70    - Multi-project: Requires public_key to return specific client
 71    - No public_key in multi-project: Returns disabled client to prevent data leakage
 72
 73    The function uses a singleton pattern per public_key to conserve resources and maintain state.
 74
 75    Args:
 76        public_key (Optional[str]): Project identifier
 77            - With key: Returns client for that project
 78            - Without key: Returns single client or disabled client if multiple exist
 79
 80    Returns:
 81        Langfuse: Client instance in one of three states:
 82            1. Client for specified public_key
 83            2. Default client for single-project setup
 84            3. Disabled client when multiple projects exist without key
 85
 86    Security:
 87        Disables tracing when multiple projects exist without explicit key to prevent
 88        cross-project data leakage. Multi-project setups are experimental.
 89
 90    Example:
 91        ```python
 92        # Single project
 93        client = get_client()  # Default client
 94
 95        # In multi-project usage:
 96        client_a = get_client(public_key="project_a_key")  # Returns project A's client
 97        client_b = get_client(public_key="project_b_key")  # Returns project B's client
 98
 99        # Without specific key in multi-project setup:
100        client = get_client()  # Returns disabled client for safety
101        ```
102    """
103    with LangfuseResourceManager._lock:
104        active_instances = LangfuseResourceManager._instances
105
106        # If no explicit public_key provided, check execution context
107        if not public_key:
108            public_key = _current_public_key.get(None)
109
110        if not public_key:
111            if len(active_instances) == 0:
112                # No clients initialized yet, create default instance
113                return Langfuse()
114
115            if len(active_instances) == 1:
116                # Only one client exists, safe to use without specifying key
117                instance = list(active_instances.values())[0]
118
119                # Initialize with the credentials bound to the instance
120                # This is important if the original instance was instantiated
121                # via constructor arguments
122                return _create_client_from_instance(instance)
123
124            else:
125                # Multiple clients exist but no key specified - disable tracing
126                # to prevent cross-project data leakage
127                langfuse_logger.warning(
128                    "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
129                )
130                return Langfuse(
131                    tracing_enabled=False, public_key="fake", secret_key="fake"
132                )
133
134        else:
135            # Specific key provided, look up existing instance
136            target_instance: Optional[LangfuseResourceManager] = active_instances.get(
137                public_key, None
138            )
139
140            if target_instance is None:
141                # No instance found with this key - client not initialized properly
142                langfuse_logger.warning(
143                    f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function."
144                )
145                return Langfuse(
146                    tracing_enabled=False, public_key="fake", secret_key="fake"
147                )
148
149            # target_instance is guaranteed to be not None at this point
150            return _create_client_from_instance(target_instance, public_key)

Get or create a Langfuse client instance.

Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.

Behavior:

  • Single project: Returns existing client or creates new one
  • Multi-project: Requires public_key to return specific client
  • No public_key in multi-project: Returns disabled client to prevent data leakage

The function uses a singleton pattern per public_key to conserve resources and maintain state.

Arguments:
  • public_key (Optional[str]): Project identifier
    • With key: Returns client for that project
    • Without key: Returns single client or disabled client if multiple exist
Returns:

Langfuse: Client instance in one of three states: 1. Client for specified public_key 2. Default client for single-project setup 3. Disabled client when multiple projects exist without key

Security:

Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.

Example:
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
def observe( func: Optional[~F] = None, *, name: Optional[str] = None, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], NoneType] = None, capture_input: Optional[bool] = None, capture_output: Optional[bool] = None, transform_to_string: Optional[Callable[[Iterable], str]] = None) -> Union[~F, Callable[[~F], ~F]]:
 88    def observe(
 89        self,
 90        func: Optional[F] = None,
 91        *,
 92        name: Optional[str] = None,
 93        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
 94        capture_input: Optional[bool] = None,
 95        capture_output: Optional[bool] = None,
 96        transform_to_string: Optional[Callable[[Iterable], str]] = None,
 97    ) -> Union[F, Callable[[F], F]]:
 98        """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
 99
100        This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates
101        spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator
102        intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
103
104        Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application,
105        enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
106
107        Args:
108            func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
109            name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
110            as_type (Optional[Literal]): Set the observation type. Supported values:
111                    "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail".
112                    Observation types are highlighted in the Langfuse UI for filtering and visualization.
113                    The types "generation" and "embedding" create a span on which additional attributes such as model metrics
114                    can be set.
115
116        Returns:
117            Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
118
119        Example:
120            For general function tracing with automatic naming:
121            ```python
122            @observe()
123            def process_user_request(user_id, query):
124                # Function is automatically traced with name "process_user_request"
125                return get_response(query)
126            ```
127
128            For language model generation tracking:
129            ```python
130            @observe(name="answer-generation", as_type="generation")
131            async def generate_answer(query):
132                # Creates a generation-type span with extended LLM metrics
133                response = await openai.chat.completions.create(
134                    model="gpt-4",
135                    messages=[{"role": "user", "content": query}]
136                )
137                return response.choices[0].message.content
138            ```
139
140            For trace context propagation between functions:
141            ```python
142            @observe()
143            def main_process():
144                # Parent span is created
145                return sub_process()  # Child span automatically connected to parent
146
147            @observe()
148            def sub_process():
149                # Automatically becomes a child span of main_process
150                return "result"
151            ```
152
153        Raises:
154            Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
155
156        Notes:
157            - The decorator preserves the original function's signature, docstring, and return type.
158            - Proper parent-child relationships between spans are automatically maintained.
159            - Special keyword arguments can be passed to control tracing:
160              - langfuse_trace_id: Explicitly set the trace ID for this function call
161              - langfuse_parent_observation_id: Explicitly set the parent span ID
162              - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
163            - For async functions, the decorator returns an async function wrapper.
164            - For sync functions, the decorator returns a synchronous wrapper.
165        """
166        valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
167        if as_type is not None and as_type not in valid_types:
168            logger.warning(
169                f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'."
170            )
171            as_type = "span"
172
173        function_io_capture_enabled = os.environ.get(
174            LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True"
175        ).lower() not in ("false", "0")
176
177        should_capture_input = (
178            capture_input if capture_input is not None else function_io_capture_enabled
179        )
180
181        should_capture_output = (
182            capture_output
183            if capture_output is not None
184            else function_io_capture_enabled
185        )
186
187        def decorator(func: F) -> F:
188            return (
189                self._async_observe(
190                    func,
191                    name=name,
192                    as_type=as_type,
193                    capture_input=should_capture_input,
194                    capture_output=should_capture_output,
195                    transform_to_string=transform_to_string,
196                )
197                if asyncio.iscoroutinefunction(func)
198                else self._sync_observe(
199                    func,
200                    name=name,
201                    as_type=as_type,
202                    capture_input=should_capture_input,
203                    capture_output=should_capture_output,
204                    transform_to_string=transform_to_string,
205                )
206            )
207
208        """Handle decorator with or without parentheses.
209
210        This logic enables the decorator to work both with and without parentheses:
211        - @observe - Python passes the function directly to the decorator
212        - @observe() - Python calls the decorator first, which must return a function decorator
213
214        When called without arguments (@observe), the func parameter contains the function to decorate,
215        so we directly apply the decorator to it. When called with parentheses (@observe()),
216        func is None, so we return the decorator function itself for Python to apply in the next step.
217        """
218        if func is None:
219            return decorator
220        else:
221            return decorator(func)

Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.

This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.

Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.

Arguments:
  • func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
  • name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
  • as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:

Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.

Example:

For general function tracing with automatic naming:

@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)

For language model generation tracking:

@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content

For trace context propagation between functions:

@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
Raises:
  • Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
  • The decorator preserves the original function's signature, docstring, and return type.
  • Proper parent-child relationships between spans are automatically maintained.
  • Special keyword arguments can be passed to control tracing:
    • langfuse_trace_id: Explicitly set the trace ID for this function call
    • langfuse_parent_observation_id: Explicitly set the parent span ID
    • langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
  • For async functions, the decorator returns an async function wrapper.
  • For sync functions, the decorator returns a synchronous wrapper.
def propagate_attributes( *, user_id: Optional[str] = None, session_id: Optional[str] = None, metadata: Optional[Dict[str, str]] = None, version: Optional[str] = None, tags: Optional[List[str]] = None, trace_name: Optional[str] = None, as_baggage: bool = False) -> opentelemetry.util._decorator._AgnosticContextManager[typing.Any]:
 76def propagate_attributes(
 77    *,
 78    user_id: Optional[str] = None,
 79    session_id: Optional[str] = None,
 80    metadata: Optional[Dict[str, str]] = None,
 81    version: Optional[str] = None,
 82    tags: Optional[List[str]] = None,
 83    trace_name: Optional[str] = None,
 84    as_baggage: bool = False,
 85) -> _AgnosticContextManager[Any]:
 86    """Propagate trace-level attributes to all spans created within this context.
 87
 88    This context manager sets attributes on the currently active span AND automatically
 89    propagates them to all new child spans created within the context. This is the
 90    recommended way to set trace-level attributes like user_id, session_id, and metadata
 91    dimensions that should be consistently applied across all observations in a trace.
 92
 93    **IMPORTANT**: Call this as early as possible within your trace/workflow. Only the
 94    currently active span and spans created after entering this context will have these
 95    attributes. Pre-existing spans will NOT be retroactively updated.
 96
 97    **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id,
 98    filtering by session_id) only include observations that have the attribute set.
 99    If you call `propagate_attributes` late in your workflow, earlier spans won't be
100    included in aggregations for that attribute.
101
102    Args:
103        user_id: User identifier to associate with all spans in this context.
104            Must be US-ASCII string, ≤200 characters. Use this to track which user
105            generated each trace and enable e.g. per-user cost/performance analysis.
106        session_id: Session identifier to associate with all spans in this context.
107            Must be US-ASCII string, ≤200 characters. Use this to group related traces
108            within a user session (e.g., a conversation thread, multi-turn interaction).
109        metadata: Additional key-value metadata to propagate to all spans.
110            - Keys and values must be US-ASCII strings
111            - All values must be ≤200 characters
112            - Use for dimensions like internal correlating identifiers
113            - AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
 114        version: Version identifier for parts of your application that are independently versioned, e.g. agents
115        tags: List of tags to categorize the group of observations
116        trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters.
117            Use this to set a consistent trace name for all spans created within this context.
118        as_baggage: If True, propagates attributes using OpenTelemetry baggage for
119            cross-process/service propagation. **Security warning**: When enabled,
120            attribute values are added to HTTP headers on ALL outbound requests.
121            Only enable if values are safe to transmit via HTTP headers and you need
122            cross-service tracing. Default: False.
123
124    Returns:
125        Context manager that propagates attributes to all child spans.
126
127    Example:
128        Basic usage with user and session tracking:
129
130        ```python
131        from langfuse import Langfuse
132
133        langfuse = Langfuse()
134
135        # Set attributes early in the trace
136        with langfuse.start_as_current_observation(name="user_workflow") as span:
137            with langfuse.propagate_attributes(
138                user_id="user_123",
139                session_id="session_abc",
140                metadata={"experiment": "variant_a", "environment": "production"}
141            ):
142                # All spans created here will have user_id, session_id, and metadata
143                with langfuse.start_observation(name="llm_call") as llm_span:
144                    # This span inherits: user_id, session_id, experiment, environment
145                    ...
146
147                with langfuse.start_generation(name="completion") as gen:
148                    # This span also inherits all attributes
149                    ...
150        ```
151
152        Late propagation (anti-pattern):
153
154        ```python
155        with langfuse.start_as_current_observation(name="workflow") as span:
156            # These spans WON'T have user_id
157            early_span = langfuse.start_observation(name="early_work")
158            early_span.end()
159
160            # Set attributes in the middle
161            with langfuse.propagate_attributes(user_id="user_123"):
162                # Only spans created AFTER this point will have user_id
163                late_span = langfuse.start_observation(name="late_work")
164                late_span.end()
165
166            # Result: Aggregations by user_id will miss "early_work" span
167        ```
168
169        Cross-service propagation with baggage (advanced):
170
171        ```python
172        # Service A - originating service
173        with langfuse.start_as_current_observation(name="api_request"):
174            with langfuse.propagate_attributes(
175                user_id="user_123",
176                session_id="session_abc",
177                as_baggage=True  # Propagate via HTTP headers
178            ):
179                # Make HTTP request to Service B
180                response = requests.get("https://service-b.example.com/api")
181                # user_id and session_id are now in HTTP headers
182
183        # Service B - downstream service
184        # OpenTelemetry will automatically extract baggage from HTTP headers
185        # and propagate to spans in Service B
186        ```
187
188    Note:
189        - **Validation**: All attribute values (user_id, session_id, metadata values)
190          must be strings ≤200 characters. Invalid values will be dropped with a
191          warning logged. Ensure values meet constraints before calling.
192        - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood,
193          making it compatible with other OTel-instrumented libraries.
194
195    Raises:
196        No exceptions are raised. Invalid values are logged as warnings and dropped.
197    """
198    return _propagate_attributes(
199        user_id=user_id,
200        session_id=session_id,
201        metadata=metadata,
202        version=version,
203        tags=tags,
204        trace_name=trace_name,
205        as_baggage=as_baggage,
206    )

Propagate trace-level attributes to all spans created within this context.

This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, and metadata dimensions that should be consistently applied across all observations in a trace.

IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.

Why this matters: Langfuse aggregation queries (e.g., total cost by user_id, filtering by session_id) only include observations that have the attribute set. If you call propagate_attributes late in your workflow, earlier spans won't be included in aggregations for that attribute.

Arguments:
  • user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
  • session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
  • metadata: Additional key-value metadata to propagate to all spans.
    • Keys and values must be US-ASCII strings
    • All values must be ≤200 characters
    • Use for dimensions like internal correlating identifiers
    • AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
  • version: Version identifier for parts of your application that are independently versioned, e.g. agents
  • tags: List of tags to categorize the group of observations
  • trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
  • as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:

Context manager that propagates attributes to all child spans.

Example:

Basic usage with user and session tracking:

from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_observation(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        metadata={"experiment": "variant_a", "environment": "production"}
    ):
        # All spans created here will have user_id, session_id, and metadata
        with langfuse.start_observation(name="llm_call") as llm_span:
            # This span inherits: user_id, session_id, experiment, environment
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...

Late propagation (anti-pattern):

with langfuse.start_as_current_observation(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_observation(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_observation(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span

Cross-service propagation with baggage (advanced):

# Service A - originating service
with langfuse.start_as_current_observation(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id and session_id are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate to spans in Service B
Note:
  • Validation: All attribute values (user_id, session_id, metadata values) must be strings ≤200 characters. Invalid values will be dropped with a warning logged. Ensure values meet constraints before calling.
  • OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
  • No exceptions are raised. Invalid values are logged as warnings and dropped.
ObservationTypeLiteral = typing.Union[typing.Literal['generation', 'embedding'], typing.Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], typing.Literal['event']]
class LangfuseSpan(langfuse._client.span.LangfuseObservationWrapper):
1247class LangfuseSpan(LangfuseObservationWrapper):
1248    """Standard span implementation for general operations in Langfuse.
1249
1250    This class represents a general-purpose span that can be used to trace
1251    any operation in your application. It extends the base LangfuseObservationWrapper
1252    with specific methods for creating child spans, generations, and updating
1253    span-specific attributes. If possible, use a more specific type for
1254    better observability and insights.
1255    """
1256
1257    def __init__(
1258        self,
1259        *,
1260        otel_span: otel_trace_api.Span,
1261        langfuse_client: "Langfuse",
1262        input: Optional[Any] = None,
1263        output: Optional[Any] = None,
1264        metadata: Optional[Any] = None,
1265        environment: Optional[str] = None,
1266        release: Optional[str] = None,
1267        version: Optional[str] = None,
1268        level: Optional[SpanLevel] = None,
1269        status_message: Optional[str] = None,
1270    ):
1271        """Initialize a new LangfuseSpan.
1272
1273        Args:
1274            otel_span: The OpenTelemetry span to wrap
1275            langfuse_client: Reference to the parent Langfuse client
1276            input: Input data for the span (any JSON-serializable object)
1277            output: Output data from the span (any JSON-serializable object)
1278            metadata: Additional metadata to associate with the span
1279            environment: The tracing environment
1280            release: Release identifier for the application
1281            version: Version identifier for the code or component
1282            level: Importance level of the span (info, warning, error)
1283            status_message: Optional status message for the span
1284        """
1285        super().__init__(
1286            otel_span=otel_span,
1287            as_type="span",
1288            langfuse_client=langfuse_client,
1289            input=input,
1290            output=output,
1291            metadata=metadata,
1292            environment=environment,
1293            release=release,
1294            version=version,
1295            level=level,
1296            status_message=status_message,
1297        )

Standard span implementation for general operations in Langfuse.

This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.

LangfuseSpan( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1257    def __init__(
1258        self,
1259        *,
1260        otel_span: otel_trace_api.Span,
1261        langfuse_client: "Langfuse",
1262        input: Optional[Any] = None,
1263        output: Optional[Any] = None,
1264        metadata: Optional[Any] = None,
1265        environment: Optional[str] = None,
1266        release: Optional[str] = None,
1267        version: Optional[str] = None,
1268        level: Optional[SpanLevel] = None,
1269        status_message: Optional[str] = None,
1270    ):
1271        """Initialize a new LangfuseSpan.
1272
1273        Args:
1274            otel_span: The OpenTelemetry span to wrap
1275            langfuse_client: Reference to the parent Langfuse client
1276            input: Input data for the span (any JSON-serializable object)
1277            output: Output data from the span (any JSON-serializable object)
1278            metadata: Additional metadata to associate with the span
1279            environment: The tracing environment
1280            release: Release identifier for the application
1281            version: Version identifier for the code or component
1282            level: Importance level of the span (info, warning, error)
1283            status_message: Optional status message for the span
1284        """
1285        super().__init__(
1286            otel_span=otel_span,
1287            as_type="span",
1288            langfuse_client=langfuse_client,
1289            input=input,
1290            output=output,
1291            metadata=metadata,
1292            environment=environment,
1293            release=release,
1294            version=version,
1295            level=level,
1296            status_message=status_message,
1297        )

Initialize a new LangfuseSpan.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the span (any JSON-serializable object)
  • output: Output data from the span (any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
class LangfuseGeneration(langfuse._client.span.LangfuseObservationWrapper):
1300class LangfuseGeneration(LangfuseObservationWrapper):
1301    """Specialized span implementation for AI model generations in Langfuse.
1302
1303    This class represents a generation span specifically designed for tracking
1304    AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized
1305    attributes for model details, token usage, and costs.
1306    """
1307
1308    def __init__(
1309        self,
1310        *,
1311        otel_span: otel_trace_api.Span,
1312        langfuse_client: "Langfuse",
1313        input: Optional[Any] = None,
1314        output: Optional[Any] = None,
1315        metadata: Optional[Any] = None,
1316        environment: Optional[str] = None,
1317        release: Optional[str] = None,
1318        version: Optional[str] = None,
1319        level: Optional[SpanLevel] = None,
1320        status_message: Optional[str] = None,
1321        completion_start_time: Optional[datetime] = None,
1322        model: Optional[str] = None,
1323        model_parameters: Optional[Dict[str, MapValue]] = None,
1324        usage_details: Optional[Dict[str, int]] = None,
1325        cost_details: Optional[Dict[str, float]] = None,
1326        prompt: Optional[PromptClient] = None,
1327    ):
1328        """Initialize a new LangfuseGeneration span.
1329
1330        Args:
1331            otel_span: The OpenTelemetry span to wrap
1332            langfuse_client: Reference to the parent Langfuse client
1333            input: Input data for the generation (e.g., prompts)
1334            output: Output from the generation (e.g., completions)
1335            metadata: Additional metadata to associate with the generation
1336            environment: The tracing environment
1337            release: Release identifier for the application
1338            version: Version identifier for the model or component
1339            level: Importance level of the generation (info, warning, error)
1340            status_message: Optional status message for the generation
1341            completion_start_time: When the model started generating the response
1342            model: Name/identifier of the AI model used (e.g., "gpt-4")
1343            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1344            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1345            cost_details: Cost information for the model call
1346            prompt: Associated prompt template from Langfuse prompt management
1347        """
1348        super().__init__(
1349            as_type="generation",
1350            otel_span=otel_span,
1351            langfuse_client=langfuse_client,
1352            input=input,
1353            output=output,
1354            metadata=metadata,
1355            environment=environment,
1356            release=release,
1357            version=version,
1358            level=level,
1359            status_message=status_message,
1360            completion_start_time=completion_start_time,
1361            model=model,
1362            model_parameters=model_parameters,
1363            usage_details=usage_details,
1364            cost_details=cost_details,
1365            prompt=prompt,
1366        )

Specialized span implementation for AI model generations in Langfuse.

This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.

LangfuseGeneration( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None)
1308    def __init__(
1309        self,
1310        *,
1311        otel_span: otel_trace_api.Span,
1312        langfuse_client: "Langfuse",
1313        input: Optional[Any] = None,
1314        output: Optional[Any] = None,
1315        metadata: Optional[Any] = None,
1316        environment: Optional[str] = None,
1317        release: Optional[str] = None,
1318        version: Optional[str] = None,
1319        level: Optional[SpanLevel] = None,
1320        status_message: Optional[str] = None,
1321        completion_start_time: Optional[datetime] = None,
1322        model: Optional[str] = None,
1323        model_parameters: Optional[Dict[str, MapValue]] = None,
1324        usage_details: Optional[Dict[str, int]] = None,
1325        cost_details: Optional[Dict[str, float]] = None,
1326        prompt: Optional[PromptClient] = None,
1327    ):
1328        """Initialize a new LangfuseGeneration span.
1329
1330        Args:
1331            otel_span: The OpenTelemetry span to wrap
1332            langfuse_client: Reference to the parent Langfuse client
1333            input: Input data for the generation (e.g., prompts)
1334            output: Output from the generation (e.g., completions)
1335            metadata: Additional metadata to associate with the generation
1336            environment: The tracing environment
1337            release: Release identifier for the application
1338            version: Version identifier for the model or component
1339            level: Importance level of the generation (info, warning, error)
1340            status_message: Optional status message for the generation
1341            completion_start_time: When the model started generating the response
1342            model: Name/identifier of the AI model used (e.g., "gpt-4")
1343            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1344            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1345            cost_details: Cost information for the model call
1346            prompt: Associated prompt template from Langfuse prompt management
1347        """
1348        super().__init__(
1349            as_type="generation",
1350            otel_span=otel_span,
1351            langfuse_client=langfuse_client,
1352            input=input,
1353            output=output,
1354            metadata=metadata,
1355            environment=environment,
1356            release=release,
1357            version=version,
1358            level=level,
1359            status_message=status_message,
1360            completion_start_time=completion_start_time,
1361            model=model,
1362            model_parameters=model_parameters,
1363            usage_details=usage_details,
1364            cost_details=cost_details,
1365            prompt=prompt,
1366        )

Initialize a new LangfuseGeneration span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the generation (e.g., prompts)
  • output: Output from the generation (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the model or component
  • level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
class LangfuseEvent(langfuse._client.span.LangfuseObservationWrapper):
1369class LangfuseEvent(LangfuseObservationWrapper):
1370    """Specialized span implementation for Langfuse Events."""
1371
1372    def __init__(
1373        self,
1374        *,
1375        otel_span: otel_trace_api.Span,
1376        langfuse_client: "Langfuse",
1377        input: Optional[Any] = None,
1378        output: Optional[Any] = None,
1379        metadata: Optional[Any] = None,
1380        environment: Optional[str] = None,
1381        release: Optional[str] = None,
1382        version: Optional[str] = None,
1383        level: Optional[SpanLevel] = None,
1384        status_message: Optional[str] = None,
1385    ):
1386        """Initialize a new LangfuseEvent span.
1387
1388        Args:
1389            otel_span: The OpenTelemetry span to wrap
1390            langfuse_client: Reference to the parent Langfuse client
1391            input: Input data for the event
1392            output: Output from the event
1393            metadata: Additional metadata to associate with the generation
1394            environment: The tracing environment
1395            release: Release identifier for the application
1396            version: Version identifier for the model or component
1397            level: Importance level of the generation (info, warning, error)
1398            status_message: Optional status message for the generation
1399        """
1400        super().__init__(
1401            otel_span=otel_span,
1402            as_type="event",
1403            langfuse_client=langfuse_client,
1404            input=input,
1405            output=output,
1406            metadata=metadata,
1407            environment=environment,
1408            release=release,
1409            version=version,
1410            level=level,
1411            status_message=status_message,
1412        )
1413
1414    def update(
1415        self,
1416        *,
1417        name: Optional[str] = None,
1418        input: Optional[Any] = None,
1419        output: Optional[Any] = None,
1420        metadata: Optional[Any] = None,
1421        version: Optional[str] = None,
1422        level: Optional[SpanLevel] = None,
1423        status_message: Optional[str] = None,
1424        completion_start_time: Optional[datetime] = None,
1425        model: Optional[str] = None,
1426        model_parameters: Optional[Dict[str, MapValue]] = None,
1427        usage_details: Optional[Dict[str, int]] = None,
1428        cost_details: Optional[Dict[str, float]] = None,
1429        prompt: Optional[PromptClient] = None,
1430        **kwargs: Any,
1431    ) -> "LangfuseEvent":
1432        """Update is not allowed for LangfuseEvent because events cannot be updated.
1433
1434        This method logs a warning and returns self without making changes.
1435
1436        Returns:
1437            self: Returns the unchanged LangfuseEvent instance
1438        """
1439        langfuse_logger.warning(
1440            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1441        )
1442        return self

Specialized span implementation for Langfuse Events.

LangfuseEvent( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1372    def __init__(
1373        self,
1374        *,
1375        otel_span: otel_trace_api.Span,
1376        langfuse_client: "Langfuse",
1377        input: Optional[Any] = None,
1378        output: Optional[Any] = None,
1379        metadata: Optional[Any] = None,
1380        environment: Optional[str] = None,
1381        release: Optional[str] = None,
1382        version: Optional[str] = None,
1383        level: Optional[SpanLevel] = None,
1384        status_message: Optional[str] = None,
1385    ):
1386        """Initialize a new LangfuseEvent span.
1387
1388        Args:
1389            otel_span: The OpenTelemetry span to wrap
1390            langfuse_client: Reference to the parent Langfuse client
1391            input: Input data for the event
1392            output: Output from the event
1393            metadata: Additional metadata to associate with the generation
1394            environment: The tracing environment
1395            release: Release identifier for the application
1396            version: Version identifier for the model or component
1397            level: Importance level of the generation (info, warning, error)
1398            status_message: Optional status message for the generation
1399        """
1400        super().__init__(
1401            otel_span=otel_span,
1402            as_type="event",
1403            langfuse_client=langfuse_client,
1404            input=input,
1405            output=output,
1406            metadata=metadata,
1407            environment=environment,
1408            release=release,
1409            version=version,
1410            level=level,
1411            status_message=status_message,
1412        )

Initialize a new LangfuseEvent span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the event
  • output: Output from the event
  • metadata: Additional metadata to associate with the event
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the code or component
  • level: Importance level of the event (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the event
def update( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, **kwargs: Any) -> LangfuseEvent:
1414    def update(
1415        self,
1416        *,
1417        name: Optional[str] = None,
1418        input: Optional[Any] = None,
1419        output: Optional[Any] = None,
1420        metadata: Optional[Any] = None,
1421        version: Optional[str] = None,
1422        level: Optional[SpanLevel] = None,
1423        status_message: Optional[str] = None,
1424        completion_start_time: Optional[datetime] = None,
1425        model: Optional[str] = None,
1426        model_parameters: Optional[Dict[str, MapValue]] = None,
1427        usage_details: Optional[Dict[str, int]] = None,
1428        cost_details: Optional[Dict[str, float]] = None,
1429        prompt: Optional[PromptClient] = None,
1430        **kwargs: Any,
1431    ) -> "LangfuseEvent":
1432        """Update is not allowed for LangfuseEvent because events cannot be updated.
1433
1434        This method logs a warning and returns self without making changes.
1435
1436        Returns:
1437            self: Returns the unchanged LangfuseEvent instance
1438        """
1439        langfuse_logger.warning(
1440            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1441        )
1442        return self

Update is not allowed for LangfuseEvent because events cannot be updated.

This method logs a warning and returns self without making changes.

Returns:

self: Returns the unchanged LangfuseEvent instance

class LangfuseOtelSpanAttributes:
28class LangfuseOtelSpanAttributes:
29    # Langfuse-Trace attributes
30    TRACE_NAME = "langfuse.trace.name"
31    TRACE_USER_ID = "user.id"
32    TRACE_SESSION_ID = "session.id"
33    TRACE_TAGS = "langfuse.trace.tags"
34    TRACE_PUBLIC = "langfuse.trace.public"
35    TRACE_METADATA = "langfuse.trace.metadata"
36    TRACE_INPUT = "langfuse.trace.input"
37    TRACE_OUTPUT = "langfuse.trace.output"
38
39    # Langfuse-observation attributes
40    OBSERVATION_TYPE = "langfuse.observation.type"
41    OBSERVATION_METADATA = "langfuse.observation.metadata"
42    OBSERVATION_LEVEL = "langfuse.observation.level"
43    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
44    OBSERVATION_INPUT = "langfuse.observation.input"
45    OBSERVATION_OUTPUT = "langfuse.observation.output"
46
47    # Langfuse-observation of type Generation attributes
48    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
49    OBSERVATION_MODEL = "langfuse.observation.model.name"
50    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
51    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
52    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
53    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
54    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"
55
56    # General
57    ENVIRONMENT = "langfuse.environment"
58    RELEASE = "langfuse.release"
59    VERSION = "langfuse.version"
60
61    # Internal
62    AS_ROOT = "langfuse.internal.as_root"
63
64    # Experiments
65    EXPERIMENT_ID = "langfuse.experiment.id"
66    EXPERIMENT_NAME = "langfuse.experiment.name"
67    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
68    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
69    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
70    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
71    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
72    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
73    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
TRACE_NAME = 'langfuse.trace.name'
TRACE_USER_ID = 'user.id'
TRACE_SESSION_ID = 'session.id'
TRACE_TAGS = 'langfuse.trace.tags'
TRACE_PUBLIC = 'langfuse.trace.public'
TRACE_METADATA = 'langfuse.trace.metadata'
TRACE_INPUT = 'langfuse.trace.input'
TRACE_OUTPUT = 'langfuse.trace.output'
OBSERVATION_TYPE = 'langfuse.observation.type'
OBSERVATION_METADATA = 'langfuse.observation.metadata'
OBSERVATION_LEVEL = 'langfuse.observation.level'
OBSERVATION_STATUS_MESSAGE = 'langfuse.observation.status_message'
OBSERVATION_INPUT = 'langfuse.observation.input'
OBSERVATION_OUTPUT = 'langfuse.observation.output'
OBSERVATION_COMPLETION_START_TIME = 'langfuse.observation.completion_start_time'
OBSERVATION_MODEL = 'langfuse.observation.model.name'
OBSERVATION_MODEL_PARAMETERS = 'langfuse.observation.model.parameters'
OBSERVATION_USAGE_DETAILS = 'langfuse.observation.usage_details'
OBSERVATION_COST_DETAILS = 'langfuse.observation.cost_details'
OBSERVATION_PROMPT_NAME = 'langfuse.observation.prompt.name'
OBSERVATION_PROMPT_VERSION = 'langfuse.observation.prompt.version'
ENVIRONMENT = 'langfuse.environment'
RELEASE = 'langfuse.release'
VERSION = 'langfuse.version'
AS_ROOT = 'langfuse.internal.as_root'
EXPERIMENT_ID = 'langfuse.experiment.id'
EXPERIMENT_NAME = 'langfuse.experiment.name'
EXPERIMENT_DESCRIPTION = 'langfuse.experiment.description'
EXPERIMENT_METADATA = 'langfuse.experiment.metadata'
EXPERIMENT_DATASET_ID = 'langfuse.experiment.dataset.id'
EXPERIMENT_ITEM_ID = 'langfuse.experiment.item.id'
EXPERIMENT_ITEM_EXPECTED_OUTPUT = 'langfuse.experiment.item.expected_output'
EXPERIMENT_ITEM_METADATA = 'langfuse.experiment.item.metadata'
EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = 'langfuse.experiment.item.root_observation_id'
class LangfuseAgent(langfuse._client.span.LangfuseObservationWrapper):
1445class LangfuseAgent(LangfuseObservationWrapper):
1446    """Agent observation for reasoning blocks that act on tools using LLM guidance."""
1447
1448    def __init__(self, **kwargs: Any) -> None:
1449        """Initialize a new LangfuseAgent span."""
1450        kwargs["as_type"] = "agent"
1451        super().__init__(**kwargs)

Agent observation for reasoning blocks that act on tools using LLM guidance.

LangfuseAgent(**kwargs: Any)
1448    def __init__(self, **kwargs: Any) -> None:
1449        """Initialize a new LangfuseAgent span."""
1450        kwargs["as_type"] = "agent"
1451        super().__init__(**kwargs)

Initialize a new LangfuseAgent span.

class LangfuseTool(langfuse._client.span.LangfuseObservationWrapper):
1454class LangfuseTool(LangfuseObservationWrapper):
1455    """Tool observation representing external tool calls, e.g., calling a weather API."""
1456
1457    def __init__(self, **kwargs: Any) -> None:
1458        """Initialize a new LangfuseTool span."""
1459        kwargs["as_type"] = "tool"
1460        super().__init__(**kwargs)

Tool observation representing external tool calls, e.g., calling a weather API.

LangfuseTool(**kwargs: Any)
1457    def __init__(self, **kwargs: Any) -> None:
1458        """Initialize a new LangfuseTool span."""
1459        kwargs["as_type"] = "tool"
1460        super().__init__(**kwargs)

Initialize a new LangfuseTool span.

class LangfuseChain(langfuse._client.span.LangfuseObservationWrapper):
1463class LangfuseChain(LangfuseObservationWrapper):
1464    """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM."""
1465
1466    def __init__(self, **kwargs: Any) -> None:
1467        """Initialize a new LangfuseChain span."""
1468        kwargs["as_type"] = "chain"
1469        super().__init__(**kwargs)

Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.

LangfuseChain(**kwargs: Any)
1466    def __init__(self, **kwargs: Any) -> None:
1467        """Initialize a new LangfuseChain span."""
1468        kwargs["as_type"] = "chain"
1469        super().__init__(**kwargs)

Initialize a new LangfuseChain span.

class LangfuseEmbedding(langfuse._client.span.LangfuseObservationWrapper):
1481class LangfuseEmbedding(LangfuseObservationWrapper):
1482    """Embedding observation for LLM embedding calls, typically used before retrieval."""
1483
1484    def __init__(self, **kwargs: Any) -> None:
1485        """Initialize a new LangfuseEmbedding span."""
1486        kwargs["as_type"] = "embedding"
1487        super().__init__(**kwargs)

Embedding observation for LLM embedding calls, typically used before retrieval.

LangfuseEmbedding(**kwargs: Any)
1484    def __init__(self, **kwargs: Any) -> None:
1485        """Initialize a new LangfuseEmbedding span."""
1486        kwargs["as_type"] = "embedding"
1487        super().__init__(**kwargs)

Initialize a new LangfuseEmbedding span.

class LangfuseEvaluator(langfuse._client.span.LangfuseObservationWrapper):
1490class LangfuseEvaluator(LangfuseObservationWrapper):
1491    """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs."""
1492
1493    def __init__(self, **kwargs: Any) -> None:
1494        """Initialize a new LangfuseEvaluator span."""
1495        kwargs["as_type"] = "evaluator"
1496        super().__init__(**kwargs)

Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.

LangfuseEvaluator(**kwargs: Any)
1493    def __init__(self, **kwargs: Any) -> None:
1494        """Initialize a new LangfuseEvaluator span."""
1495        kwargs["as_type"] = "evaluator"
1496        super().__init__(**kwargs)

Initialize a new LangfuseEvaluator span.

class LangfuseRetriever(langfuse._client.span.LangfuseObservationWrapper):
1472class LangfuseRetriever(LangfuseObservationWrapper):
1473    """Retriever observation for data retrieval steps, e.g. vector store or database queries."""
1474
1475    def __init__(self, **kwargs: Any) -> None:
1476        """Initialize a new LangfuseRetriever span."""
1477        kwargs["as_type"] = "retriever"
1478        super().__init__(**kwargs)

Retriever observation for data retrieval steps, e.g. vector store or database queries.

LangfuseRetriever(**kwargs: Any)
1475    def __init__(self, **kwargs: Any) -> None:
1476        """Initialize a new LangfuseRetriever span."""
1477        kwargs["as_type"] = "retriever"
1478        super().__init__(**kwargs)

Initialize a new LangfuseRetriever span.

class LangfuseGuardrail(langfuse._client.span.LangfuseObservationWrapper):
1499class LangfuseGuardrail(LangfuseObservationWrapper):
1500    """Guardrail observation for protection e.g. against jailbreaks or offensive content."""
1501
1502    def __init__(self, **kwargs: Any) -> None:
1503        """Initialize a new LangfuseGuardrail span."""
1504        kwargs["as_type"] = "guardrail"
1505        super().__init__(**kwargs)

Guardrail observation for protection e.g. against jailbreaks or offensive content.

LangfuseGuardrail(**kwargs: Any)
1502    def __init__(self, **kwargs: Any) -> None:
1503        """Initialize a new LangfuseGuardrail span."""
1504        kwargs["as_type"] = "guardrail"
1505        super().__init__(**kwargs)

Initialize a new LangfuseGuardrail span.

class Evaluation:
 93class Evaluation:
 94    """Represents an evaluation result for an experiment item or an entire experiment run.
 95
 96    This class provides a strongly-typed way to create evaluation results in evaluator functions.
 97    Users must use keyword arguments when instantiating this class.
 98
 99    Attributes:
100        name: Unique identifier for the evaluation metric. Should be descriptive
101            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
102            Used for aggregation and comparison across experiment runs.
103        value: The evaluation score or result. Can be:
104            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
105            - String: For categorical results like "positive", "negative", "neutral"
106            - Boolean: For binary assessments like "passes_safety_check"
107        comment: Optional human-readable explanation of the evaluation result.
108            Useful for providing context, explaining scoring rationale, or noting
109            special conditions. Displayed in Langfuse UI for interpretability.
110        metadata: Optional structured metadata about the evaluation process.
111            Can include confidence scores, intermediate calculations, model versions,
112            or any other relevant technical details.
113        data_type: Optional score data type. Required if value is not NUMERIC.
114            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
115        config_id: Optional Langfuse score config ID.
116
117    Examples:
118        Basic accuracy evaluation:
119        ```python
120        from langfuse import Evaluation
121
122        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
123            if not expected_output:
124                return Evaluation(name="accuracy", value=0, comment="No expected output")
125
126            is_correct = output.strip().lower() == expected_output.strip().lower()
127            return Evaluation(
128                name="accuracy",
129                value=1.0 if is_correct else 0.0,
130                comment="Correct answer" if is_correct else "Incorrect answer"
131            )
132        ```
133
134        Multi-metric evaluator:
135        ```python
136        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
137            return [
138                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
139                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
140                Evaluation(
141                    name="quality",
142                    value=0.85,
143                    comment="High quality response",
144                    metadata={"confidence": 0.92, "model": "gpt-4"}
145                )
146            ]
147        ```
148
149        Categorical evaluation:
150        ```python
151        def sentiment_evaluator(*, input, output, **kwargs):
152            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
153            return Evaluation(
154                name="sentiment",
155                value=sentiment,
156                comment=f"Response expresses {sentiment} sentiment",
157                data_type="CATEGORICAL"
158            )
159        ```
160
161        Failed evaluation with error handling:
162        ```python
163        def external_api_evaluator(*, input, output, **kwargs):
164            try:
165                score = external_api.evaluate(output)
166                return Evaluation(name="external_score", value=score)
167            except Exception as e:
168                return Evaluation(
169                    name="external_score",
170                    value=0,
171                    comment=f"API unavailable: {e}",
172                    metadata={"error": str(e), "retry_count": 3}
173                )
174        ```
175
176    Note:
177        All arguments must be passed as keywords. Positional arguments are not allowed
178        to ensure code clarity and prevent errors from argument reordering.
179    """
180
181    def __init__(
182        self,
183        *,
184        name: str,
185        value: Union[int, float, str, bool],
186        comment: Optional[str] = None,
187        metadata: Optional[Dict[str, Any]] = None,
188        data_type: Optional[ScoreDataType] = None,
189        config_id: Optional[str] = None,
190    ):
191        """Initialize an Evaluation with the provided data.
192
193        Args:
194            name: Unique identifier for the evaluation metric.
195            value: The evaluation score or result.
196            comment: Optional human-readable explanation of the result.
197            metadata: Optional structured metadata about the evaluation process.
198            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
199            config_id: Optional Langfuse score config ID.
200
201        Note:
202            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
203        """
204        self.name = name
205        self.value = value
206        self.comment = comment
207        self.metadata = metadata
208        self.data_type = data_type
209        self.config_id = config_id

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=0, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=0,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[langfuse.api.ScoreDataType] = None, config_id: Optional[str] = None)
181    def __init__(
182        self,
183        *,
184        name: str,
185        value: Union[int, float, str, bool],
186        comment: Optional[str] = None,
187        metadata: Optional[Dict[str, Any]] = None,
188        data_type: Optional[ScoreDataType] = None,
189        config_id: Optional[str] = None,
190    ):
191        """Initialize an Evaluation with the provided data.
192
193        Args:
194            name: Unique identifier for the evaluation metric.
195            value: The evaluation score or result.
196            comment: Optional human-readable explanation of the result.
197            metadata: Optional structured metadata about the evaluation process.
198            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
199            config_id: Optional Langfuse score config ID.
200
201        Note:
202            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
203        """
204        self.name = name
205        self.value = value
206        self.comment = comment
207        self.metadata = metadata
208        self.data_type = data_type
209        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
class EvaluatorInputs:
 38class EvaluatorInputs:
 39    """Input data structure for evaluators, returned by mapper functions.
 40
 41    This class provides a strongly-typed container for transforming API response
 42    objects (traces, observations) into the standardized format expected
 43    by evaluator functions. It ensures consistent access to input, output, expected
 44    output, and metadata regardless of the source entity type.
 45
 46    Attributes:
 47        input: The input data that was provided to generate the output being evaluated.
 48            For traces, this might be the initial prompt or request. For observations,
 49            this could be the span's input. The exact meaning depends on your use case.
 50        output: The actual output that was produced and needs to be evaluated.
 51            For traces, this is typically the final response. For observations,
 52            this might be the generation output or span result.
 53        expected_output: Optional ground truth or expected result for comparison.
 54            Used by evaluators to assess correctness. May be None if no ground truth
 55            is available for the entity being evaluated.
 56        metadata: Optional structured metadata providing additional context for evaluation.
 57            Can include information about the entity, execution context, user attributes,
 58            or any other relevant data that evaluators might use.
 59
 60    Examples:
 61        Simple mapper for traces:
 62        ```python
 63        from langfuse import EvaluatorInputs
 64
 65        def trace_mapper(trace):
 66            return EvaluatorInputs(
 67                input=trace.input,
 68                output=trace.output,
 69                expected_output=None,  # No ground truth available
 70                metadata={"user_id": trace.user_id, "tags": trace.tags}
 71            )
 72        ```
 73
 74        Mapper for observations extracting specific fields:
 75        ```python
 76        def observation_mapper(observation):
 77            # Extract input/output from observation's data
 78            input_data = observation.input if hasattr(observation, 'input') else None
 79            output_data = observation.output if hasattr(observation, 'output') else None
 80
 81            return EvaluatorInputs(
 82                input=input_data,
 83                output=output_data,
 84                expected_output=None,
 85                metadata={
 86                    "observation_type": observation.type,
 87                    "model": observation.model,
 88                    "latency_ms": observation.end_time - observation.start_time
 89                }
 90            )
 91        ```
 92
 93
 94    Note:
 95        All arguments must be passed as keywords when instantiating this class.
 96    """
 97
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Input data structure for evaluators, returned by mapper functions.

This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.

Attributes:
  • input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
  • output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
  • expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
  • metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:

Simple mapper for traces:

from langfuse import EvaluatorInputs

def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,  # No ground truth available
        metadata={"user_id": trace.user_id, "tags": trace.tags}
    )

Mapper for observations extracting specific fields:

def observation_mapper(observation):
    # Extract input/output from observation's data
    input_data = observation.input if hasattr(observation, 'input') else None
    output_data = observation.output if hasattr(observation, 'output') else None

    return EvaluatorInputs(
        input=input_data,
        output=output_data,
        expected_output=None,
        metadata={
            "observation_type": observation.type,
            "model": observation.model,
            "latency_ms": observation.end_time - observation.start_time
        }
    )

```

Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorInputs( *, input: Any, output: Any, expected_output: Any = None, metadata: Optional[Dict[str, Any]] = None)
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Initialize EvaluatorInputs with the provided data.

Arguments:
  • input: The input data for evaluation.
  • output: The output data to be evaluated.
  • expected_output: Optional ground truth for comparison.
  • metadata: Optional additional context for evaluation.
Note:

All arguments must be provided as keywords.

input
output
expected_output
metadata
class MapperFunction(typing.Protocol):
123class MapperFunction(Protocol):
124    """Protocol defining the interface for mapper functions in batch evaluation.
125
126    Mapper functions transform API response objects (traces or observations)
127    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
128    allows you to define how to extract and structure evaluation data from different
129    entity types.
130
131    Mapper functions must:
132    - Accept a single item parameter (trace, observation)
133    - Return an EvaluatorInputs instance with input, output, expected_output, metadata
134    - Can be either synchronous or asynchronous
135    - Should handle missing or malformed data gracefully
136    """
137
138    def __call__(
139        self,
140        *,
141        item: Union["TraceWithFullDetails", "ObservationsView"],
142        **kwargs: Dict[str, Any],
143    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
144        """Transform an API response object into evaluator inputs.
145
146        This method defines how to extract evaluation-relevant data from the raw
147        API response object. The implementation should map entity-specific fields
148        to the standardized input/output/expected_output/metadata structure.
149
150        Args:
151            item: The API response object to transform. The type depends on the scope:
152                - TraceWithFullDetails: When evaluating traces
153                - ObservationsView: When evaluating observations
154
155        Returns:
156            EvaluatorInputs: A structured container with:
157                - input: The input data that generated the output
158                - output: The output to be evaluated
159                - expected_output: Optional ground truth for comparison
160                - metadata: Optional additional context
161
162            Can return either a direct EvaluatorInputs instance or an awaitable
163            (for async mappers that need to fetch additional data).
164
165        Examples:
166            Basic trace mapper:
167            ```python
168            def map_trace(trace):
169                return EvaluatorInputs(
170                    input=trace.input,
171                    output=trace.output,
172                    expected_output=None,
173                    metadata={"trace_id": trace.id, "user": trace.user_id}
174                )
175            ```
176
177            Observation mapper with conditional logic:
178            ```python
179            def map_observation(observation):
180                # Extract fields based on observation type
181                if observation.type == "GENERATION":
182                    input_data = observation.input
183                    output_data = observation.output
184                else:
185                    # For other types, use different fields
186                    input_data = observation.metadata.get("input")
187                    output_data = observation.metadata.get("output")
188
189                return EvaluatorInputs(
190                    input=input_data,
191                    output=output_data,
192                    expected_output=None,
193                    metadata={"obs_id": observation.id, "type": observation.type}
194                )
195            ```
196
197            Async mapper (if additional processing needed):
198            ```python
199            async def map_trace_async(trace):
200                # Could do async processing here if needed
201                processed_output = await some_async_transformation(trace.output)
202
203                return EvaluatorInputs(
204                    input=trace.input,
205                    output=processed_output,
206                    expected_output=None,
207                    metadata={"trace_id": trace.id}
208                )
209            ```
210        """
211        ...

Protocol defining the interface for mapper functions in batch evaluation.

Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.

Mapper functions must:

  • Accept a single item parameter (trace, observation)
  • Return an EvaluatorInputs instance with input, output, expected_output, metadata
  • Can be either synchronous or asynchronous
  • Should handle missing or malformed data gracefully
MapperFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class CompositeEvaluatorFunction(typing.Protocol):
214class CompositeEvaluatorFunction(Protocol):
215    """Protocol defining the interface for composite evaluator functions.
216
217    Composite evaluators create aggregate scores from multiple item-level evaluations.
218    This is commonly used to compute weighted averages, combined metrics, or other
219    composite assessments based on individual evaluation results.
220
221    Composite evaluators:
222    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
223      plus the list of evaluations
224    - Return either a single Evaluation, a list of Evaluations, or a dict
225    - Can be either synchronous or asynchronous
226    - Have access to both raw item data and evaluation results
227    """
228
229    def __call__(
230        self,
231        *,
232        input: Optional[Any] = None,
233        output: Optional[Any] = None,
234        expected_output: Optional[Any] = None,
235        metadata: Optional[Dict[str, Any]] = None,
236        evaluations: List[Evaluation],
237        **kwargs: Dict[str, Any],
238    ) -> Union[
239        Evaluation,
240        List[Evaluation],
241        Dict[str, Any],
242        Awaitable[Evaluation],
243        Awaitable[List[Evaluation]],
244        Awaitable[Dict[str, Any]],
245    ]:
246        r"""Create a composite evaluation from item-level evaluation results.
247
248        This method combines multiple evaluation scores into a single composite metric.
249        Common use cases include weighted averages, pass/fail decisions based on multiple
250        criteria, or custom scoring logic that considers multiple dimensions.
251
252        Args:
253            input: The input data that was provided to the system being evaluated.
254            output: The output generated by the system being evaluated.
255            expected_output: The expected/reference output for comparison (if available).
256            metadata: Additional metadata about the evaluation context.
257            evaluations: List of evaluation results from item-level evaluators.
258                Each evaluation contains name, value, comment, and metadata.
259
260        Returns:
261            Can return any of:
262            - Evaluation: A single composite evaluation result
263            - List[Evaluation]: Multiple composite evaluations
264            - Dict: A dict that will be converted to an Evaluation
265                - name: Identifier for the composite metric (e.g., "composite_score")
266                - value: The computed composite value
267                - comment: Optional explanation of how the score was computed
268                - metadata: Optional details about the composition logic
269
270            Can return either a direct Evaluation instance or an awaitable
271            (for async composite evaluators).
272
273        Examples:
274            Simple weighted average:
275            ```python
276            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
277                weights = {
278                    "accuracy": 0.5,
279                    "relevance": 0.3,
280                    "safety": 0.2
281                }
282
283                total_score = 0.0
284                total_weight = 0.0
285
286                for eval in evaluations:
287                    if eval.name in weights and isinstance(eval.value, (int, float)):
288                        total_score += eval.value * weights[eval.name]
289                        total_weight += weights[eval.name]
290
291                final_score = total_score / total_weight if total_weight > 0 else 0.0
292
293                return Evaluation(
294                    name="composite_score",
295                    value=final_score,
296                    comment=f"Weighted average of {len(evaluations)} metrics"
297                )
298            ```
299
300            Pass/fail composite based on thresholds:
301            ```python
302            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
303                # Must pass all criteria
304                thresholds = {
305                    "accuracy": 0.7,
306                    "safety": 0.9,
307                    "relevance": 0.6
308                }
309
310                passes = True
311                failing_metrics = []
312
313                for metric, threshold in thresholds.items():
314                    eval_result = next((e for e in evaluations if e.name == metric), None)
315                    if eval_result and isinstance(eval_result.value, (int, float)):
316                        if eval_result.value < threshold:
317                            passes = False
318                            failing_metrics.append(metric)
319
320                return Evaluation(
321                    name="passes_all_checks",
322                    value=passes,
323                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
324                    data_type="BOOLEAN"
325                )
326            ```
327
328            Async composite with external scoring:
329            ```python
330            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
331                # Use LLM to synthesize multiple evaluation results
332                eval_summary = "\n".join(
333                    f"- {e.name}: {e.value}" for e in evaluations
334                )
335
336                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
337                prompt += f"For the output: {output}\n"
338                prompt += "Provide an overall quality score from 0-1."
339
340                response = await openai.chat.completions.create(
341                    model="gpt-4",
342                    messages=[{"role": "user", "content": prompt}]
343                )
344
345                score = float(response.choices[0].message.content.strip())
346
347                return Evaluation(
348                    name="llm_composite_score",
349                    value=score,
350                    comment="LLM-synthesized composite score"
351                )
352            ```
353
354            Context-aware composite:
355            ```python
356            def context_composite(*, input, output, expected_output, metadata, evaluations):
357                # Adjust weighting based on metadata
358                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}
359
360                # If metadata indicates high importance, prioritize accuracy
361                if metadata and metadata.get('importance') == 'high':
362                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
363                else:
364                    weights = base_weights
365
366                total = sum(
367                    e.value * weights.get(e.name, 0)
368                    for e in evaluations
369                    if isinstance(e.value, (int, float))
370                )
371
372                return Evaluation(
373                    name="weighted_composite",
374                    value=total,
375                    comment="Context-aware weighted composite"
376                )
377            ```
378        """
379        ...

Protocol defining the interface for composite evaluator functions.

Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.

Composite evaluators:

  • Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
  • Return either a single Evaluation, a list of Evaluations, or a dict
  • Can be either synchronous or asynchronous
  • Have access to both raw item data and evaluation results
CompositeEvaluatorFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class EvaluatorStats:
382class EvaluatorStats:
383    """Statistics for a single evaluator's performance during batch evaluation.
384
385    This class tracks detailed metrics about how a specific evaluator performed
386    across all items in a batch evaluation run. It helps identify evaluator issues,
387    understand reliability, and optimize evaluation pipelines.
388
389    Attributes:
390        name: The name of the evaluator function (extracted from __name__).
391        total_runs: Total number of times the evaluator was invoked.
392        successful_runs: Number of times the evaluator completed successfully.
393        failed_runs: Number of times the evaluator raised an exception or failed.
394        total_scores_created: Total number of evaluation scores created by this evaluator.
395            Can be higher than successful_runs if the evaluator returns multiple scores.
396
397    Examples:
398        Accessing evaluator stats from batch evaluation result:
399        ```python
400        result = client.run_batched_evaluation(...)
401
402        for stats in result.evaluator_stats:
403            print(f"Evaluator: {stats.name}")
404            print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
405            print(f"  Scores created: {stats.total_scores_created}")
406
407            if stats.failed_runs > 0:
408                print(f"  ⚠️  Failed {stats.failed_runs} times")
409        ```
410
411        Identifying problematic evaluators:
412        ```python
413        result = client.run_batched_evaluation(...)
414
415        # Find evaluators with high failure rates
416        for stats in result.evaluator_stats:
417            failure_rate = stats.failed_runs / stats.total_runs
418            if failure_rate > 0.1:  # More than 10% failures
419                print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
420                print(f"    Consider debugging or removing this evaluator")
421        ```
422
423    Note:
424        All arguments must be passed as keywords when instantiating this class.
425    """
426
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Statistics for a single evaluator's performance during batch evaluation.

This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.

Attributes:
  • name: The name of the evaluator function (extracted from __name__).
  • total_runs: Total number of times the evaluator was invoked.
  • successful_runs: Number of times the evaluator completed successfully.
  • failed_runs: Number of times the evaluator raised an exception or failed.
  • total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:

Accessing evaluator stats from batch evaluation result:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    print(f"Evaluator: {stats.name}")
    print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️  Failed {stats.failed_runs} times")

Identifying problematic evaluators:

result = client.run_batched_evaluation(...)

# Find evaluators with high failure rates
for stats in result.evaluator_stats:
    failure_rate = stats.failed_runs / stats.total_runs
    if failure_rate > 0.1:  # More than 10% failures
        print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
        print(f"    Consider debugging or removing this evaluator")
Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorStats( *, name: str, total_runs: int = 0, successful_runs: int = 0, failed_runs: int = 0, total_scores_created: int = 0)
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Initialize EvaluatorStats with the provided metrics.

Arguments:
  • name: The evaluator function name.
  • total_runs: Total number of evaluator invocations.
  • successful_runs: Number of successful completions.
  • failed_runs: Number of failures.
  • total_scores_created: Total scores created by this evaluator.
Note:

All arguments must be provided as keywords.

name
total_runs
successful_runs
failed_runs
total_scores_created
class BatchEvaluationResumeToken:
455class BatchEvaluationResumeToken:
456    """Token for resuming a failed batch evaluation run.
457
458    This class encapsulates all the information needed to resume a batch evaluation
459    that was interrupted or failed partway through. It uses timestamp-based filtering
460    to avoid re-processing items that were already evaluated, even if the underlying
461    dataset changed between runs.
462
463    Attributes:
464        scope: The type of items being evaluated ("traces", "observations").
465        filter: The original JSON filter string used to query items.
466        last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item.
467            Used to construct a filter that only fetches items after this timestamp.
468        last_processed_id: The ID of the last successfully processed item, for reference.
469        items_processed: Count of items successfully processed before interruption.
470
471    Examples:
472        Resuming a failed batch evaluation:
473        ```python
474        # Initial run that fails partway through
475        try:
476            result = client.run_batched_evaluation(
477                scope="traces",
478                mapper=my_mapper,
479                evaluators=[evaluator1, evaluator2],
480                filter='{"tags": ["production"]}',
481                max_items=10000
482            )
483        except Exception as e:
484            print(f"Evaluation failed: {e}")
485
486            # Save the resume token
487            if result.resume_token:
488                # Store resume token for later (e.g., in a file or database)
489                import json
490                with open("resume_token.json", "w") as f:
491                    json.dump({
492                        "scope": result.resume_token.scope,
493                        "filter": result.resume_token.filter,
494                        "last_timestamp": result.resume_token.last_processed_timestamp,
495                        "last_id": result.resume_token.last_processed_id,
496                        "items_done": result.resume_token.items_processed
497                    }, f)
498
499        # Later, resume from where it left off
500        with open("resume_token.json") as f:
501            token_data = json.load(f)
502
503        resume_token = BatchEvaluationResumeToken(
504            scope=token_data["scope"],
505            filter=token_data["filter"],
506            last_processed_timestamp=token_data["last_timestamp"],
507            last_processed_id=token_data["last_id"],
508            items_processed=token_data["items_done"]
509        )
510
511        # Resume the evaluation
512        result = client.run_batched_evaluation(
513            scope="traces",
514            mapper=my_mapper,
515            evaluators=[evaluator1, evaluator2],
516            resume_from=resume_token
517        )
518
519        print(f"Processed {result.total_items_processed} additional items")
520        ```
521
522        Handling partial completion:
523        ```python
524        result = client.run_batched_evaluation(...)
525
526        if not result.completed:
527            print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
528            print(f"Last item: {result.resume_token.last_processed_id}")
529            print(f"Resume from: {result.resume_token.last_processed_timestamp}")
530
531            # Optionally retry automatically
532            if result.resume_token:
533                print("Retrying...")
534                result = client.run_batched_evaluation(
535                    scope=result.resume_token.scope,
536                    mapper=my_mapper,
537                    evaluators=my_evaluators,
538                    resume_from=result.resume_token
539                )
540        ```
541
542    Note:
543        All arguments must be passed as keywords when instantiating this class.
544        The timestamp-based approach means that items created after the initial run
545        but before the timestamp will be skipped. This is intentional to avoid
546        duplicates and ensure consistent evaluation.
547    """
548
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Token for resuming a failed batch evaluation run.

This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.

Attributes:
  • scope: The type of items being evaluated ("traces", "observations").
  • filter: The original JSON filter string used to query items.
  • last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
  • last_processed_id: The ID of the last successfully processed item, for reference.
  • items_processed: Count of items successfully processed before interruption.
Examples:

Resuming a failed batch evaluation:

# Initial run that fails partway through
try:
    result = client.run_batched_evaluation(
        scope="traces",
        mapper=my_mapper,
        evaluators=[evaluator1, evaluator2],
        filter='{"tags": ["production"]}',
        max_items=10000
    )
except Exception as e:
    print(f"Evaluation failed: {e}")

    # Save the resume token
    if result.resume_token:
        # Store resume token for later (e.g., in a file or database)
        import json
        with open("resume_token.json", "w") as f:
            json.dump({
                "scope": result.resume_token.scope,
                "filter": result.resume_token.filter,
                "last_timestamp": result.resume_token.last_processed_timestamp,
                "last_id": result.resume_token.last_processed_id,
                "items_done": result.resume_token.items_processed
            }, f)

# Later, resume from where it left off
with open("resume_token.json") as f:
    token_data = json.load(f)

resume_token = BatchEvaluationResumeToken(
    scope=token_data["scope"],
    filter=token_data["filter"],
    last_processed_timestamp=token_data["last_timestamp"],
    last_processed_id=token_data["last_id"],
    items_processed=token_data["items_done"]
)

# Resume the evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    resume_from=resume_token
)

print(f"Processed {result.total_items_processed} additional items")

Handling partial completion:

result = client.run_batched_evaluation(...)

if not result.completed:
    print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
    print(f"Last item: {result.resume_token.last_processed_id}")
    print(f"Resume from: {result.resume_token.last_processed_timestamp}")

    # Optionally retry automatically
    if result.resume_token:
        print("Retrying...")
        result = client.run_batched_evaluation(
            scope=result.resume_token.scope,
            mapper=my_mapper,
            evaluators=my_evaluators,
            resume_from=result.resume_token
        )
Note:

All arguments must be passed as keywords when instantiating this class. The timestamp-based approach means that items created after the initial run but before the timestamp will be skipped. This is intentional to avoid duplicates and ensure consistent evaluation.

BatchEvaluationResumeToken( *, scope: str, filter: Optional[str], last_processed_timestamp: str, last_processed_id: str, items_processed: int)
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Initialize BatchEvaluationResumeToken with the provided state.

Arguments:
  • scope: The scope type ("traces", "observations").
  • filter: The original JSON filter string.
  • last_processed_timestamp: ISO 8601 timestamp of last processed item.
  • last_processed_id: ID of last processed item.
  • items_processed: Count of items processed before interruption.
Note:

All arguments must be provided as keywords.

scope
filter
last_processed_timestamp
last_processed_id
items_processed
class BatchEvaluationResult:
577class BatchEvaluationResult:
578    r"""Complete result structure for batch evaluation execution.
579
580    This class encapsulates comprehensive statistics and metadata about a batch
581    evaluation run, including counts, evaluator-specific metrics, timing information,
582    error details, and resume capability.
583
584    Attributes:
585        total_items_fetched: Total number of items fetched from the API.
586        total_items_processed: Number of items successfully evaluated.
587        total_items_failed: Number of items that failed during evaluation.
588        total_scores_created: Total scores created by all item-level evaluators.
589        total_composite_scores_created: Scores created by the composite evaluator.
590        total_evaluations_failed: Number of individual evaluator failures across all items.
591        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
592        resume_token: Token for resuming if evaluation was interrupted (None if completed).
593        completed: True if all items were processed, False if stopped early or failed.
594        duration_seconds: Total time taken to execute the batch evaluation.
595        failed_item_ids: List of IDs for items that failed evaluation.
596        error_summary: Dictionary mapping error types to occurrence counts.
597        has_more_items: True if max_items limit was reached but more items exist.
598        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
599
600    Examples:
601        Basic result inspection:
602        ```python
603        result = client.run_batched_evaluation(...)
604
605        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
606        print(f"Scores created: {result.total_scores_created}")
607        print(f"Duration: {result.duration_seconds:.2f}s")
608        print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
609        ```
610
611        Detailed analysis with evaluator stats:
612        ```python
613        result = client.run_batched_evaluation(...)
614
615        print(f"\n📊 Batch Evaluation Results")
616        print(f"{'='*50}")
617        print(f"Items processed: {result.total_items_processed}")
618        print(f"Items failed: {result.total_items_failed}")
619        print(f"Scores created: {result.total_scores_created}")
620
621        if result.total_composite_scores_created > 0:
622            print(f"Composite scores: {result.total_composite_scores_created}")
623
624        print(f"\n📈 Evaluator Performance:")
625        for stats in result.evaluator_stats:
626            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
627            print(f"\n  {stats.name}:")
628            print(f"    Success rate: {success_rate:.1%}")
629            print(f"    Scores created: {stats.total_scores_created}")
630            if stats.failed_runs > 0:
631                print(f"    ⚠️  Failures: {stats.failed_runs}")
632
633        if result.error_summary:
634            print(f"\n⚠️  Errors encountered:")
635            for error_type, count in result.error_summary.items():
636                print(f"    {error_type}: {count}")
637        ```
638
639        Handling incomplete runs:
640        ```python
641        result = client.run_batched_evaluation(...)
642
643        if not result.completed:
644            print("⚠️  Evaluation incomplete!")
645
646            if result.resume_token:
647                print(f"Processed {result.resume_token.items_processed} items before failure")
648                print(f"Use resume_from parameter to continue from:")
649                print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
650                print(f"  Last ID: {result.resume_token.last_processed_id}")
651
652        if result.has_more_items:
653            print(f"ℹ️  More items available beyond max_items limit")
654        ```
655
656        Performance monitoring:
657        ```python
658        result = client.run_batched_evaluation(...)
659
660        items_per_second = result.total_items_processed / result.duration_seconds
661        avg_scores_per_item = result.total_scores_created / result.total_items_processed
662
663        print(f"Performance metrics:")
664        print(f"  Throughput: {items_per_second:.2f} items/second")
665        print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
666        print(f"  Total duration: {result.duration_seconds:.2f}s")
667
668        if result.total_evaluations_failed > 0:
669            failure_rate = result.total_evaluations_failed / (
670                result.total_items_processed * len(result.evaluator_stats)
671            )
672            print(f"  Evaluation failure rate: {failure_rate:.1%}")
673        ```
674
675    Note:
676        All arguments must be passed as keywords when instantiating this class.
677    """
678
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations
732
733    def __str__(self) -> str:
734        """Return a formatted string representation of the batch evaluation results.
735
736        Returns:
737            A multi-line string with a summary of the evaluation results.
738        """
739        lines = []
740        lines.append("=" * 60)
741        lines.append("Batch Evaluation Results")
742        lines.append("=" * 60)
743
744        # Summary statistics
745        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
746        lines.append(f"Duration: {self.duration_seconds:.2f}s")
747        lines.append(f"\nItems fetched: {self.total_items_fetched}")
748        lines.append(f"Items processed: {self.total_items_processed}")
749
750        if self.total_items_failed > 0:
751            lines.append(f"Items failed: {self.total_items_failed}")
752
753        # Success rate
754        if self.total_items_fetched > 0:
755            success_rate = self.total_items_processed / self.total_items_fetched * 100
756            lines.append(f"Success rate: {success_rate:.1f}%")
757
758        # Scores created
759        lines.append(f"\nScores created: {self.total_scores_created}")
760        if self.total_composite_scores_created > 0:
761            lines.append(f"Composite scores: {self.total_composite_scores_created}")
762
763        total_scores = self.total_scores_created + self.total_composite_scores_created
764        lines.append(f"Total scores: {total_scores}")
765
766        # Evaluator statistics
767        if self.evaluator_stats:
768            lines.append("\nEvaluator Performance:")
769            for stats in self.evaluator_stats:
770                lines.append(f"  {stats.name}:")
771                if stats.total_runs > 0:
772                    success_rate = (
773                        stats.successful_runs / stats.total_runs * 100
774                        if stats.total_runs > 0
775                        else 0
776                    )
777                    lines.append(
778                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
779                        f"({success_rate:.1f}% success)"
780                    )
781                    lines.append(f"    Scores created: {stats.total_scores_created}")
782                    if stats.failed_runs > 0:
783                        lines.append(f"    Failed runs: {stats.failed_runs}")
784
785        # Performance metrics
786        if self.total_items_processed > 0 and self.duration_seconds > 0:
787            items_per_sec = self.total_items_processed / self.duration_seconds
788            lines.append("\nPerformance:")
789            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
790            if self.total_scores_created > 0:
791                avg_scores = self.total_scores_created / self.total_items_processed
792                lines.append(f"  Avg scores per item: {avg_scores:.2f}")
793
794        # Errors and warnings
795        if self.error_summary:
796            lines.append("\nErrors encountered:")
797            for error_type, count in self.error_summary.items():
798                lines.append(f"  {error_type}: {count}")
799
800        # Incomplete run information
801        if not self.completed:
802            lines.append("\nWarning: Evaluation incomplete")
803            if self.resume_token:
804                lines.append(
805                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
806                )
807                lines.append(f"  Items processed: {self.resume_token.items_processed}")
808                lines.append("  Use resume_from parameter to continue")
809
810        if self.has_more_items:
811            lines.append("\nNote: More items available beyond max_items limit")
812
813        lines.append("=" * 60)
814        return "\n".join(lines)

Complete result structure for batch evaluation execution.

This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.

Attributes:
  • total_items_fetched: Total number of items fetched from the API.
  • total_items_processed: Number of items successfully evaluated.
  • total_items_failed: Number of items that failed during evaluation.
  • total_scores_created: Total scores created by all item-level evaluators.
  • total_composite_scores_created: Scores created by the composite evaluator.
  • total_evaluations_failed: Number of individual evaluator failures across all items.
  • evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
  • resume_token: Token for resuming if evaluation was interrupted (None if completed).
  • completed: True if all items were processed, False if stopped early or failed.
  • duration_seconds: Total time taken to execute the batch evaluation.
  • failed_item_ids: List of IDs for items that failed evaluation.
  • error_summary: Dictionary mapping error types to occurrence counts.
  • has_more_items: True if max_items limit was reached but more items exist.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:

Basic result inspection:

result = client.run_batched_evaluation(...)

print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
print(f"Scores created: {result.total_scores_created}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")

Detailed analysis with evaluator stats:

result = client.run_batched_evaluation(...)

print(f"\n📊 Batch Evaluation Results")
print(f"{'='*50}")
print(f"Items processed: {result.total_items_processed}")
print(f"Items failed: {result.total_items_failed}")
print(f"Scores created: {result.total_scores_created}")

if result.total_composite_scores_created > 0:
    print(f"Composite scores: {result.total_composite_scores_created}")

print(f"\n📈 Evaluator Performance:")
for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
    print(f"\n  {stats.name}:")
    print(f"    Success rate: {success_rate:.1%}")
    print(f"    Scores created: {stats.total_scores_created}")
    if stats.failed_runs > 0:
        print(f"    ⚠️  Failures: {stats.failed_runs}")

if result.error_summary:
    print(f"\n⚠️  Errors encountered:")
    for error_type, count in result.error_summary.items():
        print(f"    {error_type}: {count}")

Handling incomplete runs:

result = client.run_batched_evaluation(...)

if not result.completed:
    print("⚠️  Evaluation incomplete!")

    if result.resume_token:
        print(f"Processed {result.resume_token.items_processed} items before failure")
        print(f"Use resume_from parameter to continue from:")
        print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
        print(f"  Last ID: {result.resume_token.last_processed_id}")

if result.has_more_items:
    print(f"ℹ️  More items available beyond max_items limit")

Performance monitoring:

result = client.run_batched_evaluation(...)

items_per_second = result.total_items_processed / result.duration_seconds
avg_scores_per_item = result.total_scores_created / result.total_items_processed

print(f"Performance metrics:")
print(f"  Throughput: {items_per_second:.2f} items/second")
print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
print(f"  Total duration: {result.duration_seconds:.2f}s")

if result.total_evaluations_failed > 0:
    failure_rate = result.total_evaluations_failed / (
        result.total_items_processed * len(result.evaluator_stats)
    )
    print(f"  Evaluation failure rate: {failure_rate:.1%}")
Note:

All arguments must be passed as keywords when instantiating this class.

BatchEvaluationResult( *, total_items_fetched: int, total_items_processed: int, total_items_failed: int, total_scores_created: int, total_composite_scores_created: int, total_evaluations_failed: int, evaluator_stats: List[EvaluatorStats], resume_token: Optional[BatchEvaluationResumeToken], completed: bool, duration_seconds: float, failed_item_ids: List[str], error_summary: Dict[str, int], has_more_items: bool, item_evaluations: Dict[str, List[Evaluation]])
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations

Initialize BatchEvaluationResult with comprehensive statistics.

Arguments:
  • total_items_fetched: Total items fetched from API.
  • total_items_processed: Items successfully evaluated.
  • total_items_failed: Items that failed evaluation.
  • total_scores_created: Scores from item-level evaluators.
  • total_composite_scores_created: Scores from composite evaluator.
  • total_evaluations_failed: Individual evaluator failures.
  • evaluator_stats: Per-evaluator statistics.
  • resume_token: Token for resuming (None if completed).
  • completed: Whether all items were processed.
  • duration_seconds: Total execution time.
  • failed_item_ids: IDs of failed items.
  • error_summary: Error types and counts.
  • has_more_items: Whether more items exist beyond max_items.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:

All arguments must be provided as keywords.

total_items_fetched
total_items_processed
total_items_failed
total_scores_created
total_composite_scores_created
total_evaluations_failed
evaluator_stats
resume_token
completed
duration_seconds
failed_item_ids
error_summary
has_more_items
item_evaluations
def is_default_export_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
73def is_default_export_span(span: ReadableSpan) -> bool:
74    """Return whether a span should be exported by default."""
75    return (
76        is_langfuse_span(span) or is_genai_span(span) or is_known_llm_instrumentor(span)
77    )

Return whether a span should be exported by default.

def is_langfuse_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
36def is_langfuse_span(span: ReadableSpan) -> bool:
37    """Return whether the span was created by the Langfuse SDK tracer."""
38    return (
39        span.instrumentation_scope is not None
40        and span.instrumentation_scope.name == LANGFUSE_TRACER_NAME
41    )

Return whether the span was created by the Langfuse SDK tracer.

def is_genai_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
44def is_genai_span(span: ReadableSpan) -> bool:
45    """Return whether the span has any ``gen_ai.*`` semantic convention attribute."""
46    if span.attributes is None:
47        return False
48
49    return any(
50        isinstance(key, str) and key.startswith("gen_ai")
51        for key in span.attributes.keys()
52    )

Return whether the span has any gen_ai.* semantic convention attribute.

def is_known_llm_instrumentor(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
60def is_known_llm_instrumentor(span: ReadableSpan) -> bool:
61    """Return whether the span comes from a known LLM instrumentation scope."""
62    if span.instrumentation_scope is None:
63        return False
64
65    scope_name = span.instrumentation_scope.name
66
67    return any(
68        _matches_scope_prefix(scope_name, prefix)
69        for prefix in KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES
70    )

Return whether the span comes from a known LLM instrumentation scope.

KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES = frozenset({'agent_framework', 'opentelemetry.instrumentation.anthropic', 'litellm', 'ai', 'haystack', 'strands-agents', 'vllm', 'langfuse-sdk', 'langsmith', 'openinference'})