langfuse

 1""".. include:: ../README.md"""
 2
 3from langfuse.batch_evaluation import (
 4    BatchEvaluationResult,
 5    BatchEvaluationResumeToken,
 6    CompositeEvaluatorFunction,
 7    EvaluatorInputs,
 8    EvaluatorStats,
 9    MapperFunction,
10)
11from langfuse.experiment import Evaluation
12
13from ._client import client as _client_module
14from ._client.attributes import LangfuseOtelSpanAttributes
15from ._client.constants import ObservationTypeLiteral
16from ._client.get_client import get_client
17from ._client.observe import observe
18from ._client.propagation import propagate_attributes
19from ._client.span import (
20    LangfuseAgent,
21    LangfuseChain,
22    LangfuseEmbedding,
23    LangfuseEvaluator,
24    LangfuseEvent,
25    LangfuseGeneration,
26    LangfuseGuardrail,
27    LangfuseRetriever,
28    LangfuseSpan,
29    LangfuseTool,
30)
31from ._version import __version__
32from .span_filter import (
33    KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES,
34    is_default_export_span,
35    is_genai_span,
36    is_known_llm_instrumentor,
37    is_langfuse_span,
38)
39
40Langfuse = _client_module.Langfuse
41
42__all__ = [
43    "Langfuse",
44    "get_client",
45    "observe",
46    "propagate_attributes",
47    "ObservationTypeLiteral",
48    "LangfuseSpan",
49    "LangfuseGeneration",
50    "LangfuseEvent",
51    "LangfuseOtelSpanAttributes",
52    "LangfuseAgent",
53    "LangfuseTool",
54    "LangfuseChain",
55    "LangfuseEmbedding",
56    "LangfuseEvaluator",
57    "LangfuseRetriever",
58    "LangfuseGuardrail",
59    "Evaluation",
60    "EvaluatorInputs",
61    "MapperFunction",
62    "CompositeEvaluatorFunction",
63    "EvaluatorStats",
64    "BatchEvaluationResumeToken",
65    "BatchEvaluationResult",
66    "__version__",
67    "is_default_export_span",
68    "is_langfuse_span",
69    "is_genai_span",
70    "is_known_llm_instrumentor",
71    "KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES",
72    "experiment",
73    "api",
74]
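A minimal usage sketch (assuming LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY are set
in the environment; the decorated function is illustrative):

```python
from langfuse import get_client, observe

@observe()
def handle_query(question: str) -> str:
    # A span is created automatically for the decorated function
    return f"stub answer to {question}"

handle_query("Tell me about AI")

# get_client() returns the singleton client, e.g. to flush pending spans at shutdown
get_client().flush()
```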
class Langfuse:
class Langfuse:
    """Main client for Langfuse tracing and platform features.

    This class provides an interface for creating and managing traces, spans,
    and generations in Langfuse, as well as for interacting with the Langfuse API.

    The client features a thread-safe singleton pattern for each unique public API key,
    ensuring consistent trace context propagation across your application. It implements
    efficient batching of spans with configurable flush settings and includes background
    thread management for media uploads and score ingestion.

    Configuration is flexible through either direct parameters or environment variables,
    with graceful fallbacks and runtime configuration updates.

    Attributes:
        api: Synchronous API client for Langfuse backend communication
        async_api: Asynchronous API client for Langfuse backend communication
        _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components

    Parameters:
        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
        mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
        blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:
            ```python
            from langfuse.span_filter import is_default_export_span
            blocked = {"sqlite", "requests"}

            should_export_span = lambda span: (
                is_default_export_span(span)
                and (
                    span.instrumentation_scope is None
                    or span.instrumentation_scope.name not in blocked
                )
            )
            ```
        should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If `span_exporter` is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
        tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. Useful for keeping Langfuse tracing disconnected from other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
        span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire `base_url`, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include `x-langfuse-ingestion-version=4` on the exporter to enable real-time processing of exported spans.

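        For `span_exporter`, a minimal sketch (the endpoint path and auth header are
        assumptions; consult the Langfuse OTLP docs for your deployment):
            ```python
            import base64

            from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter

            auth = base64.b64encode(b"pk-...:sk-...").decode()  # public:secret key pair
            exporter = OTLPSpanExporter(
                endpoint="https://cloud.langfuse.com/api/public/otel/v1/traces",  # assumed path
                headers={
                    "Authorization": f"Basic {auth}",
                    "x-langfuse-ingestion-version": "4",  # Langfuse v4 / Fast Preview
                },
            )
            langfuse = Langfuse(span_exporter=exporter)
            ```
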
    Example:
        ```python
        from langfuse import Langfuse

        # Initialize the client (reads from env vars if not provided)
        langfuse = Langfuse(
            public_key="your-public-key",
            secret_key="your-secret-key",
            base_url="https://cloud.langfuse.com",  # Optional, default shown
        )

        # Create a trace span
        with langfuse.start_as_current_observation(name="process-query") as span:
            # Your application code here

            # Create a nested generation span for an LLM call
            with span.start_as_current_generation(
                name="generate-response",
                model="gpt-4",
                input={"query": "Tell me about AI"},
                model_parameters={"temperature": 0.7, "max_tokens": 500}
            ) as generation:
                # Generate response here
                response = "AI is a field of computer science..."

                generation.update(
                    output=response,
                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
                    cost_details={"total_cost": 0.0023}
                )

                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
        ```
    """

    _resources: Optional[LangfuseResourceManager] = None
    _mask: Optional[MaskFunction] = None
    _otel_tracer: otel_trace_api.Tracer

    def __init__(
        self,
        *,
        public_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        base_url: Optional[str] = None,
        host: Optional[str] = None,
        timeout: Optional[int] = None,
        httpx_client: Optional[httpx.Client] = None,
        debug: bool = False,
        tracing_enabled: Optional[bool] = True,
        flush_at: Optional[int] = None,
        flush_interval: Optional[float] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        media_upload_thread_count: Optional[int] = None,
        sample_rate: Optional[float] = None,
        mask: Optional[MaskFunction] = None,
        blocked_instrumentation_scopes: Optional[List[str]] = None,
        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
        additional_headers: Optional[Dict[str, str]] = None,
        tracer_provider: Optional[TracerProvider] = None,
        span_exporter: Optional[SpanExporter] = None,
    ):
        self._base_url = (
            base_url
            or os.environ.get(LANGFUSE_BASE_URL)
            or host
            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
        )
        self._environment = environment or cast(
            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
        )
        self._release = (
            release
            or os.environ.get(LANGFUSE_RELEASE, None)
            or get_common_release_envs()
        )
        self._project_id: Optional[str] = None
        # Honor an explicit sample_rate of 0.0 instead of treating it as falsy
        sample_rate = (
            sample_rate
            if sample_rate is not None
            else float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
        )
        if not 0.0 <= sample_rate <= 1.0:
            raise ValueError(
                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
            )

        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))

        self._tracing_enabled = (
            tracing_enabled
            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
        )
        if not self._tracing_enabled:
            langfuse_logger.info(
                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
            )

        debug = (
            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
        )
        if debug:
            logging.basicConfig(
                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            )
            langfuse_logger.setLevel(logging.DEBUG)

        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
        if public_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
            )
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
        if secret_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
            )
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
            langfuse_logger.warning(
                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
            )

        if blocked_instrumentation_scopes is not None:
            warnings.warn(
                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
                "Use `should_export_span` instead. Example: "
                "from langfuse.span_filter import is_default_export_span; "
                'blocked={"scope"}; should_export_span=lambda span: '
                "is_default_export_span(span) and (span.instrumentation_scope is None or "
                "span.instrumentation_scope.name not in blocked).",
                DeprecationWarning,
                stacklevel=2,
            )

        # Initialize api and tracer if requirements are met
        self._resources = LangfuseResourceManager(
            public_key=public_key,
            secret_key=secret_key,
            base_url=self._base_url,
            timeout=timeout,
            environment=self._environment,
            release=release,
            flush_at=flush_at,
            flush_interval=flush_interval,
            httpx_client=httpx_client,
            media_upload_thread_count=media_upload_thread_count,
            sample_rate=sample_rate,
            mask=mask,
            tracing_enabled=self._tracing_enabled,
            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
            should_export_span=should_export_span,
            additional_headers=additional_headers,
            tracer_provider=tracer_provider,
            span_exporter=span_exporter,
        )
        self._mask = self._resources.mask

        self._otel_tracer = (
            self._resources.tracer
            if self._tracing_enabled and self._resources.tracer is not None
            else otel_trace_api.NoOpTracer()
        )
        self.api = self._resources.api
        self.async_api = self._resources.async_api

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseGeneration: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseSpan: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseAgent: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["tool"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseTool: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseChain: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseRetriever: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseEvaluator: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseEmbedding: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseGuardrail: ...

    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: ObservationTypeLiteralNoEvent = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> Union[
        LangfuseSpan,
        LangfuseGeneration,
        LangfuseAgent,
        LangfuseTool,
        LangfuseChain,
        LangfuseRetriever,
        LangfuseEvaluator,
        LangfuseEmbedding,
        LangfuseGuardrail,
    ]:
        """Create a new observation of the specified type.

        This method creates a new observation but does not set it as the current span in the
        context. To create and use an observation within a context, use start_as_current_observation().

        Args:
            trace_context: Optional context for connecting to an existing trace
            name: Name of the observation
            as_type: Type of observation to create (defaults to "span")
            input: Input data for the operation
            output: Output data from the operation
            metadata: Additional metadata to associate with the observation
            version: Version identifier for the code or component
            level: Importance level of the observation
            status_message: Optional status message for the observation
            completion_start_time: When the model started generating (for generation types)
            model: Name/identifier of the AI model used (for generation types)
            model_parameters: Parameters used for the model (for generation types)
            usage_details: Token usage information (for generation types)
            cost_details: Cost information (for generation types)
            prompt: Associated prompt template (for generation types)

        Returns:
            An observation object of the appropriate type that must be ended with .end()
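
        Example:
            A minimal sketch (the names and the fetch_documents helper are illustrative):
            ```python
            obs = langfuse.start_observation(name="fetch-docs", as_type="retriever")
            try:
                docs = fetch_documents(query)  # hypothetical helper
                obs.update(output=docs)
            finally:
                obs.end()  # observations from start_observation() must be ended manually

            # Attach to an existing trace instead (IDs are illustrative)
            child = langfuse.start_observation(
                name="enrich",
                trace_context={"trace_id": existing_trace_id},
            )
            child.end()
            ```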
 563        """
 564        if trace_context:
 565            trace_id = trace_context.get("trace_id", None)
 566            parent_span_id = trace_context.get("parent_span_id", None)
 567
 568            if trace_id:
 569                remote_parent_span = self._create_remote_parent_span(
 570                    trace_id=trace_id, parent_span_id=parent_span_id
 571                )
 572
 573                with otel_trace_api.use_span(
 574                    cast(otel_trace_api.Span, remote_parent_span)
 575                ):
 576                    otel_span = self._otel_tracer.start_span(name=name)
 577                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
 578
 579                    return self._create_observation_from_otel_span(
 580                        otel_span=otel_span,
 581                        as_type=as_type,
 582                        input=input,
 583                        output=output,
 584                        metadata=metadata,
 585                        version=version,
 586                        level=level,
 587                        status_message=status_message,
 588                        completion_start_time=completion_start_time,
 589                        model=model,
 590                        model_parameters=model_parameters,
 591                        usage_details=usage_details,
 592                        cost_details=cost_details,
 593                        prompt=prompt,
 594                    )
 595
 596        otel_span = self._otel_tracer.start_span(name=name)
 597
 598        return self._create_observation_from_otel_span(
 599            otel_span=otel_span,
 600            as_type=as_type,
 601            input=input,
 602            output=output,
 603            metadata=metadata,
 604            version=version,
 605            level=level,
 606            status_message=status_message,
 607            completion_start_time=completion_start_time,
 608            model=model,
 609            model_parameters=model_parameters,
 610            usage_details=usage_details,
 611            cost_details=cost_details,
 612            prompt=prompt,
 613        )
 614
 615    def _create_observation_from_otel_span(
 616        self,
 617        *,
 618        otel_span: otel_trace_api.Span,
 619        as_type: ObservationTypeLiteralNoEvent,
 620        input: Optional[Any] = None,
 621        output: Optional[Any] = None,
 622        metadata: Optional[Any] = None,
 623        version: Optional[str] = None,
 624        level: Optional[SpanLevel] = None,
 625        status_message: Optional[str] = None,
 626        completion_start_time: Optional[datetime] = None,
 627        model: Optional[str] = None,
 628        model_parameters: Optional[Dict[str, MapValue]] = None,
 629        usage_details: Optional[Dict[str, int]] = None,
 630        cost_details: Optional[Dict[str, float]] = None,
 631        prompt: Optional[PromptClient] = None,
 632    ) -> Union[
 633        LangfuseSpan,
 634        LangfuseGeneration,
 635        LangfuseAgent,
 636        LangfuseTool,
 637        LangfuseChain,
 638        LangfuseRetriever,
 639        LangfuseEvaluator,
 640        LangfuseEmbedding,
 641        LangfuseGuardrail,
 642    ]:
 643        """Create the appropriate observation type from an OTEL span."""
        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
            observation_class = self._get_span_class(as_type)
            # Type ignore needed: _get_span_class is not overloaded, so it may
            # also return LangfuseEvent, and the classes take different arguments
            return observation_class(  # type: ignore[return-value,call-arg]
                otel_span=otel_span,
                langfuse_client=self,
                environment=self._environment,
                release=self._release,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
                completion_start_time=completion_start_time,
                model=model,
                model_parameters=model_parameters,
                usage_details=usage_details,
                cost_details=cost_details,
                prompt=prompt,
            )
        else:
            # For other types (e.g. span, guardrail), create the appropriate class without generation properties
            observation_class = self._get_span_class(as_type)
            # Type ignore needed: _get_span_class is not overloaded, so it may
            # also return LangfuseEvent, and the classes take different arguments
            return observation_class(  # type: ignore[return-value,call-arg]
                otel_span=otel_span,
                langfuse_client=self,
                environment=self._environment,
                release=self._release,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
            )

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGeneration]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseSpan]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseAgent]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["tool"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseTool]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseChain]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseRetriever]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEvaluator]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEmbedding]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGuardrail]: ...

    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: ObservationTypeLiteralNoEvent = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> Union[
        _AgnosticContextManager[LangfuseGeneration],
        _AgnosticContextManager[LangfuseSpan],
        _AgnosticContextManager[LangfuseAgent],
        _AgnosticContextManager[LangfuseTool],
        _AgnosticContextManager[LangfuseChain],
        _AgnosticContextManager[LangfuseRetriever],
        _AgnosticContextManager[LangfuseEvaluator],
        _AgnosticContextManager[LangfuseEmbedding],
        _AgnosticContextManager[LangfuseGuardrail],
    ]:
        """Create a new observation and set it as the current span in a context manager.

        This method creates a new observation of the specified type and sets it as the
        current span within a context manager. Use this method with a 'with' statement to
        automatically handle the observation lifecycle within a code block.

        The created observation will be the child of the current span in the context.

        Args:
            trace_context: Optional context for connecting to an existing trace
            name: Name of the observation (e.g., function or operation name)
            as_type: Type of observation to create (defaults to "span")
            input: Input data for the operation (can be any JSON-serializable object)
            output: Output data from the operation (can be any JSON-serializable object)
            metadata: Additional metadata to associate with the observation
            version: Version identifier for the code or component
            level: Importance level of the observation (info, warning, error)
            status_message: Optional status message for the observation
            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.

            The following parameters are available when as_type is "generation" or "embedding":
            completion_start_time: When the model started generating the response
            model: Name/identifier of the AI model used (e.g., "gpt-4")
            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
            cost_details: Cost information for the model call
            prompt: Associated prompt template from Langfuse prompt management

        Returns:
            A context manager that yields the appropriate observation type based on as_type

        Example:
            ```python
            # Create a span
            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
                # Do work
                result = process_data()
                span.update(output=result)

                # Create a child span automatically
                with span.start_as_current_observation(name="sub-operation") as child_span:
                    # Do sub-operation work
                    child_span.update(output="sub-result")

            # Create a tool observation
            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
                # Do tool work
                results = search_web(query)
                tool.update(output=results)

            # Create a generation observation
            with langfuse.start_as_current_observation(
                name="answer-generation",
                as_type="generation",
                model="gpt-4"
            ) as generation:
                # Generate answer
                response = llm.generate(...)
                generation.update(output=response)
            ```
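
            To keep an observation open beyond the context manager (e.g., when
            consuming a stream), a sketch using end_on_exit (the llm.stream call
            is illustrative):
            ```python
            with langfuse.start_as_current_observation(
                name="stream-response", as_type="generation", end_on_exit=False
            ) as generation:
                stream = llm.stream(...)  # hypothetical streaming call

            # ... consume the stream after the block ...
            generation.update(output=final_text)
            generation.end()  # must be ended manually to avoid memory leaks
            ```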
 933        """
 934        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 935            if trace_context:
 936                trace_id = trace_context.get("trace_id", None)
 937                parent_span_id = trace_context.get("parent_span_id", None)
 938
 939                if trace_id:
 940                    remote_parent_span = self._create_remote_parent_span(
 941                        trace_id=trace_id, parent_span_id=parent_span_id
 942                    )
 943
 944                    return cast(
 945                        Union[
 946                            _AgnosticContextManager[LangfuseGeneration],
 947                            _AgnosticContextManager[LangfuseEmbedding],
 948                        ],
 949                        self._create_span_with_parent_context(
 950                            as_type=as_type,
 951                            name=name,
 952                            remote_parent_span=remote_parent_span,
 953                            parent=None,
 954                            end_on_exit=end_on_exit,
 955                            input=input,
 956                            output=output,
 957                            metadata=metadata,
 958                            version=version,
 959                            level=level,
 960                            status_message=status_message,
 961                            completion_start_time=completion_start_time,
 962                            model=model,
 963                            model_parameters=model_parameters,
 964                            usage_details=usage_details,
 965                            cost_details=cost_details,
 966                            prompt=prompt,
 967                        ),
 968                    )
 969
 970            return cast(
 971                Union[
 972                    _AgnosticContextManager[LangfuseGeneration],
 973                    _AgnosticContextManager[LangfuseEmbedding],
 974                ],
 975                self._start_as_current_otel_span_with_processed_media(
 976                    as_type=as_type,
 977                    name=name,
 978                    end_on_exit=end_on_exit,
 979                    input=input,
 980                    output=output,
 981                    metadata=metadata,
 982                    version=version,
 983                    level=level,
 984                    status_message=status_message,
 985                    completion_start_time=completion_start_time,
 986                    model=model,
 987                    model_parameters=model_parameters,
 988                    usage_details=usage_details,
 989                    cost_details=cost_details,
 990                    prompt=prompt,
 991                ),
 992            )
 993
 994        if as_type in get_observation_types_list(ObservationTypeSpanLike):
 995            if trace_context:
 996                trace_id = trace_context.get("trace_id", None)
 997                parent_span_id = trace_context.get("parent_span_id", None)
 998
 999                if trace_id:
1000                    remote_parent_span = self._create_remote_parent_span(
1001                        trace_id=trace_id, parent_span_id=parent_span_id
1002                    )
1003
1004                    return cast(
1005                        Union[
1006                            _AgnosticContextManager[LangfuseSpan],
1007                            _AgnosticContextManager[LangfuseAgent],
1008                            _AgnosticContextManager[LangfuseTool],
1009                            _AgnosticContextManager[LangfuseChain],
1010                            _AgnosticContextManager[LangfuseRetriever],
1011                            _AgnosticContextManager[LangfuseEvaluator],
1012                            _AgnosticContextManager[LangfuseGuardrail],
1013                        ],
1014                        self._create_span_with_parent_context(
1015                            as_type=as_type,
1016                            name=name,
1017                            remote_parent_span=remote_parent_span,
1018                            parent=None,
1019                            end_on_exit=end_on_exit,
1020                            input=input,
1021                            output=output,
1022                            metadata=metadata,
1023                            version=version,
1024                            level=level,
1025                            status_message=status_message,
1026                        ),
1027                    )
1028
1029            return cast(
1030                Union[
1031                    _AgnosticContextManager[LangfuseSpan],
1032                    _AgnosticContextManager[LangfuseAgent],
1033                    _AgnosticContextManager[LangfuseTool],
1034                    _AgnosticContextManager[LangfuseChain],
1035                    _AgnosticContextManager[LangfuseRetriever],
1036                    _AgnosticContextManager[LangfuseEvaluator],
1037                    _AgnosticContextManager[LangfuseGuardrail],
1038                ],
1039                self._start_as_current_otel_span_with_processed_media(
1040                    as_type=as_type,
1041                    name=name,
1042                    end_on_exit=end_on_exit,
1043                    input=input,
1044                    output=output,
1045                    metadata=metadata,
1046                    version=version,
1047                    level=level,
1048                    status_message=status_message,
1049                ),
1050            )
1051
1052        # This should never be reached since all valid types are handled above
1053        langfuse_logger.warning(
1054            f"Unknown observation type: {as_type}, falling back to span"
1055        )
1056        return self._start_as_current_otel_span_with_processed_media(
1057            as_type="span",
1058            name=name,
1059            end_on_exit=end_on_exit,
1060            input=input,
1061            output=output,
1062            metadata=metadata,
1063            version=version,
1064            level=level,
1065            status_message=status_message,
1066        )
1067
1068    def _get_span_class(
1069        self,
1070        as_type: ObservationTypeLiteral,
1071    ) -> Union[
1072        Type[LangfuseAgent],
1073        Type[LangfuseTool],
1074        Type[LangfuseChain],
1075        Type[LangfuseRetriever],
1076        Type[LangfuseEvaluator],
1077        Type[LangfuseEmbedding],
1078        Type[LangfuseGuardrail],
1079        Type[LangfuseGeneration],
1080        Type[LangfuseEvent],
1081        Type[LangfuseSpan],
1082    ]:
1083        """Get the appropriate span class based on as_type."""
1084        normalized_type = as_type.lower()
1085
1086        if normalized_type == "agent":
1087            return LangfuseAgent
1088        elif normalized_type == "tool":
1089            return LangfuseTool
1090        elif normalized_type == "chain":
1091            return LangfuseChain
1092        elif normalized_type == "retriever":
1093            return LangfuseRetriever
1094        elif normalized_type == "evaluator":
1095            return LangfuseEvaluator
1096        elif normalized_type == "embedding":
1097            return LangfuseEmbedding
1098        elif normalized_type == "guardrail":
1099            return LangfuseGuardrail
1100        elif normalized_type == "generation":
1101            return LangfuseGeneration
1102        elif normalized_type == "event":
1103            return LangfuseEvent
1104        elif normalized_type == "span":
1105            return LangfuseSpan
        else:
            # Unknown types fall back to a plain span
            return LangfuseSpan

    @_agnosticcontextmanager
    def _create_span_with_parent_context(
        self,
        *,
        name: str,
        parent: Optional[otel_trace_api.Span] = None,
        remote_parent_span: Optional[otel_trace_api.Span] = None,
        as_type: ObservationTypeLiteralNoEvent,
        end_on_exit: Optional[bool] = None,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> Any:
        parent_span = parent or cast(otel_trace_api.Span, remote_parent_span)

        with otel_trace_api.use_span(parent_span):
            with self._start_as_current_otel_span_with_processed_media(
                name=name,
                as_type=as_type,
                end_on_exit=end_on_exit,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
                completion_start_time=completion_start_time,
                model=model,
                model_parameters=model_parameters,
                usage_details=usage_details,
                cost_details=cost_details,
                prompt=prompt,
            ) as langfuse_span:
                if remote_parent_span is not None:
                    langfuse_span._otel_span.set_attribute(
                        LangfuseOtelSpanAttributes.AS_ROOT, True
                    )

                yield langfuse_span

    @_agnosticcontextmanager
    def _start_as_current_otel_span_with_processed_media(
        self,
        *,
        name: str,
        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
        end_on_exit: Optional[bool] = None,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> Any:
        with self._otel_tracer.start_as_current_span(
            name=name,
            end_on_exit=end_on_exit if end_on_exit is not None else True,
        ) as otel_span:
            span_class = self._get_span_class(
                as_type or "generation"
            )  # "generation" is the historical default here
1185            common_args = {
1186                "otel_span": otel_span,
1187                "langfuse_client": self,
1188                "environment": self._environment,
1189                "release": self._release,
1190                "input": input,
1191                "output": output,
1192                "metadata": metadata,
1193                "version": version,
1194                "level": level,
1195                "status_message": status_message,
1196            }
1197
1198            if span_class in [
1199                LangfuseGeneration,
1200                LangfuseEmbedding,
1201            ]:
1202                common_args.update(
1203                    {
1204                        "completion_start_time": completion_start_time,
1205                        "model": model,
1206                        "model_parameters": model_parameters,
1207                        "usage_details": usage_details,
1208                        "cost_details": cost_details,
1209                        "prompt": prompt,
1210                    }
1211                )
1212            # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed
1213
1214            yield span_class(**common_args)  # type: ignore[arg-type]
1215
1216    def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]:
1217        current_span = otel_trace_api.get_current_span()
1218
1219        if current_span is otel_trace_api.INVALID_SPAN:
1220            langfuse_logger.warning(
1221                "Context error: No active span in current context. Operations that depend on an active span will be skipped. "
1222                "Ensure spans are created with start_as_current_observation() or that you're operating within an active span context."
1223            )
1224            return None
1225
1226        return current_span
1227
1228    def update_current_generation(
1229        self,
1230        *,
1231        name: Optional[str] = None,
1232        input: Optional[Any] = None,
1233        output: Optional[Any] = None,
1234        metadata: Optional[Any] = None,
1235        version: Optional[str] = None,
1236        level: Optional[SpanLevel] = None,
1237        status_message: Optional[str] = None,
1238        completion_start_time: Optional[datetime] = None,
1239        model: Optional[str] = None,
1240        model_parameters: Optional[Dict[str, MapValue]] = None,
1241        usage_details: Optional[Dict[str, int]] = None,
1242        cost_details: Optional[Dict[str, float]] = None,
1243        prompt: Optional[PromptClient] = None,
1244    ) -> None:
1245        """Update the current active generation span with new information.
1246
1247        This method updates the current generation span in the active context with
1248        additional information. It's useful for adding output, usage stats, or other
1249        details that become available during or after model generation.
1250
1251        Args:
1252            name: The generation name
1253            input: Updated input data for the model
1254            output: Output from the model (e.g., completions)
1255            metadata: Additional metadata to associate with the generation
1256            version: Version identifier for the model or component
1257            level: Importance level of the generation (info, warning, error)
1258            status_message: Optional status message for the generation
1259            completion_start_time: When the model started generating the response
1260            model: Name/identifier of the AI model used (e.g., "gpt-4")
1261            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1262            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1263            cost_details: Cost information for the model call
1264            prompt: Associated prompt template from Langfuse prompt management
1265
1266        Example:
1267            ```python
1268            with langfuse.start_as_current_generation(name="answer-query") as generation:
1269                # Initial setup and API call
1270                response = llm.generate(...)
1271
1272                # Update with results that weren't available at creation time
1273                langfuse.update_current_generation(
1274                    output=response.text,
1275                    usage_details={
1276                        "prompt_tokens": response.usage.prompt_tokens,
1277                        "completion_tokens": response.usage.completion_tokens
1278                    }
1279                )
1280            ```
1281        """
1282        if not self._tracing_enabled:
1283            langfuse_logger.debug(
1284                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1285            )
1286            return
1287
1288        current_otel_span = self._get_current_otel_span()
1289
1290        if current_otel_span is not None:
1291            generation = LangfuseGeneration(
1292                otel_span=current_otel_span, langfuse_client=self
1293            )
1294
1295            if name:
1296                current_otel_span.update_name(name)
1297
1298            generation.update(
1299                input=input,
1300                output=output,
1301                metadata=metadata,
1302                version=version,
1303                level=level,
1304                status_message=status_message,
1305                completion_start_time=completion_start_time,
1306                model=model,
1307                model_parameters=model_parameters,
1308                usage_details=usage_details,
1309                cost_details=cost_details,
1310                prompt=prompt,
1311            )
1312
1313    def update_current_span(
1314        self,
1315        *,
1316        name: Optional[str] = None,
1317        input: Optional[Any] = None,
1318        output: Optional[Any] = None,
1319        metadata: Optional[Any] = None,
1320        version: Optional[str] = None,
1321        level: Optional[SpanLevel] = None,
1322        status_message: Optional[str] = None,
1323    ) -> None:
1324        """Update the current active span with new information.
1325
1326        This method updates the current span in the active context with
1327        additional information. It's useful for adding outputs or metadata
1328        that become available during execution.
1329
1330        Args:
1331            name: The span name
1332            input: Updated input data for the operation
1333            output: Output data from the operation
1334            metadata: Additional metadata to associate with the span
1335            version: Version identifier for the code or component
1336            level: Importance level of the span (info, warning, error)
1337            status_message: Optional status message for the span
1338
1339        Example:
1340            ```python
1341            with langfuse.start_as_current_observation(name="process-data") as span:
1342                # Initial processing
1343                result = process_first_part()
1344
1345                # Update with intermediate results
1346                langfuse.update_current_span(metadata={"intermediate_result": result})
1347
1348                # Continue processing
1349                final_result = process_second_part(result)
1350
1351                # Final update
1352                langfuse.update_current_span(output=final_result)
1353            ```
1354        """
1355        if not self._tracing_enabled:
1356            langfuse_logger.debug(
1357                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1358            )
1359            return
1360
1361        current_otel_span = self._get_current_otel_span()
1362
1363        if current_otel_span is not None:
1364            span = LangfuseSpan(
1365                otel_span=current_otel_span,
1366                langfuse_client=self,
1367                environment=self._environment,
1368                release=self._release,
1369            )
1370
1371            if name:
1372                current_otel_span.update_name(name)
1373
1374            span.update(
1375                input=input,
1376                output=output,
1377                metadata=metadata,
1378                version=version,
1379                level=level,
1380                status_message=status_message,
1381            )
1382
1383    @deprecated(
1384        "Trace-level input/output is deprecated. "
1385        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
1386        "This method will be removed in a future major version."
1387    )
1388    def set_current_trace_io(
1389        self,
1390        *,
1391        input: Optional[Any] = None,
1392        output: Optional[Any] = None,
1393    ) -> None:
1394        """Set trace-level input and output for the current span's trace.
1395
1396        .. deprecated::
1397            This is a legacy method for backward compatibility with Langfuse platform
1398            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
1399            evaluators). It will be removed in a future major version.
1400
1401            For setting other trace attributes (user_id, session_id, metadata, tags, version),
1402            use :meth:`propagate_attributes` instead.
1403
1404        Args:
1405            input: Input data to associate with the trace.
1406            output: Output data to associate with the trace.
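
            Example:
                A minimal sketch (assumes `langfuse` is an initialized client and
                `handle`/`request` are your own application logic):
                ```python
                with langfuse.start_as_current_observation(name="handle-request"):
                    result = handle(request)

                    # Mirror the observation's I/O on the trace itself for platform
                    # features that still read trace-level input/output
                    langfuse.set_current_trace_io(input=request, output=result)
                ```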
1407        """
1408        if not self._tracing_enabled:
1409            langfuse_logger.debug(
1410                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
1411            )
1412            return
1413
1414        current_otel_span = self._get_current_otel_span()
1415
1416        if current_otel_span is not None and current_otel_span.is_recording():
1417            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1418                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1419            )
1420            # We need to preserve the class to keep the correct observation type
1421            span_class = self._get_span_class(existing_observation_type)
1422            span = span_class(
1423                otel_span=current_otel_span,
1424                langfuse_client=self,
1425                environment=self._environment,
1426                release=self._release,
1427            )
1428
1429            span.set_trace_io(
1430                input=input,
1431                output=output,
1432            )
1433
1434    def set_current_trace_as_public(self) -> None:
1435        """Make the current trace publicly accessible via its URL.
1436
1437        When a trace is published, anyone with the trace link can view the full trace
1438        without needing to be logged in to Langfuse. This action cannot be undone
1439        programmatically - once published, the entire trace becomes public.
1440
1441        This is a convenience method that publishes the trace from the currently
1442        active span context. Use this when you want to make a trace public from
1443        within a traced function without needing direct access to the span object.
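
            Example:
                A minimal sketch (assumes `langfuse` is an initialized client):
                ```python
                with langfuse.start_as_current_observation(name="support-request"):
                    # ... traced work ...

                    # Publish the trace and fetch a shareable link
                    langfuse.set_current_trace_as_public()
                    share_url = langfuse.get_trace_url()
                ```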
1444        """
1445        if not self._tracing_enabled:
1446            langfuse_logger.debug(
1447                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
1448            )
1449            return
1450
1451        current_otel_span = self._get_current_otel_span()
1452
1453        if current_otel_span is not None and current_otel_span.is_recording():
1454            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1455                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1456            )
1457            # We need to preserve the class to keep the correct observation type
1458            span_class = self._get_span_class(existing_observation_type)
1459            span = span_class(
1460                otel_span=current_otel_span,
1461                langfuse_client=self,
1462                environment=self._environment,
1463            )
1464
1465            span.set_trace_as_public()
1466
1467    def create_event(
1468        self,
1469        *,
1470        trace_context: Optional[TraceContext] = None,
1471        name: str,
1472        input: Optional[Any] = None,
1473        output: Optional[Any] = None,
1474        metadata: Optional[Any] = None,
1475        version: Optional[str] = None,
1476        level: Optional[SpanLevel] = None,
1477        status_message: Optional[str] = None,
1478    ) -> LangfuseEvent:
1479        """Create a new Langfuse observation of type 'EVENT'.
1480
1481        The created Langfuse Event observation will be the child of the current span in the context.
1482
1483        Args:
1484            trace_context: Optional context for connecting to an existing trace
1485            name: Name of the event (e.g., function or operation name)
1486            input: Input data for the operation (can be any JSON-serializable object)
1487            output: Output data from the operation (can be any JSON-serializable object)
1488            metadata: Additional metadata to associate with the event
1489            version: Version identifier for the code or component
1490            level: Importance level of the event (info, warning, error)
1491            status_message: Optional status message for the event
1492
1493        Returns:
1494            The Langfuse Event object
1495
1496        Example:
1497            ```python
1498            event = langfuse.create_event(name="process-event")
1499            ```
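
                Attaching the event to an existing trace via `trace_context`
                (a sketch; the trace ID shown is a placeholder):
                ```python
                event = langfuse.create_event(
                    name="cache-hit",
                    metadata={"key": "user:123"},
                    trace_context={"trace_id": "abcdef1234567890abcdef1234567890"},
                )
                ```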
1500        """
1501        timestamp = time_ns()
1502
1503        if trace_context:
1504            trace_id = trace_context.get("trace_id", None)
1505            parent_span_id = trace_context.get("parent_span_id", None)
1506
1507            if trace_id:
1508                remote_parent_span = self._create_remote_parent_span(
1509                    trace_id=trace_id, parent_span_id=parent_span_id
1510                )
1511
1512                with otel_trace_api.use_span(
1513                    cast(otel_trace_api.Span, remote_parent_span)
1514                ):
1515                    otel_span = self._otel_tracer.start_span(
1516                        name=name, start_time=timestamp
1517                    )
1518                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1519
1520                    return cast(
1521                        LangfuseEvent,
1522                        LangfuseEvent(
1523                            otel_span=otel_span,
1524                            langfuse_client=self,
1525                            environment=self._environment,
1526                            release=self._release,
1527                            input=input,
1528                            output=output,
1529                            metadata=metadata,
1530                            version=version,
1531                            level=level,
1532                            status_message=status_message,
1533                        ).end(end_time=timestamp),
1534                    )
1535
1536        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1537
1538        return cast(
1539            LangfuseEvent,
1540            LangfuseEvent(
1541                otel_span=otel_span,
1542                langfuse_client=self,
1543                environment=self._environment,
1544                release=self._release,
1545                input=input,
1546                output=output,
1547                metadata=metadata,
1548                version=version,
1549                level=level,
1550                status_message=status_message,
1551            ).end(end_time=timestamp),
1552        )
1553
1554    def _create_remote_parent_span(
1555        self, *, trace_id: str, parent_span_id: Optional[str]
1556    ) -> Any:
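            """Create a non-recording OTel parent span for the given trace and span IDs.

            The returned span carries a sampled span context, so spans started under
            it join the existing trace identified by `trace_id`.
            """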
1557        if not self._is_valid_trace_id(trace_id):
1558            langfuse_logger.warning(
1559                f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace ID. Downstream span creation may fail or produce a malformed trace."
1560            )
1561
1562        if parent_span_id and not self._is_valid_span_id(parent_span_id):
1563            langfuse_logger.warning(
1564                f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span ID. Downstream span creation may fail or produce a malformed trace."
1565            )
1566
1567        int_trace_id = int(trace_id, 16)
1568        int_parent_span_id = (
1569            int(parent_span_id, 16)
1570            if parent_span_id
1571            else RandomIdGenerator().generate_span_id()
1572        )
1573
1574        span_context = otel_trace_api.SpanContext(
1575            trace_id=int_trace_id,
1576            span_id=int_parent_span_id,
1577            trace_flags=otel_trace_api.TraceFlags(0x01),  # mark span as sampled
1578            is_remote=False,
1579        )
1580
1581        return otel_trace_api.NonRecordingSpan(span_context)
1582
1583    def _is_valid_trace_id(self, trace_id: str) -> bool:
1584        pattern = r"^[0-9a-f]{32}$"
1585
1586        return bool(re.match(pattern, trace_id))
1587
1588    def _is_valid_span_id(self, span_id: str) -> bool:
1589        pattern = r"^[0-9a-f]{16}$"
1590
1591        return bool(re.match(pattern, span_id))
1592
1593    def _create_observation_id(self, *, seed: Optional[str] = None) -> str:
1594        """Create a unique observation ID for use with Langfuse.
1595
1596        This method generates a unique observation ID (span ID in OpenTelemetry terms)
1597        for use with various Langfuse APIs. It can either generate a random ID or
1598        create a deterministic ID based on a seed string.
1599
1600        Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes.
1601        This method ensures the generated ID meets this requirement. If you need to
1602        correlate an external ID with a Langfuse observation ID, use the external ID as
1603        the seed to get a valid, deterministic observation ID.
1604
1605        Args:
1606            seed: Optional string to use as a seed for deterministic ID generation.
1607                 If provided, the same seed will always produce the same ID.
1608                 If not provided, a random ID will be generated.
1609
1610        Returns:
1611            A 16-character lowercase hexadecimal string representing the observation ID.
1612
1613        Example:
1614            ```python
1615            # Generate a random observation ID
1616            obs_id = langfuse._create_observation_id()
1617
1618            # Generate a deterministic ID based on a seed
1619            user_obs_id = langfuse._create_observation_id(seed="user-123-feedback")
1620
1621            # Correlate an external item ID with a Langfuse observation ID
1622            item_id = "item-789012"
1623            correlated_obs_id = langfuse._create_observation_id(seed=item_id)
1624
1625            # Use the ID with Langfuse APIs
1626            langfuse.create_score(
1627                name="relevance",
1628                value=0.95,
1629                trace_id=trace_id,
1630                observation_id=obs_id
1631            )
1632            ```
1633        """
1634        if not seed:
1635            span_id_int = RandomIdGenerator().generate_span_id()
1636
1637            return self._format_otel_span_id(span_id_int)
1638
1639        return sha256(seed.encode("utf-8")).digest()[:8].hex()
1640
1641    @staticmethod
1642    def create_trace_id(*, seed: Optional[str] = None) -> str:
1643        """Create a unique trace ID for use with Langfuse.
1644
1645        This method generates a unique trace ID for use with various Langfuse APIs.
1646        It can either generate a random ID or create a deterministic ID based on
1647        a seed string.
1648
1649        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1650        This method ensures the generated ID meets this requirement. If you need to
1651        correlate an external ID with a Langfuse trace ID, use the external ID as the
1652        seed to get a valid, deterministic Langfuse trace ID.
1653
1654        Args:
1655            seed: Optional string to use as a seed for deterministic ID generation.
1656                 If provided, the same seed will always produce the same ID.
1657                 If not provided, a random ID will be generated.
1658
1659        Returns:
1660            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1661
1662        Example:
1663            ```python
1664            # Generate a random trace ID
1665            trace_id = langfuse.create_trace_id()
1666
1667            # Generate a deterministic ID based on a seed
1668            session_trace_id = langfuse.create_trace_id(seed="session-456")
1669
1670            # Correlate an external ID with a Langfuse trace ID
1671            external_id = "external-system-123456"
1672            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1673
1674            # Use the ID with trace context
1675            with langfuse.start_as_current_observation(
1676                name="process-request",
1677                trace_context={"trace_id": trace_id}
1678            ) as span:
1679                # Operation will be part of the specific trace
1680                pass
1681            ```
1682        """
1683        if not seed:
1684            trace_id_int = RandomIdGenerator().generate_trace_id()
1685
1686            return Langfuse._format_otel_trace_id(trace_id_int)
1687
1688        return sha256(seed.encode("utf-8")).digest()[:16].hex()
1689
1690    def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str:
1691        span_context = otel_span.get_span_context()
1692
1693        return self._format_otel_trace_id(span_context.trace_id)
1694
1695    def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str:
1696        span_context = otel_span.get_span_context()
1697
1698        return self._format_otel_span_id(span_context.span_id)
1699
1700    @staticmethod
1701    def _format_otel_span_id(span_id_int: int) -> str:
1702        """Format an integer span ID to a 16-character lowercase hex string.
1703
1704        Internal method to convert an OpenTelemetry integer span ID to the standard
1705        W3C Trace Context format (16-character lowercase hex string).
1706
1707        Args:
1708            span_id_int: 64-bit integer representing a span ID
1709
1710        Returns:
1711            A 16-character lowercase hexadecimal string
1712        """
1713        return format(span_id_int, "016x")
1714
1715    @staticmethod
1716    def _format_otel_trace_id(trace_id_int: int) -> str:
1717        """Format an integer trace ID to a 32-character lowercase hex string.
1718
1719        Internal method to convert an OpenTelemetry integer trace ID to the standard
1720        W3C Trace Context format (32-character lowercase hex string).
1721
1722        Args:
1723            trace_id_int: 128-bit integer representing a trace ID
1724
1725        Returns:
1726            A 32-character lowercase hexadecimal string
1727        """
1728        return format(trace_id_int, "032x")
1729
1730    @overload
1731    def create_score(
1732        self,
1733        *,
1734        name: str,
1735        value: float,
1736        session_id: Optional[str] = None,
1737        dataset_run_id: Optional[str] = None,
1738        trace_id: Optional[str] = None,
1739        observation_id: Optional[str] = None,
1740        score_id: Optional[str] = None,
1741        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1742        comment: Optional[str] = None,
1743        config_id: Optional[str] = None,
1744        metadata: Optional[Any] = None,
1745        timestamp: Optional[datetime] = None,
1746    ) -> None: ...
1747
1748    @overload
1749    def create_score(
1750        self,
1751        *,
1752        name: str,
1753        value: str,
1754        session_id: Optional[str] = None,
1755        dataset_run_id: Optional[str] = None,
1756        trace_id: Optional[str] = None,
1757        score_id: Optional[str] = None,
1758        observation_id: Optional[str] = None,
1759        data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
1760        comment: Optional[str] = None,
1761        config_id: Optional[str] = None,
1762        metadata: Optional[Any] = None,
1763        timestamp: Optional[datetime] = None,
1764    ) -> None: ...
1765
1766    def create_score(
1767        self,
1768        *,
1769        name: str,
1770        value: Union[float, str],
1771        session_id: Optional[str] = None,
1772        dataset_run_id: Optional[str] = None,
1773        trace_id: Optional[str] = None,
1774        observation_id: Optional[str] = None,
1775        score_id: Optional[str] = None,
1776        data_type: Optional[ScoreDataType] = None,
1777        comment: Optional[str] = None,
1778        config_id: Optional[str] = None,
1779        metadata: Optional[Any] = None,
1780        timestamp: Optional[datetime] = None,
1781    ) -> None:
1782        """Create a score for a specific trace or observation.
1783
1784        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
1785        used to track quality metrics, user feedback, or automated evaluations.
1786
1787        Args:
1788            name: Name of the score (e.g., "relevance", "accuracy")
1789            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
1790            session_id: ID of the Langfuse session to associate the score with
1791            dataset_run_id: ID of the Langfuse dataset run to associate the score with
1792            trace_id: ID of the Langfuse trace to associate the score with
1793            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
1794            score_id: Optional custom ID for the score (auto-generated if not provided)
1795            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
1796            comment: Optional comment or explanation for the score
1797            config_id: Optional ID of a score config defined in Langfuse
1798            metadata: Optional metadata to be attached to the score
1799            timestamp: Optional timestamp for the score (defaults to current UTC time)
1800
1801        Example:
1802            ```python
1803            # Create a numeric score for accuracy
1804            langfuse.create_score(
1805                name="accuracy",
1806                value=0.92,
1807                trace_id="abcdef1234567890abcdef1234567890",
1808                data_type="NUMERIC",
1809                comment="High accuracy with minor irrelevant details"
1810            )
1811
1812            # Create a categorical score for sentiment
1813            langfuse.create_score(
1814                name="sentiment",
1815                value="positive",
1816                trace_id="abcdef1234567890abcdef1234567890",
1817                observation_id="abcdef1234567890",
1818                data_type="CATEGORICAL"
1819            )
1820            ```
1821        """
1822        if not self._tracing_enabled:
1823            return
1824
1825        score_id = score_id or self._create_observation_id()
1826
1827        try:
1828            new_body = ScoreBody(
1829                id=score_id,
1830                sessionId=session_id,
1831                datasetRunId=dataset_run_id,
1832                traceId=trace_id,
1833                observationId=observation_id,
1834                name=name,
1835                value=value,
1836                dataType=data_type,  # type: ignore
1837                comment=comment,
1838                configId=config_id,
1839                environment=self._environment,
1840                metadata=metadata,
1841            )
1842
1843            event = {
1844                "id": self.create_trace_id(),
1845                "type": "score-create",
1846                "timestamp": timestamp or _get_timestamp(),
1847                "body": new_body,
1848            }
1849
1850            if self._resources is not None:
1851                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
1852                force_sample = (
1853                    not self._is_valid_trace_id(trace_id) if trace_id else True
1854                )
1855
1856                self._resources.add_score_task(
1857                    event,
1858                    force_sample=force_sample,
1859                )
1860
1861        except Exception as e:
1862            langfuse_logger.exception(
1863                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
1864            )
1865
1866    def _create_trace_tags_via_ingestion(
1867        self,
1868        *,
1869        trace_id: str,
1870        tags: List[str],
1871    ) -> None:
1872        """Private helper to enqueue trace tag updates via ingestion API events."""
1873        if not self._tracing_enabled:
1874            return
1875
1876        if len(tags) == 0:
1877            return
1878
1879        try:
1880            new_body = TraceBody(
1881                id=trace_id,
1882                tags=tags,
1883            )
1884
1885            event = {
1886                "id": self.create_trace_id(),
1887                "type": "trace-create",
1888                "timestamp": _get_timestamp(),
1889                "body": new_body,
1890            }
1891
1892            if self._resources is not None:
1893                self._resources.add_trace_task(event)
1894        except Exception as e:
1895            langfuse_logger.exception(
1896                f"Error updating trace tags: Failed to process trace update event for trace_id={trace_id}. Error: {e}"
1897            )
1898
1899    @overload
1900    def score_current_span(
1901        self,
1902        *,
1903        name: str,
1904        value: float,
1905        score_id: Optional[str] = None,
1906        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1907        comment: Optional[str] = None,
1908        config_id: Optional[str] = None,
1909        metadata: Optional[Any] = None,
1910    ) -> None: ...
1911
1912    @overload
1913    def score_current_span(
1914        self,
1915        *,
1916        name: str,
1917        value: str,
1918        score_id: Optional[str] = None,
1919        data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
1920        comment: Optional[str] = None,
1921        config_id: Optional[str] = None,
1922        metadata: Optional[Any] = None,
1923    ) -> None: ...
1924
1925    def score_current_span(
1926        self,
1927        *,
1928        name: str,
1929        value: Union[float, str],
1930        score_id: Optional[str] = None,
1931        data_type: Optional[ScoreDataType] = None,
1932        comment: Optional[str] = None,
1933        config_id: Optional[str] = None,
1934        metadata: Optional[Any] = None,
1935    ) -> None:
1936        """Create a score for the current active span.
1937
1938        This method scores the currently active span in the context. It's a convenient
1939        way to score the current operation without needing to know its trace and span IDs.
1940
1941        Args:
1942            name: Name of the score (e.g., "relevance", "accuracy")
1943            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
1944            score_id: Optional custom ID for the score (auto-generated if not provided)
1945            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
1946            comment: Optional comment or explanation for the score
1947            config_id: Optional ID of a score config defined in Langfuse
1948            metadata: Optional metadata to be attached to the score
1949
1950        Example:
1951            ```python
1952            with langfuse.start_as_current_generation(name="answer-query") as generation:
1953                # Generate answer
1954                response = generate_answer(...)
1955                generation.update(output=response)
1956
1957                # Score the generation
1958                langfuse.score_current_span(
1959                    name="relevance",
1960                    value=0.85,
1961                    data_type="NUMERIC",
1962                    comment="Mostly relevant but contains some tangential information",
1963                    metadata={"model": "gpt-4", "prompt_version": "v2"}
1964                )
1965            ```
1966        """
1967        current_span = self._get_current_otel_span()
1968
1969        if current_span is not None:
1970            trace_id = self._get_otel_trace_id(current_span)
1971            observation_id = self._get_otel_span_id(current_span)
1972
1973            langfuse_logger.info(
1974                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
1975            )
1976
1977            self.create_score(
1978                trace_id=trace_id,
1979                observation_id=observation_id,
1980                name=name,
1981                value=cast(str, value),
1982                score_id=score_id,
1983                data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
1984                comment=comment,
1985                config_id=config_id,
1986                metadata=metadata,
1987            )
1988
1989    @overload
1990    def score_current_trace(
1991        self,
1992        *,
1993        name: str,
1994        value: float,
1995        score_id: Optional[str] = None,
1996        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1997        comment: Optional[str] = None,
1998        config_id: Optional[str] = None,
1999        metadata: Optional[Any] = None,
2000    ) -> None: ...
2001
2002    @overload
2003    def score_current_trace(
2004        self,
2005        *,
2006        name: str,
2007        value: str,
2008        score_id: Optional[str] = None,
2009        data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
2010        comment: Optional[str] = None,
2011        config_id: Optional[str] = None,
2012        metadata: Optional[Any] = None,
2013    ) -> None: ...
2014
2015    def score_current_trace(
2016        self,
2017        *,
2018        name: str,
2019        value: Union[float, str],
2020        score_id: Optional[str] = None,
2021        data_type: Optional[ScoreDataType] = None,
2022        comment: Optional[str] = None,
2023        config_id: Optional[str] = None,
2024        metadata: Optional[Any] = None,
2025    ) -> None:
2026        """Create a score for the current trace.
2027
2028        This method scores the trace of the currently active span. Unlike score_current_span,
2029        this method associates the score with the entire trace rather than a specific span.
2030        It's useful for scoring overall performance or quality of the entire operation.
2031
2032        Args:
2033            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2034            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
2035            score_id: Optional custom ID for the score (auto-generated if not provided)
2036            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
2037            comment: Optional comment or explanation for the score
2038            config_id: Optional ID of a score config defined in Langfuse
2039            metadata: Optional metadata to be attached to the score
2040
2041        Example:
2042            ```python
2043            with langfuse.start_as_current_observation(name="process-user-request") as span:
2044                # Process request
2045                result = process_complete_request()
2046                span.update(output=result)
2047
2048                # Score the overall trace
2049                langfuse.score_current_trace(
2050                    name="overall_quality",
2051                    value=0.95,
2052                    data_type="NUMERIC",
2053                    comment="High quality end-to-end response",
2054                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2055                )
2056            ```
2057        """
2058        current_span = self._get_current_otel_span()
2059
2060        if current_span is not None:
2061            trace_id = self._get_otel_trace_id(current_span)
2062
2063            langfuse_logger.info(
2064                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2065            )
2066
2067            self.create_score(
2068                trace_id=trace_id,
2069                name=name,
2070                value=cast(str, value),
2071                score_id=score_id,
2072                data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
2073                comment=comment,
2074                config_id=config_id,
2075                metadata=metadata,
2076            )
2077
2078    def flush(self) -> None:
2079        """Force flush all pending spans and events to the Langfuse API.
2080
2081        This method manually flushes any pending spans, scores, and other events to the
2082        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2083        before proceeding, without waiting for the automatic flush interval.
2084
2085        Example:
2086            ```python
2087            # Record some spans and scores
2088            with langfuse.start_as_current_observation(name="operation") as span:
2089                # Do work...
2090                pass
2091
2092            # Ensure all data is sent to Langfuse before proceeding
2093            langfuse.flush()
2094
2095            # Continue with other work
2096            ```
2097        """
2098        if self._resources is not None:
2099            self._resources.flush()
2100
2101    def shutdown(self) -> None:
2102        """Shut down the Langfuse client and flush all pending data.
2103
2104        This method cleanly shuts down the Langfuse client, ensuring all pending data
2105        is flushed to the API and all background threads are properly terminated.
2106
2107        It's important to call this method when your application is shutting down to
2108        prevent data loss and resource leaks. For most applications, using the client
2109        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2110
2111        Example:
2112            ```python
2113            # Initialize Langfuse
2114            langfuse = Langfuse(public_key="...", secret_key="...")
2115
2116            # Use Langfuse throughout your application
2117            # ...
2118
2119            # When application is shutting down
2120            langfuse.shutdown()
2121            ```
2122        """
2123        if self._resources is not None:
2124            self._resources.shutdown()
2125
2126    def get_current_trace_id(self) -> Optional[str]:
2127        """Get the trace ID of the current active span.
2128
2129        This method retrieves the trace ID from the currently active span in the context.
2130        It can be used to get the trace ID for referencing in logs, external systems,
2131        or for creating related operations.
2132
2133        Returns:
2134            The current trace ID as a 32-character lowercase hexadecimal string,
2135            or None if there is no active span.
2136
2137        Example:
2138            ```python
2139            with langfuse.start_as_current_observation(name="process-request") as span:
2140                # Get the current trace ID for reference
2141                trace_id = langfuse.get_current_trace_id()
2142
2143                # Use it for external correlation
2144                log.info(f"Processing request with trace_id: {trace_id}")
2145
2146                # Or pass to another system
2147                external_system.process(data, trace_id=trace_id)
2148            ```
2149        """
2150        if not self._tracing_enabled:
2151            langfuse_logger.debug(
2152                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2153            )
2154            return None
2155
2156        current_otel_span = self._get_current_otel_span()
2157
2158        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None
2159
2160    def get_current_observation_id(self) -> Optional[str]:
2161        """Get the observation ID (span ID) of the current active span.
2162
2163        This method retrieves the observation ID from the currently active span in the context.
2164        It can be used to get the observation ID for referencing in logs, external systems,
2165        or for creating scores or other related operations.
2166
2167        Returns:
2168            The current observation ID as a 16-character lowercase hexadecimal string,
2169            or None if there is no active span.
2170
2171        Example:
2172            ```python
2173            with langfuse.start_as_current_observation(name="process-user-query") as span:
2174                # Get the current observation ID
2175                observation_id = langfuse.get_current_observation_id()
2176
2177                # Store it for later reference
2178                cache.set(f"query_{query_id}_observation", observation_id)
2179
2180                # Process the query...
2181            ```
2182        """
2183        if not self._tracing_enabled:
2184            langfuse_logger.debug(
2185                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2186            )
2187            return None
2188
2189        current_otel_span = self._get_current_otel_span()
2190
2191        return self._get_otel_span_id(current_otel_span) if current_otel_span else None
2192
2193    def _get_project_id(self) -> Optional[str]:
2194        """Fetch and return the current project ID, caching it across requests. Returns None if no project is found for the configured API keys."""
2195        if not self._project_id:
2196            proj = self.api.projects.get()
2197            if not proj.data or not proj.data[0].id:
2198                return None
2199
2200            self._project_id = proj.data[0].id
2201
2202        return self._project_id
2203
2204    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2205        """Get the URL to view a trace in the Langfuse UI.
2206
2207        This method generates a URL that links directly to a trace in the Langfuse UI.
2208        It's useful for providing links in logs, notifications, or debugging tools.
2209
2210        Args:
2211            trace_id: Optional trace ID to generate a URL for. If not provided,
2212                     the trace ID of the current active span will be used.
2213
2214        Returns:
2215            A URL string pointing to the trace in the Langfuse UI,
2216            or None if the project ID couldn't be retrieved or no trace ID is available.
2217
2218        Example:
2219            ```python
2220            # Get URL for the current trace
2221            with langfuse.start_as_current_observation(name="process-request") as span:
2222                trace_url = langfuse.get_trace_url()
2223                log.info(f"Processing trace: {trace_url}")
2224
2225            # Get URL for a specific trace
2226            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2227            send_notification(f"Review needed for trace: {specific_trace_url}")
2228            ```
2229        """
2230        final_trace_id = trace_id or self.get_current_trace_id()
2231        if not final_trace_id:
2232            return None
2233
2234        project_id = self._get_project_id()
2235
2236        return (
2237            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2238            if project_id and final_trace_id
2239            else None
2240        )
2241
2242    def get_dataset(
2243        self,
2244        name: str,
2245        *,
2246        fetch_items_page_size: Optional[int] = 50,
2247        version: Optional[datetime] = None,
2248    ) -> "DatasetClient":
2249        """Fetch a dataset by its name.
2250
2251        Args:
2252            name (str): The name of the dataset to fetch.
2253            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2254            version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC).
2255                If provided, returns the state of items at the specified UTC timestamp.
2256                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
2257
2258        Returns:
2259            DatasetClient: The dataset with the given name.
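
            Example:
                A minimal sketch (assumes a dataset named "my-eval-dataset" exists
                and that its items expose `input`/`expected_output`):
                ```python
                dataset = langfuse.get_dataset("my-eval-dataset")

                for item in dataset.items:
                    print(item.input, item.expected_output)
                ```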
2260        """
2261        try:
2262            langfuse_logger.debug(f"Getting dataset {name}")
2263            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2264
2265            dataset_items = []
2266            page = 1
2267
2268            while True:
2269                new_items = self.api.dataset_items.list(
2270                    dataset_name=self._url_encode(name, is_url_param=True),
2271                    page=page,
2272                    limit=fetch_items_page_size,
2273                    version=version,
2274                )
2275                dataset_items.extend(new_items.data)
2276
2277                if new_items.meta.total_pages <= page:
2278                    break
2279
2280                page += 1
2281
2282            return DatasetClient(
2283                dataset=dataset,
2284                items=dataset_items,
2285                version=version,
2286                langfuse_client=self,
2287            )
2288
2289        except Error as e:
2290            handle_fern_exception(e)
2291            raise e
2292
2293    def get_dataset_run(
2294        self, *, dataset_name: str, run_name: str
2295    ) -> DatasetRunWithItems:
2296        """Fetch a dataset run by dataset name and run name.
2297
2298        Args:
2299            dataset_name (str): The name of the dataset.
2300            run_name (str): The name of the run.
2301
2302        Returns:
2303            DatasetRunWithItems: The dataset run with its items.
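
            Example:
                A minimal sketch (dataset and run names are placeholders):
                ```python
                run = langfuse.get_dataset_run(
                    dataset_name="my-eval-dataset",
                    run_name="run-2024-01-01",
                )
                ```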
2304        """
2305        try:
2306            return cast(
2307                DatasetRunWithItems,
2308                self.api.datasets.get_run(
2309                    dataset_name=self._url_encode(dataset_name),
2310                    run_name=self._url_encode(run_name),
2311                    request_options=None,
2312                ),
2313            )
2314        except Error as e:
2315            handle_fern_exception(e)
2316            raise e
2317
2318    def get_dataset_runs(
2319        self,
2320        *,
2321        dataset_name: str,
2322        page: Optional[int] = None,
2323        limit: Optional[int] = None,
2324    ) -> PaginatedDatasetRuns:
2325        """Fetch all runs for a dataset.
2326
2327        Args:
2328            dataset_name (str): The name of the dataset.
2329            page (Optional[int]): Page number, starts at 1.
2330            limit (Optional[int]): Limit of items per page.
2331
2332        Returns:
2333            PaginatedDatasetRuns: Paginated list of dataset runs.
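
            Example:
                A minimal sketch (assumes the paginated response exposes `.data`,
                analogous to the dataset items endpoint):
                ```python
                runs = langfuse.get_dataset_runs(dataset_name="my-eval-dataset", limit=10)

                for run in runs.data:
                    print(run.name)
                ```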
2334        """
2335        try:
2336            return cast(
2337                PaginatedDatasetRuns,
2338                self.api.datasets.get_runs(
2339                    dataset_name=self._url_encode(dataset_name),
2340                    page=page,
2341                    limit=limit,
2342                    request_options=None,
2343                ),
2344            )
2345        except Error as e:
2346            handle_fern_exception(e)
2347            raise e
2348
2349    def delete_dataset_run(
2350        self, *, dataset_name: str, run_name: str
2351    ) -> DeleteDatasetRunResponse:
2352        """Delete a dataset run and all its run items. This action is irreversible.
2353
2354        Args:
2355            dataset_name (str): The name of the dataset.
2356            run_name (str): The name of the run.
2357
2358        Returns:
2359            DeleteDatasetRunResponse: Confirmation of deletion.
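
            Example:
                A minimal sketch (names are placeholders; deletion cannot be undone):
                ```python
                langfuse.delete_dataset_run(
                    dataset_name="my-eval-dataset",
                    run_name="stale-run",
                )
                ```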
2360        """
2361        try:
2362            return cast(
2363                DeleteDatasetRunResponse,
2364                self.api.datasets.delete_run(
2365                    dataset_name=self._url_encode(dataset_name),
2366                    run_name=self._url_encode(run_name),
2367                    request_options=None,
2368                ),
2369            )
2370        except Error as e:
2371            handle_fern_exception(e)
2372            raise e
2373
2374    def run_experiment(
2375        self,
2376        *,
2377        name: str,
2378        run_name: Optional[str] = None,
2379        description: Optional[str] = None,
2380        data: ExperimentData,
2381        task: TaskFunction,
2382        evaluators: Optional[List[EvaluatorFunction]] = None,
2383        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2384        run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
2385        max_concurrency: int = 50,
2386        metadata: Optional[Dict[str, str]] = None,
2387        _dataset_version: Optional[datetime] = None,
2388    ) -> ExperimentResult:
2389        """Run an experiment on a dataset with automatic tracing and evaluation.
2390
2391        This method executes a task function on each item in the provided dataset,
2392        automatically traces all executions with Langfuse for observability, runs
2393        item-level and run-level evaluators on the outputs, and returns comprehensive
2394        results with evaluation metrics.
2395
2396        The experiment system provides:
2397        - Automatic tracing of all task executions
2398        - Concurrent processing with configurable limits
2399        - Comprehensive error handling that isolates failures
2400        - Integration with Langfuse datasets for experiment tracking
2401        - Flexible evaluation framework supporting both sync and async evaluators
2402
2403        Args:
2404            name: Human-readable name for the experiment. Used for identification
2405                in the Langfuse UI.
2406            run_name: Optional exact name for the experiment run. If provided and `data`
2407                contains Langfuse dataset items, it is used verbatim as the dataset run name.
2408                If not provided, it defaults to the experiment name with an ISO timestamp appended.
2409            description: Optional description explaining the experiment's purpose,
2410                methodology, or expected outcomes.
2411            data: Array of data items to process. Can be either:
2412                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2413                - List of Langfuse DatasetItem objects from dataset.items
2414            task: Function that processes each data item and returns output.
2415                Must accept 'item' as keyword argument and can return sync or async results.
2416                The task function signature should be: task(*, item, **kwargs) -> Any
2417            evaluators: List of functions to evaluate each item's output individually.
2418                Each evaluator receives input, output, expected_output, and metadata.
2419                Can return single Evaluation dict or list of Evaluation dicts.
2420            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2421                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2422                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2423                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics (see the composite-evaluator sketch in the Examples below).
2424            run_evaluators: List of functions to evaluate the entire experiment run.
2425                Each run evaluator receives all item_results and can compute aggregate metrics.
2426                Useful for calculating averages, distributions, or cross-item comparisons.
2427            max_concurrency: Maximum number of concurrent task executions (default: 50).
2428                Controls the number of items processed simultaneously. Adjust based on
2429                API rate limits and system resources.
2430            metadata: Optional metadata dictionary to attach to all experiment traces.
2431                This metadata will be included in every trace created during the experiment.
2432                If `data` consists of Langfuse dataset items, the metadata is also attached to the dataset run.
2433
2434        Returns:
2435            ExperimentResult containing:
2436            - run_name: The experiment run name. This equals the dataset run name if the experiment ran on a Langfuse dataset.
2437            - item_results: List of results for each processed item with outputs and evaluations
2438            - run_evaluations: List of aggregate evaluation results for the entire run
2439            - experiment_id: Stable identifier for the experiment run across all items
2440            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2441            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2442
2443        Raises:
2444            ValueError: If required parameters are missing or invalid
2445            Exception: If experiment setup fails (individual item failures are handled gracefully)
2446
2447        Examples:
2448            Basic experiment with local data:
2449            ```python
2450            def summarize_text(*, item, **kwargs):
2451                return f"Summary: {item['input'][:50]}..."
2452
2453            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2454                return {
2455                    "name": "output_length",
2456                    "value": len(output),
2457                    "comment": f"Output contains {len(output)} characters"
2458                }
2459
2460            result = langfuse.run_experiment(
2461                name="Text Summarization Test",
2462                description="Evaluate summarization quality and length",
2463                data=[
2464                    {"input": "Long article text...", "expected_output": "Expected summary"},
2465                    {"input": "Another article...", "expected_output": "Another summary"}
2466                ],
2467                task=summarize_text,
2468                evaluators=[length_evaluator]
2469            )
2470
2471            print(f"Processed {len(result.item_results)} items")
2472            for item_result in result.item_results:
2473                print(f"Input: {item_result.item['input']}")
2474                print(f"Output: {item_result.output}")
2475                print(f"Evaluations: {item_result.evaluations}")
2476            ```
2477
2478            Advanced experiment with async task and multiple evaluators:
2479            ```python
2480            async def llm_task(*, item, **kwargs):
2481                # Simulate async LLM call
2482                response = await openai_client.chat.completions.create(
2483                    model="gpt-4",
2484                    messages=[{"role": "user", "content": item["input"]}]
2485                )
2486                return response.choices[0].message.content
2487
2488            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2489                if expected_output and expected_output.lower() in output.lower():
2490                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2491                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2492
2493            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2494                # Simulate toxicity check
2495                toxicity_score = check_toxicity(output)  # Your toxicity checker
2496                return {
2497                    "name": "toxicity",
2498                    "value": toxicity_score,
2499                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2500                }
2501
2502            def average_accuracy(*, item_results, **kwargs):
2503                accuracies = [
2504                    ev.value for result in item_results
2505                    for ev in result.evaluations
2506                    if ev.name == "accuracy"
2507                ]
2508                return {
2509                    "name": "average_accuracy",
2510                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2511                    "comment": f"Average accuracy across {len(accuracies)} items"
2512                }
2513
2514            result = langfuse.run_experiment(
2515                name="LLM Safety and Accuracy Test",
2516                description="Evaluate model accuracy and safety across diverse prompts",
2517                data=test_dataset,  # Your dataset items
2518                task=llm_task,
2519                evaluators=[accuracy_evaluator, toxicity_evaluator],
2520                run_evaluators=[average_accuracy],
2521                max_concurrency=5,  # Limit concurrent API calls
2522                metadata={"model": "gpt-4", "temperature": 0.7}
2523            )
2524            ```
2525
2526            Using with Langfuse datasets:
2527            ```python
2528            # Get dataset from Langfuse
2529            dataset = langfuse.get_dataset("my-eval-dataset")
2530
2531            result = dataset.run_experiment(
2532                name="Production Model Evaluation",
2533                description="Monthly evaluation of production model performance",
2534                task=my_production_task,
2535                evaluators=[accuracy_evaluator, latency_evaluator]
2536            )
2537
2538            # Results automatically linked to dataset in Langfuse UI
2539            print(f"View results: {result.dataset_run_url}")
2540            ```
2541
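            Combining item-level scores with a composite evaluator (a sketch;
            the `evaluations` keyword and the attribute access on evaluation
            objects mirror the run-evaluator example above and may differ):
            ```python
            def weighted_quality(*, input, output, expected_output=None, evaluations=None, **kwargs):
                scores = {e.name: e.value for e in (evaluations or [])}
                value = 0.7 * scores.get("accuracy", 0.0) + 0.3 * (1.0 - scores.get("toxicity", 0.0))
                return {"name": "weighted_quality", "value": value}

            result = langfuse.run_experiment(
                name="Composite Scoring Test",
                data=test_dataset,
                task=llm_task,
                evaluators=[accuracy_evaluator, toxicity_evaluator],
                composite_evaluator=weighted_quality,
            )
            ```
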
2542        Note:
2543            - Task and evaluator functions can be either synchronous or asynchronous
2544            - Individual item failures are logged but don't stop the experiment
2545            - All executions are automatically traced and visible in Langfuse UI
2546            - When using Langfuse datasets, results are automatically linked for easy comparison
2547            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2548            - Async execution is handled automatically with smart event loop detection
2549        """
2550        return cast(
2551            ExperimentResult,
2552            run_async_safely(
2553                self._run_experiment_async(
2554                    name=name,
2555                    run_name=self._create_experiment_run_name(
2556                        name=name, run_name=run_name
2557                    ),
2558                    description=description,
2559                    data=data,
2560                    task=task,
2561                    evaluators=evaluators or [],
2562                    composite_evaluator=composite_evaluator,
2563                    run_evaluators=run_evaluators or [],
2564                    max_concurrency=max_concurrency,
2565                    metadata=metadata,
2566                    dataset_version=_dataset_version,
2567                ),
2568            ),
2569        )
2570
2571    async def _run_experiment_async(
2572        self,
2573        *,
2574        name: str,
2575        run_name: str,
2576        description: Optional[str],
2577        data: ExperimentData,
2578        task: TaskFunction,
2579        evaluators: List[EvaluatorFunction],
2580        composite_evaluator: Optional[CompositeEvaluatorFunction],
2581        run_evaluators: List[RunEvaluatorFunction],
2582        max_concurrency: int,
2583        metadata: Optional[Dict[str, Any]] = None,
2584        dataset_version: Optional[datetime] = None,
2585    ) -> ExperimentResult:
2586        langfuse_logger.debug(
2587            f"Starting experiment '{name}' run '{run_name}' with {len(data)} items"
2588        )
2589
2590        shared_fallback_experiment_id = self._create_observation_id()
2591
2592        # Set up concurrency control
2593        semaphore = asyncio.Semaphore(max_concurrency)
2594
2595        # Process all items
2596        async def process_item(item: ExperimentItem) -> ExperimentItemResult:
2597            async with semaphore:
2598                return await self._process_experiment_item(
2599                    item,
2600                    task,
2601                    evaluators,
2602                    composite_evaluator,
2603                    shared_fallback_experiment_id,
2604                    name,
2605                    run_name,
2606                    description,
2607                    metadata,
2608                    dataset_version,
2609                )
2610
2611        # Run all items concurrently
2612        tasks = [process_item(item) for item in data]
2613        item_results = await asyncio.gather(*tasks, return_exceptions=True)
2614
2615        # Filter out any exceptions and log errors
2616        valid_results: List[ExperimentItemResult] = []
2617        for i, result in enumerate(item_results):
2618            if isinstance(result, Exception):
2619                langfuse_logger.error(f"Item {i} failed: {result}")
2620            elif isinstance(result, ExperimentItemResult):
2621                valid_results.append(result)  # type: ignore
2622
2623        # Run experiment-level evaluators
2624        run_evaluations: List[Evaluation] = []
2625        for run_evaluator in run_evaluators:
2626            try:
2627                evaluations = await _run_evaluator(
2628                    run_evaluator, item_results=valid_results
2629                )
2630                run_evaluations.extend(evaluations)
2631            except Exception as e:
2632                langfuse_logger.error(f"Run evaluator failed: {e}")
2633
2634        # Generate dataset run URL if applicable
2635        dataset_run_id = next(
2636            (
2637                result.dataset_run_id
2638                for result in valid_results
2639                if result.dataset_run_id
2640            ),
2641            None,
2642        )
2643        dataset_run_url = None
2644        if dataset_run_id and data:
2645            try:
2646                # Check if the first item has dataset_id (for DatasetItem objects)
2647                first_item = data[0]
2648                dataset_id = None
2649
2650                if hasattr(first_item, "dataset_id"):
2651                    dataset_id = getattr(first_item, "dataset_id", None)
2652
2653                if dataset_id:
2654                    project_id = self._get_project_id()
2655
2656                    if project_id:
2657                        dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}"
2658
2659            except Exception:
2660                pass  # URL generation is optional
2661
2662        # Store run-level evaluations as scores
2663        for evaluation in run_evaluations:
2664            try:
2665                if dataset_run_id:
2666                    self.create_score(
2667                        dataset_run_id=dataset_run_id,
2668                        name=evaluation.name or "<unknown>",
2669                        value=evaluation.value,  # type: ignore
2670                        comment=evaluation.comment,
2671                        metadata=evaluation.metadata,
2672                        data_type=evaluation.data_type,  # type: ignore
2673                        config_id=evaluation.config_id,
2674                    )
2675
2676            except Exception as e:
2677                langfuse_logger.error(f"Failed to store run evaluation: {e}")
2678
2679        # Flush scores and traces
2680        self.flush()
2681
2682        return ExperimentResult(
2683            name=name,
2684            run_name=run_name,
2685            description=description,
2686            item_results=valid_results,
2687            run_evaluations=run_evaluations,
2688            experiment_id=dataset_run_id or shared_fallback_experiment_id,
2689            dataset_run_id=dataset_run_id,
2690            dataset_run_url=dataset_run_url,
2691        )
2692
2693    async def _process_experiment_item(
2694        self,
2695        item: ExperimentItem,
2696        task: Callable,
2697        evaluators: List[Callable],
2698        composite_evaluator: Optional[CompositeEvaluatorFunction],
2699        fallback_experiment_id: str,
2700        experiment_name: str,
2701        experiment_run_name: str,
2702        experiment_description: Optional[str],
2703        experiment_metadata: Optional[Dict[str, Any]] = None,
2704        dataset_version: Optional[datetime] = None,
2705    ) -> ExperimentItemResult:
2706        span_name = "experiment-item-run"
2707
2708        with self.start_as_current_observation(name=span_name) as span:
2709            try:
2710                input_data = (
2711                    item.get("input")
2712                    if isinstance(item, dict)
2713                    else getattr(item, "input", None)
2714                )
2715
2716                if input_data is None:
2717                    raise ValueError("Experiment Item is missing input. Skipping item.")
2718
2719                expected_output = (
2720                    item.get("expected_output")
2721                    if isinstance(item, dict)
2722                    else getattr(item, "expected_output", None)
2723                )
2724
2725                item_metadata = (
2726                    item.get("metadata")
2727                    if isinstance(item, dict)
2728                    else getattr(item, "metadata", None)
2729                )
2730
2731                final_observation_metadata = {
2732                    "experiment_name": experiment_name,
2733                    "experiment_run_name": experiment_run_name,
2734                    **(experiment_metadata or {}),
2735                }
2736
2737                trace_id = span.trace_id
2738                dataset_id = None
2739                dataset_item_id = None
2740                dataset_run_id = None
2741
2742                # Link to dataset run if this is a dataset item
2743                if hasattr(item, "id") and hasattr(item, "dataset_id"):
2744                    try:
2745                        # Use sync API to avoid event loop issues when run_async_safely
2746                        # creates multiple event loops across different threads
2747                        dataset_run_item = await asyncio.to_thread(
2748                            self.api.dataset_run_items.create,
2749                            run_name=experiment_run_name,
2750                            run_description=experiment_description,
2751                            metadata=experiment_metadata,
2752                            dataset_item_id=item.id,  # type: ignore
2753                            trace_id=trace_id,
2754                            observation_id=span.id,
2755                            dataset_version=dataset_version,
2756                        )
2757
2758                        dataset_run_id = dataset_run_item.dataset_run_id
2759
2760                    except Exception as e:
2761                        langfuse_logger.error(f"Failed to create dataset run item: {e}")
2762
2763                if (
2764                    not isinstance(item, dict)
2765                    and hasattr(item, "dataset_id")
2766                    and hasattr(item, "id")
2767                ):
2768                    dataset_id = item.dataset_id
2769                    dataset_item_id = item.id
2770
2771                    final_observation_metadata.update(
2772                        {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id}
2773                    )
2774
2775                if isinstance(item_metadata, dict):
2776                    final_observation_metadata.update(item_metadata)
2777
2778                experiment_id = dataset_run_id or fallback_experiment_id
2779                experiment_item_id = (
2780                    dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16]
2781                )
2782                span._otel_span.set_attributes(
2783                    {
2784                        k: v
2785                        for k, v in {
2786                            LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT,
2787                            LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description,
2788                            LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize(
2789                                expected_output
2790                            ),
2791                        }.items()
2792                        if v is not None
2793                    }
2794                )
2795
2796                propagated_experiment_attributes = PropagatedExperimentAttributes(
2797                    experiment_id=experiment_id,
2798                    experiment_name=experiment_run_name,
2799                    experiment_metadata=_flatten_and_serialize_metadata_values(
2800                        experiment_metadata
2801                    ),
2802                    experiment_dataset_id=dataset_id,
2803                    experiment_item_id=experiment_item_id,
2804                    experiment_item_metadata=_flatten_and_serialize_metadata_values(
2805                        item_metadata if isinstance(item_metadata, dict) else None
2806                    ),
2807                    experiment_item_root_observation_id=span.id,
2808                )
2809
2810                with _propagate_attributes(experiment=propagated_experiment_attributes):
2811                    output = await _run_task(task, item)
2812
2813                span.update(
2814                    input=input_data,
2815                    output=output,
2816                    metadata=final_observation_metadata,
2817                )
2818
2819            except Exception as e:
2820                span.update(
2821                    output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
2822                )
2823                raise e
2824
2825            # Run evaluators
2826            evaluations = []
2827
2828            for evaluator in evaluators:
2829                try:
2830                    eval_metadata: Optional[Dict[str, Any]] = None
2831
2832                    if isinstance(item, dict):
2833                        eval_metadata = item.get("metadata")
2834                    elif hasattr(item, "metadata"):
2835                        eval_metadata = item.metadata
2836
2837                    with _propagate_attributes(
2838                        experiment=propagated_experiment_attributes
2839                    ):
2840                        eval_results = await _run_evaluator(
2841                            evaluator,
2842                            input=input_data,
2843                            output=output,
2844                            expected_output=expected_output,
2845                            metadata=eval_metadata,
2846                        )
2847                        evaluations.extend(eval_results)
2848
2849                        # Store evaluations as scores
2850                        for evaluation in eval_results:
2851                            self.create_score(
2852                                trace_id=trace_id,
2853                                observation_id=span.id,
2854                                name=evaluation.name,
2855                                value=evaluation.value,  # type: ignore
2856                                comment=evaluation.comment,
2857                                metadata=evaluation.metadata,
2858                                config_id=evaluation.config_id,
2859                                data_type=evaluation.data_type,  # type: ignore
2860                            )
2861
2862                except Exception as e:
2863                    langfuse_logger.error(f"Evaluator failed: {e}")
2864
2865            # Run composite evaluator if provided and we have evaluations
2866            if composite_evaluator and evaluations:
2867                try:
2868                    composite_eval_metadata: Optional[Dict[str, Any]] = None
2869                    if isinstance(item, dict):
2870                        composite_eval_metadata = item.get("metadata")
2871                    elif hasattr(item, "metadata"):
2872                        composite_eval_metadata = item.metadata
2873
2874                    with _propagate_attributes(
2875                        experiment=propagated_experiment_attributes
2876                    ):
2877                        result = composite_evaluator(
2878                            input=input_data,
2879                            output=output,
2880                            expected_output=expected_output,
2881                            metadata=composite_eval_metadata,
2882                            evaluations=evaluations,
2883                        )
2884
2885                        # Handle async composite evaluators
2886                        if asyncio.iscoroutine(result):
2887                            result = await result
2888
2889                        # Normalize to list
2890                        composite_evals: List[Evaluation] = []
2891                        if isinstance(result, (dict, Evaluation)):
2892                            composite_evals = [result]  # type: ignore
2893                        elif isinstance(result, list):
2894                            composite_evals = result  # type: ignore
2895
2896                        # Store composite evaluations as scores and add to evaluations list
2897                        for composite_evaluation in composite_evals:
2898                            self.create_score(
2899                                trace_id=trace_id,
2900                                observation_id=span.id,
2901                                name=composite_evaluation.name,
2902                                value=composite_evaluation.value,  # type: ignore
2903                                comment=composite_evaluation.comment,
2904                                metadata=composite_evaluation.metadata,
2905                                config_id=composite_evaluation.config_id,
2906                                data_type=composite_evaluation.data_type,  # type: ignore
2907                            )
2908                            evaluations.append(composite_evaluation)
2909
2910                except Exception as e:
2911                    langfuse_logger.error(f"Composite evaluator failed: {e}")
2912
2913            return ExperimentItemResult(
2914                item=item,
2915                output=output,
2916                evaluations=evaluations,
2917                trace_id=trace_id,
2918                dataset_run_id=dataset_run_id,
2919            )
2920
2921    def _create_experiment_run_name(
2922        self, *, name: Optional[str] = None, run_name: Optional[str] = None
2923    ) -> str:
2924        if run_name:
2925            return run_name
2926
2927        iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z")
2928
2929        return f"{name} - {iso_timestamp}"
2930
2931    def run_batched_evaluation(
2932        self,
2933        *,
2934        scope: Literal["traces", "observations"],
2935        mapper: MapperFunction,
2936        filter: Optional[str] = None,
2937        fetch_batch_size: int = 50,
2938        fetch_trace_fields: Optional[str] = None,
2939        max_items: Optional[int] = None,
2940        max_retries: int = 3,
2941        evaluators: List[EvaluatorFunction],
2942        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2943        max_concurrency: int = 5,
2944        metadata: Optional[Dict[str, Any]] = None,
2945        _add_observation_scores_to_trace: bool = False,
2946        _additional_trace_tags: Optional[List[str]] = None,
2947        resume_from: Optional[BatchEvaluationResumeToken] = None,
2948        verbose: bool = False,
2949    ) -> BatchEvaluationResult:
2950        """Fetch traces or observations and run evaluations on each item.
2951
2952        This method provides a powerful way to evaluate existing data in Langfuse at scale.
2953        It fetches items based on filters, transforms them using a mapper function, runs
2954        evaluators on each item, and creates scores that are linked back to the original
2955        entities. This is ideal for:
2956
2957        - Running evaluations on production traces after deployment
2958        - Backtesting new evaluation metrics on historical data
2959        - Batch scoring of observations for quality monitoring
2960        - Periodic evaluation runs on recent data
2961
2962        The method uses a streaming/pipeline approach to process items in batches, making
2963        it memory-efficient for large datasets. It includes comprehensive error handling,
2964        retry logic, and resume capability for long-running evaluations.
2965
2966        Args:
2967            scope: The type of items to evaluate. Must be one of:
2968                - "traces": Evaluate complete traces with all their observations
2969                - "observations": Evaluate individual observations (spans, generations, events)
2970            mapper: Function that transforms API response objects into evaluator inputs.
2971                Receives a trace/observation object and returns an EvaluatorInputs
2972                instance with input, output, expected_output, and metadata fields.
2973                Can be sync or async.
2974            evaluators: List of evaluation functions to run on each item. Each evaluator
2975                receives the mapped inputs and returns Evaluation object(s). Evaluator
2976                failures are logged but don't stop the batch evaluation.
2977            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
2978                - '{"tags": ["production"]}'
2979                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
2980                Default: None (fetches all items).
2981            fetch_batch_size: Number of items to fetch per API call and hold in memory.
2982                Larger values may be faster but use more memory. Default: 50.
2983            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
2984            max_items: Maximum total number of items to process. If None, processes all
2985                items matching the filter. Useful for testing or limiting evaluation runs.
2986                Default: None (process all).
2987            max_concurrency: Maximum number of items to evaluate concurrently. Controls
2988                parallelism and resource usage. Default: 5.
2989            composite_evaluator: Optional function that creates a composite score from
2990                item-level evaluations. Receives the original item and its evaluations,
2991                returns a single Evaluation. Useful for weighted averages or combined metrics.
2992                Default: None.
2993            metadata: Optional metadata dict to add to all created scores. Useful for
2994                tracking evaluation runs, versions, or other context. Default: None.
2995            max_retries: Maximum number of retry attempts for failed batch fetches.
2996                Uses exponential backoff (1s, 2s, 4s). Default: 3.
2997            verbose: If True, logs progress information to console. Useful for monitoring
2998                long-running evaluations. Default: False.
2999            resume_from: Optional resume token from a previous incomplete run. Allows
3000                continuing evaluation after interruption or failure. Default: None.
3001
3002
3003        Returns:
3004            BatchEvaluationResult containing:
3005                - total_items_fetched: Number of items fetched from API
3006                - total_items_processed: Number of items successfully evaluated
3007                - total_items_failed: Number of items that failed evaluation
3008                - total_scores_created: Scores created by item-level evaluators
3009                - total_composite_scores_created: Scores created by composite evaluator
3010                - total_evaluations_failed: Individual evaluator failures
3011                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3012                - resume_token: Token for resuming if incomplete (None if completed)
3013                - completed: True if all items processed
3014                - duration_seconds: Total execution time
3015                - failed_item_ids: IDs of items that failed
3016                - error_summary: Error types and counts
3017                - has_more_items: True if max_items reached but more exist
3018
3019        Raises:
3020            ValueError: If invalid scope is provided.
3021
3022        Examples:
3023            Basic trace evaluation:
3024            ```python
3025            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3026
3027            client = Langfuse()
3028
3029            # Define mapper to extract fields from traces
3030            def trace_mapper(trace):
3031                return EvaluatorInputs(
3032                    input=trace.input,
3033                    output=trace.output,
3034                    expected_output=None,
3035                    metadata={"trace_id": trace.id}
3036                )
3037
3038            # Define evaluator
3039            def length_evaluator(*, input, output, expected_output, metadata):
3040                return Evaluation(
3041                    name="output_length",
3042                    value=len(output) if output else 0
3043                )
3044
3045            # Run batch evaluation
3046            result = client.run_batched_evaluation(
3047                scope="traces",
3048                mapper=trace_mapper,
3049                evaluators=[length_evaluator],
3050                filter='{"tags": ["production"]}',
3051                max_items=1000,
3052                verbose=True
3053            )
3054
3055            print(f"Processed {result.total_items_processed} traces")
3056            print(f"Created {result.total_scores_created} scores")
3057            ```
3058
3059            Evaluation with composite scorer:
3060            ```python
3061            def accuracy_evaluator(*, input, output, expected_output, metadata):
3062                # ... evaluation logic
3063                return Evaluation(name="accuracy", value=0.85)
3064
3065            def relevance_evaluator(*, input, output, expected_output, metadata):
3066                # ... evaluation logic
3067                return Evaluation(name="relevance", value=0.92)
3068
3069            def composite_evaluator(*, item, evaluations):
3070                # Weighted average of evaluations
3071                weights = {"accuracy": 0.6, "relevance": 0.4}
3072                total = sum(
3073                    e.value * weights.get(e.name, 0)
3074                    for e in evaluations
3075                    if isinstance(e.value, (int, float))
3076                )
3077                return Evaluation(
3078                    name="composite_score",
3079                    value=total,
3080                    comment=f"Weighted average of {len(evaluations)} metrics"
3081                )
3082
3083            result = client.run_batched_evaluation(
3084                scope="traces",
3085                mapper=trace_mapper,
3086                evaluators=[accuracy_evaluator, relevance_evaluator],
3087                composite_evaluator=composite_evaluator,
3088                filter='{"user_id": "important_user"}',
3089                verbose=True
3090            )
3091            ```
3092
3093            Handling incomplete runs with resume:
3094            ```python
3095            # Initial run that may fail or timeout
3096            result = client.run_batched_evaluation(
3097                scope="observations",
3098                mapper=obs_mapper,
3099                evaluators=[my_evaluator],
3100                max_items=10000,
3101                verbose=True
3102            )
3103
3104            # Check if incomplete
3105            if not result.completed and result.resume_token:
3106                print(f"Processed {result.resume_token.items_processed} items before interruption")
3107
3108                # Resume from where it left off
3109                result = client.run_batched_evaluation(
3110                    scope="observations",
3111                    mapper=obs_mapper,
3112                    evaluators=[my_evaluator],
3113                    resume_from=result.resume_token,
3114                    verbose=True
3115                )
3116
3117            print(f"Total items processed: {result.total_items_processed}")
3118            ```
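
            For reference, the `obs_mapper` used above could be a minimal sketch like this (the observation fields accessed are illustrative):
            ```python
            def obs_mapper(observation):
                return EvaluatorInputs(
                    input=observation.input,
                    output=observation.output,
                    expected_output=None,
                    metadata={"observation_id": observation.id},
                )
            ```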
3119
3120            Monitoring evaluator performance:
3121            ```python
3122            result = client.run_batched_evaluation(...)
3123
3124            for stats in result.evaluator_stats:
3125                success_rate = stats.successful_runs / stats.total_runs
3126                print(f"{stats.name}:")
3127                print(f"  Success rate: {success_rate:.1%}")
3128                print(f"  Scores created: {stats.total_scores_created}")
3129
3130                if stats.failed_runs > 0:
3131                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3132            ```
3133
3134        Note:
3135            - Evaluator failures are logged but don't stop the batch evaluation
3136            - Individual item failures are tracked but don't stop processing
3137            - Fetch failures are retried with exponential backoff
3138            - All scores are automatically flushed to Langfuse at the end
3139            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3140        """
3141        runner = BatchEvaluationRunner(self)
3142
3143        return cast(
3144            BatchEvaluationResult,
3145            run_async_safely(
3146                runner.run_async(
3147                    scope=scope,
3148                    mapper=mapper,
3149                    evaluators=evaluators,
3150                    filter=filter,
3151                    fetch_batch_size=fetch_batch_size,
3152                    fetch_trace_fields=fetch_trace_fields,
3153                    max_items=max_items,
3154                    max_concurrency=max_concurrency,
3155                    composite_evaluator=composite_evaluator,
3156                    metadata=metadata,
3157                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
3158                    _additional_trace_tags=_additional_trace_tags,
3159                    max_retries=max_retries,
3160                    verbose=verbose,
3161                    resume_from=resume_from,
3162                )
3163            ),
3164        )
3165
3166    def auth_check(self) -> bool:
3167        """Check if the provided credentials (public and secret key) are valid.
3168
3169        Raises:
3170            Exception: If no projects were found for the provided credentials.
3171
3172        Note:
3173            This method is blocking. Using it in production code is discouraged.
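
        Example:
            A minimal startup check (a sketch; assumes credentials are set via environment variables):
            ```python
            from langfuse import Langfuse

            langfuse = Langfuse()

            if not langfuse.auth_check():
                raise RuntimeError("Langfuse credentials are invalid")
            ```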
3174        """
3175        try:
3176            projects = self.api.projects.get()
3177            langfuse_logger.debug(
3178                f"Auth check successful, found {len(projects.data)} projects"
3179            )
3180            if len(projects.data) == 0:
3181                raise Exception(
3182                    "Auth check failed, no project found for the keys provided."
3183                )
3184            return True
3185
3186        except AttributeError as e:
3187            langfuse_logger.warning(
3188                f"Auth check failed: Client not properly initialized. Error: {e}"
3189            )
3190            return False
3191
3192        except Error as e:
3193            handle_fern_exception(e)
3194            raise e
3195
3196    def create_dataset(
3197        self,
3198        *,
3199        name: str,
3200        description: Optional[str] = None,
3201        metadata: Optional[Any] = None,
3202        input_schema: Optional[Any] = None,
3203        expected_output_schema: Optional[Any] = None,
3204    ) -> Dataset:
3205        """Create a dataset with the given name on Langfuse.
3206
3207        Args:
3208            name: Name of the dataset to create.
3209            description: Description of the dataset. Defaults to None.
3210            metadata: Additional metadata. Defaults to None.
3211            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3212            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3213
3214        Returns:
3215            Dataset: The created dataset as returned by the Langfuse API.
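
        Example:
            A minimal sketch with an input schema (the dataset name and schema are illustrative):
            ```python
            from langfuse import Langfuse

            langfuse = Langfuse()

            langfuse.create_dataset(
                name="capital_cities",
                description="Country to capital mapping",
                metadata={"owner": "evals-team"},
                input_schema={
                    "type": "object",
                    "properties": {"country": {"type": "string"}},
                    "required": ["country"],
                },
            )
            ```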
3216        """
3217        try:
3218            langfuse_logger.debug(f"Creating dataset {name}")
3219
3220            result = self.api.datasets.create(
3221                name=name,
3222                description=description,
3223                metadata=metadata,
3224                input_schema=input_schema,
3225                expected_output_schema=expected_output_schema,
3226            )
3227
3228            return cast(Dataset, result)
3229
3230        except Error as e:
3231            handle_fern_exception(e)
3232            raise e
3233
3234    def create_dataset_item(
3235        self,
3236        *,
3237        dataset_name: str,
3238        input: Optional[Any] = None,
3239        expected_output: Optional[Any] = None,
3240        metadata: Optional[Any] = None,
3241        source_trace_id: Optional[str] = None,
3242        source_observation_id: Optional[str] = None,
3243        status: Optional[DatasetStatus] = None,
3244        id: Optional[str] = None,
3245    ) -> DatasetItem:
3246        """Create a dataset item.
3247
3248        Upserts if an item with id already exists.
3249
3250        Args:
3251            dataset_name: Name of the dataset in which the dataset item should be created.
3252            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3253            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3254            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3255            source_trace_id: Id of the source trace. Defaults to None.
3256            source_observation_id: Id of the source observation. Defaults to None.
3257            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3258            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3259
3260        Returns:
3261            DatasetItem: The created dataset item as returned by the Langfuse API.
3262
3263        Example:
3264            ```python
3265            from langfuse import Langfuse
3266
3267            langfuse = Langfuse()
3268
3269            # Uploading items to the Langfuse dataset named "capital_cities"
3270            langfuse.create_dataset_item(
3271                dataset_name="capital_cities",
3272                input={"input": {"country": "Italy"}},
3273                expected_output={"expected_output": "Rome"},
3274                metadata={"foo": "bar"}
3275            )
3276            ```
3277        """
3278        try:
3279            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
3280
3281            result = self.api.dataset_items.create(
3282                dataset_name=dataset_name,
3283                input=input,
3284                expected_output=expected_output,
3285                metadata=metadata,
3286                source_trace_id=source_trace_id,
3287                source_observation_id=source_observation_id,
3288                status=status,
3289                id=id,
3290            )
3291
3292            return cast(DatasetItem, result)
3293        except Error as e:
3294            handle_fern_exception(e)
3295            raise e
3296
3297    def resolve_media_references(
3298        self,
3299        *,
3300        obj: Any,
3301        resolve_with: Literal["base64_data_uri"],
3302        max_depth: int = 10,
3303        content_fetch_timeout_seconds: int = 5,
3304    ) -> Any:
3305        """Replace media reference strings in an object with base64 data URIs.
3306
3307        This method recursively traverses an object (up to max_depth) looking for media reference strings
3308        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3309        the provided Langfuse client and replaces the reference string with a base64 data URI.
3310
3311        If fetching media content fails for a reference string, a warning is logged and the reference
3312        string is left unchanged.
3313
3314        Args:
3315            obj: The object to process. Can be a primitive value, array, or nested object.
3316                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3317            resolve_with: The representation of the media content to replace the media reference string with.
3318                Currently only "base64_data_uri" is supported.
3319            max_depth: int: The maximum depth to traverse the object. Default is 10.
3320            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3321
3322        Returns:
3323            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3324            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3325
3326        Example:
3327            obj = {
3328                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3329                "nested": {
3330                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3331                }
3332            }
3333
3334            result = langfuse_client.resolve_media_references(obj=obj, resolve_with="base64_data_uri")
3335
3336            # Result:
3337            # {
3338            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3339            #     "nested": {
3340            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3341            #     }
3342            # }
3343        """
3344        return LangfuseMedia.resolve_media_references(
3345            langfuse_client=self,
3346            obj=obj,
3347            resolve_with=resolve_with,
3348            max_depth=max_depth,
3349            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3350        )
3351
3352    @overload
3353    def get_prompt(
3354        self,
3355        name: str,
3356        *,
3357        version: Optional[int] = None,
3358        label: Optional[str] = None,
3359        type: Literal["chat"],
3360        cache_ttl_seconds: Optional[int] = None,
3361        fallback: Optional[List[ChatMessageDict]] = None,
3362        max_retries: Optional[int] = None,
3363        fetch_timeout_seconds: Optional[int] = None,
3364    ) -> ChatPromptClient: ...
3365
3366    @overload
3367    def get_prompt(
3368        self,
3369        name: str,
3370        *,
3371        version: Optional[int] = None,
3372        label: Optional[str] = None,
3373        type: Literal["text"] = "text",
3374        cache_ttl_seconds: Optional[int] = None,
3375        fallback: Optional[str] = None,
3376        max_retries: Optional[int] = None,
3377        fetch_timeout_seconds: Optional[int] = None,
3378    ) -> TextPromptClient: ...
3379
3380    def get_prompt(
3381        self,
3382        name: str,
3383        *,
3384        version: Optional[int] = None,
3385        label: Optional[str] = None,
3386        type: Literal["chat", "text"] = "text",
3387        cache_ttl_seconds: Optional[int] = None,
3388        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3389        max_retries: Optional[int] = None,
3390        fetch_timeout_seconds: Optional[int] = None,
3391    ) -> PromptClient:
3392        """Get a prompt.
3393
3394        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3395        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3396        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3397        return the expired prompt as a fallback.
3398
3399        Args:
3400            name (str): The name of the prompt to retrieve.
3401
3402        Keyword Args:
3403            version (Optional[int]): The version of the prompt to retrieve. If neither version nor label is specified, the `production` label is returned. Specify either version or label, not both.
3404            label (Optional[str]): The label of the prompt to retrieve. If neither version nor label is specified, the `production` label is returned. Specify either version or label, not both.
3405            cache_ttl_seconds (Optional[int]): Time-to-live in seconds for caching the prompt. Must be specified as a
3406            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3407            type (Literal["chat", "text"]): The type of the prompt to retrieve. Defaults to "text".
3408            fallback (Union[Optional[List[ChatMessageDict]], Optional[str]]): The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3409            max_retries (Optional[int]): The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3410            fetch_timeout_seconds (Optional[int]): The timeout in seconds for fetching the prompt. Defaults to the SDK-wide timeout, which is 5 seconds by default.
3411
3412        Returns:
3413            The prompt object retrieved from the cache, or fetched directly if not cached or expired. The return type is
3414            - TextPromptClient, if the type argument is 'text'.
3415            - ChatPromptClient, if the type argument is 'chat'.
3416
3417        Raises:
3418            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3419            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
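
        Example:
            A typical fetch with a fallback (the prompt name and variable are illustrative):
            ```python
            from langfuse import Langfuse

            langfuse = Langfuse()

            prompt = langfuse.get_prompt(
                "movie-critic",
                label="production",
                fallback="Summarize this movie: {{movie_title}}",
            )
            compiled = prompt.compile(movie_title="Inception")
            ```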
3420        """
3421        if self._resources is None:
3422            raise Error(
3423                "SDK is not correctly initialized. Check the init logs for more details."
3424            )
3425        if version is not None and label is not None:
3426            raise ValueError("Cannot specify both version and label at the same time.")
3427
3428        if not name:
3429            raise ValueError("Prompt name cannot be empty.")
3430
3431        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3432        bounded_max_retries = self._get_bounded_max_retries(
3433            max_retries, default_max_retries=2, max_retries_upper_bound=4
3434        )
3435
3436        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3437        cached_prompt = self._resources.prompt_cache.get(cache_key)
3438
3439        if cached_prompt is None or cache_ttl_seconds == 0:
3440            langfuse_logger.debug(
3441                f"Prompt '{cache_key}' not found in cache or caching disabled."
3442            )
3443            try:
3444                return self._fetch_prompt_and_update_cache(
3445                    name,
3446                    version=version,
3447                    label=label,
3448                    ttl_seconds=cache_ttl_seconds,
3449                    max_retries=bounded_max_retries,
3450                    fetch_timeout_seconds=fetch_timeout_seconds,
3451                )
3452            except Exception as e:
3453                if fallback:
3454                    langfuse_logger.warning(
3455                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3456                    )
3457
3458                    fallback_client_args: Dict[str, Any] = {
3459                        "name": name,
3460                        "prompt": fallback,
3461                        "type": type,
3462                        "version": version or 0,
3463                        "config": {},
3464                        "labels": [label] if label else [],
3465                        "tags": [],
3466                    }
3467
3468                    if type == "text":
3469                        return TextPromptClient(
3470                            prompt=Prompt_Text(**fallback_client_args),
3471                            is_fallback=True,
3472                        )
3473
3474                    if type == "chat":
3475                        return ChatPromptClient(
3476                            prompt=Prompt_Chat(**fallback_client_args),
3477                            is_fallback=True,
3478                        )
3479
3480                raise e
3481
3482        if cached_prompt.is_expired():
3483            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3484            try:
3485                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3486                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3487
3488                def refresh_task() -> None:
3489                    self._fetch_prompt_and_update_cache(
3490                        name,
3491                        version=version,
3492                        label=label,
3493                        ttl_seconds=cache_ttl_seconds,
3494                        max_retries=bounded_max_retries,
3495                        fetch_timeout_seconds=fetch_timeout_seconds,
3496                    )
3497
3498                self._resources.prompt_cache.add_refresh_prompt_task_if_current(
3499                    cache_key,
3500                    cached_prompt,
3501                    refresh_task,
3502                )
3503                langfuse_logger.debug(
3504                    f"Returning stale prompt '{cache_key}' from cache."
3505                )
3506                # return stale prompt
3507                return cached_prompt.value
3508
3509            except Exception as e:
3510                langfuse_logger.warning(
3511                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3512                )
3513                # creation of refresh prompt task failed, return stale prompt
3514                return cached_prompt.value
3515
3516        return cached_prompt.value
3517
3518    def _fetch_prompt_and_update_cache(
3519        self,
3520        name: str,
3521        *,
3522        version: Optional[int] = None,
3523        label: Optional[str] = None,
3524        ttl_seconds: Optional[int] = None,
3525        max_retries: int,
3526        fetch_timeout_seconds: Optional[int],
3527    ) -> PromptClient:
3528        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3529        langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...")
3530
3531        try:
3532
3533            @backoff.on_exception(
3534                backoff.constant, Exception, max_tries=max_retries + 1, logger=None
3535            )
3536            def fetch_prompts() -> Any:
3537                return self.api.prompts.get(
3538                    self._url_encode(name),
3539                    version=version,
3540                    label=label,
3541                    request_options={
3542                        "timeout_in_seconds": fetch_timeout_seconds,
3543                    }
3544                    if fetch_timeout_seconds is not None
3545                    else None,
3546                )
3547
3548            prompt_response = fetch_prompts()
3549
3550            prompt: PromptClient
3551            if prompt_response.type == "chat":
3552                prompt = ChatPromptClient(prompt_response)
3553            else:
3554                prompt = TextPromptClient(prompt_response)
3555
3556            if self._resources is not None:
3557                self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds)
3558
3559            return prompt
3560
3561        except NotFoundError as not_found_error:
3562            langfuse_logger.warning(
3563                f"Prompt '{cache_key}' not found during refresh, evicting from cache."
3564            )
3565            if self._resources is not None:
3566                self._resources.prompt_cache.delete(cache_key)
3567            raise not_found_error
3568
3569        except Exception as e:
3570            langfuse_logger.error(
3571                f"Error while fetching prompt '{cache_key}': {str(e)}"
3572            )
3573            raise e
3574
3575    def _get_bounded_max_retries(
3576        self,
3577        max_retries: Optional[int],
3578        *,
3579        default_max_retries: int = 2,
3580        max_retries_upper_bound: int = 4,
3581    ) -> int:
3582        if max_retries is None:
3583            return default_max_retries
3584
3585        bounded_max_retries = min(
3586            max(max_retries, 0),
3587            max_retries_upper_bound,
3588        )
3589
3590        return bounded_max_retries
3591
3592    @overload
3593    def create_prompt(
3594        self,
3595        *,
3596        name: str,
3597        prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]],
3598        labels: List[str] = [],
3599        tags: Optional[List[str]] = None,
3600        type: Optional[Literal["chat"]],
3601        config: Optional[Any] = None,
3602        commit_message: Optional[str] = None,
3603    ) -> ChatPromptClient: ...
3604
3605    @overload
3606    def create_prompt(
3607        self,
3608        *,
3609        name: str,
3610        prompt: str,
3611        labels: List[str] = [],
3612        tags: Optional[List[str]] = None,
3613        type: Optional[Literal["text"]] = "text",
3614        config: Optional[Any] = None,
3615        commit_message: Optional[str] = None,
3616    ) -> TextPromptClient: ...
3617
3618    def create_prompt(
3619        self,
3620        *,
3621        name: str,
3622        prompt: Union[
3623            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3624        ],
3625        labels: List[str] = [],
3626        tags: Optional[List[str]] = None,
3627        type: Optional[Literal["chat", "text"]] = "text",
3628        config: Optional[Any] = None,
3629        commit_message: Optional[str] = None,
3630    ) -> PromptClient:
3631        """Create a new prompt in Langfuse.
3632
3633        Keyword Args:
3634            name: The name of the prompt to be created.
3635            prompt: The content of the prompt to be created.
3636            is_active [DEPRECATED]: A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3637            labels: The labels of the prompt. Defaults to []. To create a default-served prompt, add the 'production' label.
3638            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3639            config: Additional structured data to be saved with the prompt. Defaults to None.
3640            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3641            commit_message: Optional string describing the change.
3642
3643        Returns:
3644            TextPromptClient: The prompt if type argument is 'text'.
3645            ChatPromptClient: The prompt if type argument is 'chat'.
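
        Example:
            A minimal sketch creating a chat prompt served under the 'production' label (the name and config values are illustrative):
            ```python
            from langfuse import Langfuse

            langfuse = Langfuse()

            langfuse.create_prompt(
                name="movie-critic-chat",
                type="chat",
                prompt=[
                    {"role": "system", "content": "You critique {{genre}} movies."},
                ],
                labels=["production"],
                config={"model": "gpt-4o", "temperature": 0.7},
            )
            ```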
3646        """
3647        try:
3648            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3649
3650            if type == "chat":
3651                if not isinstance(prompt, list):
3652                    raise ValueError(
3653                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3654                    )
3655                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
3656                    CreateChatPromptRequest(
3657                        name=name,
3658                        prompt=cast(Any, prompt),
3659                        labels=labels,
3660                        tags=tags,
3661                        config=config or {},
3662                        commit_message=commit_message,
3663                        type=CreateChatPromptType.CHAT,
3664                    )
3665                )
3666                server_prompt = self.api.prompts.create(request=request)
3667
3668                if self._resources is not None:
3669                    self._resources.prompt_cache.invalidate(name)
3670
3671                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3672
3673            if not isinstance(prompt, str):
3674                raise ValueError("For 'text' type, 'prompt' must be a string.")
3675
3676            request = CreateTextPromptRequest(
3677                name=name,
3678                prompt=prompt,
3679                labels=labels,
3680                tags=tags,
3681                config=config or {},
3682                commit_message=commit_message,
3683            )
3684
3685            server_prompt = self.api.prompts.create(request=request)
3686
3687            if self._resources is not None:
3688                self._resources.prompt_cache.invalidate(name)
3689
3690            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3691
3692        except Error as e:
3693            handle_fern_exception(e)
3694            raise e
3695
3696    def update_prompt(
3697        self,
3698        *,
3699        name: str,
3700        version: int,
3701        new_labels: List[str] = [],
3702    ) -> Any:
3703        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
3704
3705        Args:
3706            name (str): The name of the prompt to update.
3707            version (int): The version number of the prompt to update.
3708            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3709
3710        Returns:
3711            Prompt: The updated prompt from the Langfuse API.
3712
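        Example:
            A minimal sketch promoting a prompt version (the name and version are illustrative):
            ```python
            from langfuse import Langfuse

            langfuse = Langfuse()

            langfuse.update_prompt(
                name="movie-critic",
                version=2,
                new_labels=["production"],
            )
            ```
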
3713        """
3714        updated_prompt = self.api.prompt_version.update(
3715            name=self._url_encode(name),
3716            version=version,
3717            new_labels=new_labels,
3718        )
3719
3720        if self._resources is not None:
3721            self._resources.prompt_cache.invalidate(name)
3722
3723        return updated_prompt
3724
3725    def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str:
3726        # httpx ≥ 0.28 does its own WHATWG-compliant quoting (e.g. encodes bare
3727        # “%”, “?”, “#”, “|”, … in query/path parts).  Re-quoting here would
3728        # double-encode, so we skip when the value is about to be sent straight
3729        # to httpx (`is_url_param=True`) and the installed version is ≥ 0.28.
3730        if is_url_param and Version(httpx.__version__) >= Version("0.28.0"):
3731            return url
3732
3733        # urllib.parse.quote does not escape slashes "/" by default; we pass
3734        # safe="" to force escaping of slashes.
3735        # This is necessary for prompts in prompt folders.
3736        return urllib.parse.quote(url, safe="")
3737
3738    def clear_prompt_cache(self) -> None:
3739        """Clear the entire prompt cache, removing all cached prompts.
3740
3741        This method is useful when you want to force a complete refresh of all
3742        cached prompts, for example after major updates or when you need to
3743        ensure the latest versions are fetched from the server.
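
        Example:
            A quick sketch (assumes an initialized `langfuse` client):
            ```python
            langfuse.clear_prompt_cache()  # the next get_prompt() call fetches fresh from the server
            ```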
3744        """
3745        if self._resources is not None:
3746            self._resources.prompt_cache.clear()

Main client for Langfuse tracing and platform features.

This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.

The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.

Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.

Attributes:
  • api: Synchronous API client for Langfuse backend communication
  • async_api: Asynchronous API client for Langfuse backend communication
  • _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components

Arguments:
  • public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
  • secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
  • base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
  • host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
  • timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
  • httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
  • debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
  • tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
  • flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
  • flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
  • environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
  • release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
  • media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
  • sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
  • mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API. See the sketch after this list.
  • blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use should_export_span instead. Equivalent behavior:

    from langfuse.span_filter import is_default_export_span
    blocked = {"sqlite", "requests"}
    
    should_export_span = lambda span: (
        is_default_export_span(span)
        and (
            span.instrumentation_scope is None
            or span.instrumentation_scope.name not in blocked
        )
    )
    
  • should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with gen_ai.* attributes, and known LLM instrumentation scopes).

  • additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If span_exporter is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
  • tracer_provider(Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
  • span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire base_url, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include x-langfuse-ingestion-version=4 on the exporter to enable real time processing of exported spans.
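
A masking callback receives the payload about to be exported and returns the redacted version. A minimal sketch of a mask function (the redaction rule is illustrative; the callback is assumed to be invoked with the payload as the data keyword argument):

    import re

    def mask(data, **kwargs):
        # Redact anything that looks like an email address before export
        if isinstance(data, str):
            return re.sub(r"[\w.+-]+@[\w-]+\.[\w.]+", "<redacted-email>", data)
        return data

    langfuse = Langfuse(mask=mask)
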
Example:
from langfuse import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    host="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_observation(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
Langfuse( *, public_key: Optional[str] = None, secret_key: Optional[str] = None, base_url: Optional[str] = None, host: Optional[str] = None, timeout: Optional[int] = None, httpx_client: Optional[httpx.Client] = None, debug: bool = False, tracing_enabled: Optional[bool] = True, flush_at: Optional[int] = None, flush_interval: Optional[float] = None, environment: Optional[str] = None, release: Optional[str] = None, media_upload_thread_count: Optional[int] = None, sample_rate: Optional[float] = None, mask: Optional[langfuse.types.MaskFunction] = None, blocked_instrumentation_scopes: Optional[List[str]] = None, should_export_span: Optional[Callable[[opentelemetry.sdk.trace.ReadableSpan], bool]] = None, additional_headers: Optional[Dict[str, str]] = None, tracer_provider: Optional[opentelemetry.sdk.trace.TracerProvider] = None, span_exporter: Optional[opentelemetry.sdk.trace.export.SpanExporter] = None)
232    def __init__(
233        self,
234        *,
235        public_key: Optional[str] = None,
236        secret_key: Optional[str] = None,
237        base_url: Optional[str] = None,
238        host: Optional[str] = None,
239        timeout: Optional[int] = None,
240        httpx_client: Optional[httpx.Client] = None,
241        debug: bool = False,
242        tracing_enabled: Optional[bool] = True,
243        flush_at: Optional[int] = None,
244        flush_interval: Optional[float] = None,
245        environment: Optional[str] = None,
246        release: Optional[str] = None,
247        media_upload_thread_count: Optional[int] = None,
248        sample_rate: Optional[float] = None,
249        mask: Optional[MaskFunction] = None,
250        blocked_instrumentation_scopes: Optional[List[str]] = None,
251        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
252        additional_headers: Optional[Dict[str, str]] = None,
253        tracer_provider: Optional[TracerProvider] = None,
254        span_exporter: Optional[SpanExporter] = None,
255    ):
256        self._base_url = (
257            base_url
258            or os.environ.get(LANGFUSE_BASE_URL)
259            or host
260            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
261        )
262        self._environment = environment or cast(
263            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
264        )
265        self._release = (
266            release
267            or os.environ.get(LANGFUSE_RELEASE, None)
268            or get_common_release_envs()
269        )
270        self._project_id: Optional[str] = None
271        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
272        if not 0.0 <= sample_rate <= 1.0:
273            raise ValueError(
274                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
275            )
276
277        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
278
279        self._tracing_enabled = (
280            tracing_enabled
281            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
282        )
283        if not self._tracing_enabled:
284            langfuse_logger.info(
285                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
286            )
287
288        debug = (
289            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
290        )
291        if debug:
292            logging.basicConfig(
293                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
294            )
295            langfuse_logger.setLevel(logging.DEBUG)
296
297        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
298        if public_key is None:
299            langfuse_logger.warning(
300                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
301                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
302            )
303            self._otel_tracer = otel_trace_api.NoOpTracer()
304            return
305
306        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
307        if secret_key is None:
308            langfuse_logger.warning(
309                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
310                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
311            )
312            self._otel_tracer = otel_trace_api.NoOpTracer()
313            return
314
315        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
316            langfuse_logger.warning(
317                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
318            )
319
320        if blocked_instrumentation_scopes is not None:
321            warnings.warn(
322                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
323                "Use `should_export_span` instead. Example: "
324                "from langfuse.span_filter import is_default_export_span; "
325                'blocked={"scope"}; should_export_span=lambda span: '
326                "is_default_export_span(span) and (span.instrumentation_scope is None or "
327                "span.instrumentation_scope.name not in blocked).",
328                DeprecationWarning,
329                stacklevel=2,
330            )
331
332        # Initialize api and tracer if requirements are met
333        self._resources = LangfuseResourceManager(
334            public_key=public_key,
335            secret_key=secret_key,
336            base_url=self._base_url,
337            timeout=timeout,
338            environment=self._environment,
339            release=release,
340            flush_at=flush_at,
341            flush_interval=flush_interval,
342            httpx_client=httpx_client,
343            media_upload_thread_count=media_upload_thread_count,
344            sample_rate=sample_rate,
345            mask=mask,
346            tracing_enabled=self._tracing_enabled,
347            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
348            should_export_span=should_export_span,
349            additional_headers=additional_headers,
350            tracer_provider=tracer_provider,
351            span_exporter=span_exporter,
352        )
353        self._mask = self._resources.mask
354
355        self._otel_tracer = (
356            self._resources.tracer
357            if self._tracing_enabled and self._resources.tracer is not None
358            else otel_trace_api.NoOpTracer()
359        )
360        self.api = self._resources.api
361        self.async_api = self._resources.async_api
api
async_api
def start_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> Union[LangfuseSpan, LangfuseGeneration, LangfuseAgent, LangfuseTool, LangfuseChain, LangfuseRetriever, LangfuseEvaluator, LangfuseEmbedding, LangfuseGuardrail]:
510    def start_observation(
511        self,
512        *,
513        trace_context: Optional[TraceContext] = None,
514        name: str,
515        as_type: ObservationTypeLiteralNoEvent = "span",
516        input: Optional[Any] = None,
517        output: Optional[Any] = None,
518        metadata: Optional[Any] = None,
519        version: Optional[str] = None,
520        level: Optional[SpanLevel] = None,
521        status_message: Optional[str] = None,
522        completion_start_time: Optional[datetime] = None,
523        model: Optional[str] = None,
524        model_parameters: Optional[Dict[str, MapValue]] = None,
525        usage_details: Optional[Dict[str, int]] = None,
526        cost_details: Optional[Dict[str, float]] = None,
527        prompt: Optional[PromptClient] = None,
528    ) -> Union[
529        LangfuseSpan,
530        LangfuseGeneration,
531        LangfuseAgent,
532        LangfuseTool,
533        LangfuseChain,
534        LangfuseRetriever,
535        LangfuseEvaluator,
536        LangfuseEmbedding,
537        LangfuseGuardrail,
538    ]:
539        """Create a new observation of the specified type.
540
541        This method creates a new observation but does not set it as the current span in the
542        context. To create and use an observation within a context, use start_as_current_observation().
543
544        Args:
545            trace_context: Optional context for connecting to an existing trace
546            name: Name of the observation
547            as_type: Type of observation to create (defaults to "span")
548            input: Input data for the operation
549            output: Output data from the operation
550            metadata: Additional metadata to associate with the observation
551            version: Version identifier for the code or component
552            level: Importance level of the observation
553            status_message: Optional status message for the observation
554            completion_start_time: When the model started generating (for generation types)
555            model: Name/identifier of the AI model used (for generation types)
556            model_parameters: Parameters used for the model (for generation types)
557            usage_details: Token usage information (for generation types)
558            cost_details: Cost information (for generation types)
559            prompt: Associated prompt template (for generation types)
560
561        Returns:
562            An observation object of the appropriate type that must be ended with .end()
563        """
564        if trace_context:
565            trace_id = trace_context.get("trace_id", None)
566            parent_span_id = trace_context.get("parent_span_id", None)
567
568            if trace_id:
569                remote_parent_span = self._create_remote_parent_span(
570                    trace_id=trace_id, parent_span_id=parent_span_id
571                )
572
573                with otel_trace_api.use_span(
574                    cast(otel_trace_api.Span, remote_parent_span)
575                ):
576                    otel_span = self._otel_tracer.start_span(name=name)
577                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
578
579                    return self._create_observation_from_otel_span(
580                        otel_span=otel_span,
581                        as_type=as_type,
582                        input=input,
583                        output=output,
584                        metadata=metadata,
585                        version=version,
586                        level=level,
587                        status_message=status_message,
588                        completion_start_time=completion_start_time,
589                        model=model,
590                        model_parameters=model_parameters,
591                        usage_details=usage_details,
592                        cost_details=cost_details,
593                        prompt=prompt,
594                    )
595
596        otel_span = self._otel_tracer.start_span(name=name)
597
598        return self._create_observation_from_otel_span(
599            otel_span=otel_span,
600            as_type=as_type,
601            input=input,
602            output=output,
603            metadata=metadata,
604            version=version,
605            level=level,
606            status_message=status_message,
607            completion_start_time=completion_start_time,
608            model=model,
609            model_parameters=model_parameters,
610            usage_details=usage_details,
611            cost_details=cost_details,
612            prompt=prompt,
613        )

Create a new observation of the specified type.

This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation
  • status_message: Optional status message for the observation
  • completion_start_time: When the model started generating (for generation types)
  • model: Name/identifier of the AI model used (for generation types)
  • model_parameters: Parameters used for the model (for generation types)
  • usage_details: Token usage information (for generation types)
  • cost_details: Cost information (for generation types)
  • prompt: Associated prompt template (for generation types)
Returns:

An observation object of the appropriate type that must be ended with .end()
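
Example (a minimal sketch; unlike start_as_current_observation, the returned object is not set as the current span and must be ended explicitly):

generation = langfuse.start_observation(
    name="summarize",
    as_type="generation",
    model="gpt-4",
)
try:
    response = llm.generate(...)  # hypothetical LLM call
    generation.update(output=response)
finally:
    generation.end()  # required: start_observation does not auto-end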

def start_as_current_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, end_on_exit: Optional[bool] = None) -> Union[opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration], opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan], opentelemetry.util._decorator._AgnosticContextManager[LangfuseAgent], opentelemetry.util._decorator._AgnosticContextManager[LangfuseTool], opentelemetry.util._decorator._AgnosticContextManager[LangfuseChain], opentelemetry.util._decorator._AgnosticContextManager[LangfuseRetriever], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEvaluator], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEmbedding], opentelemetry.util._decorator._AgnosticContextManager[LangfuseGuardrail]]:
 843    def start_as_current_observation(
 844        self,
 845        *,
 846        trace_context: Optional[TraceContext] = None,
 847        name: str,
 848        as_type: ObservationTypeLiteralNoEvent = "span",
 849        input: Optional[Any] = None,
 850        output: Optional[Any] = None,
 851        metadata: Optional[Any] = None,
 852        version: Optional[str] = None,
 853        level: Optional[SpanLevel] = None,
 854        status_message: Optional[str] = None,
 855        completion_start_time: Optional[datetime] = None,
 856        model: Optional[str] = None,
 857        model_parameters: Optional[Dict[str, MapValue]] = None,
 858        usage_details: Optional[Dict[str, int]] = None,
 859        cost_details: Optional[Dict[str, float]] = None,
 860        prompt: Optional[PromptClient] = None,
 861        end_on_exit: Optional[bool] = None,
 862    ) -> Union[
 863        _AgnosticContextManager[LangfuseGeneration],
 864        _AgnosticContextManager[LangfuseSpan],
 865        _AgnosticContextManager[LangfuseAgent],
 866        _AgnosticContextManager[LangfuseTool],
 867        _AgnosticContextManager[LangfuseChain],
 868        _AgnosticContextManager[LangfuseRetriever],
 869        _AgnosticContextManager[LangfuseEvaluator],
 870        _AgnosticContextManager[LangfuseEmbedding],
 871        _AgnosticContextManager[LangfuseGuardrail],
 872    ]:
 873        """Create a new observation and set it as the current span in a context manager.
 874
 875        This method creates a new observation of the specified type and sets it as the
 876        current span within a context manager. Use this method with a 'with' statement to
 877        automatically handle the observation lifecycle within a code block.
 878
 879        The created observation will be the child of the current span in the context.
 880
 881        Args:
 882            trace_context: Optional context for connecting to an existing trace
 883            name: Name of the observation (e.g., function or operation name)
 884            as_type: Type of observation to create (defaults to "span")
 885            input: Input data for the operation (can be any JSON-serializable object)
 886            output: Output data from the operation (can be any JSON-serializable object)
 887            metadata: Additional metadata to associate with the observation
 888            version: Version identifier for the code or component
 889            level: Importance level of the observation (info, warning, error)
 890            status_message: Optional status message for the observation
 891            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 892
 893            The following parameters are available when as_type is: "generation" or "embedding".
 894            completion_start_time: When the model started generating the response
 895            model: Name/identifier of the AI model used (e.g., "gpt-4")
 896            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 897            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 898            cost_details: Cost information for the model call
 899            prompt: Associated prompt template from Langfuse prompt management
 900
 901        Returns:
 902            A context manager that yields the appropriate observation type based on as_type
 903
 904        Example:
 905            ```python
 906            # Create a span
 907            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
 908                # Do work
 909                result = process_data()
 910                span.update(output=result)
 911
 912                # Create a child span automatically
 913                with span.start_as_current_observation(name="sub-operation") as child_span:
 914                    # Do sub-operation work
 915                    child_span.update(output="sub-result")
 916
 917            # Create a tool observation
 918            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
 919                # Do tool work
 920                results = search_web(query)
 921                tool.update(output=results)
 922
 923            # Create a generation observation
 924            with langfuse.start_as_current_observation(
 925                name="answer-generation",
 926                as_type="generation",
 927                model="gpt-4"
 928            ) as generation:
 929                # Generate answer
 930                response = llm.generate(...)
 931                generation.update(output=response)
 932            ```
 933        """
 934        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 935            if trace_context:
 936                trace_id = trace_context.get("trace_id", None)
 937                parent_span_id = trace_context.get("parent_span_id", None)
 938
 939                if trace_id:
 940                    remote_parent_span = self._create_remote_parent_span(
 941                        trace_id=trace_id, parent_span_id=parent_span_id
 942                    )
 943
 944                    return cast(
 945                        Union[
 946                            _AgnosticContextManager[LangfuseGeneration],
 947                            _AgnosticContextManager[LangfuseEmbedding],
 948                        ],
 949                        self._create_span_with_parent_context(
 950                            as_type=as_type,
 951                            name=name,
 952                            remote_parent_span=remote_parent_span,
 953                            parent=None,
 954                            end_on_exit=end_on_exit,
 955                            input=input,
 956                            output=output,
 957                            metadata=metadata,
 958                            version=version,
 959                            level=level,
 960                            status_message=status_message,
 961                            completion_start_time=completion_start_time,
 962                            model=model,
 963                            model_parameters=model_parameters,
 964                            usage_details=usage_details,
 965                            cost_details=cost_details,
 966                            prompt=prompt,
 967                        ),
 968                    )
 969
 970            return cast(
 971                Union[
 972                    _AgnosticContextManager[LangfuseGeneration],
 973                    _AgnosticContextManager[LangfuseEmbedding],
 974                ],
 975                self._start_as_current_otel_span_with_processed_media(
 976                    as_type=as_type,
 977                    name=name,
 978                    end_on_exit=end_on_exit,
 979                    input=input,
 980                    output=output,
 981                    metadata=metadata,
 982                    version=version,
 983                    level=level,
 984                    status_message=status_message,
 985                    completion_start_time=completion_start_time,
 986                    model=model,
 987                    model_parameters=model_parameters,
 988                    usage_details=usage_details,
 989                    cost_details=cost_details,
 990                    prompt=prompt,
 991                ),
 992            )
 993
 994        if as_type in get_observation_types_list(ObservationTypeSpanLike):
 995            if trace_context:
 996                trace_id = trace_context.get("trace_id", None)
 997                parent_span_id = trace_context.get("parent_span_id", None)
 998
 999                if trace_id:
1000                    remote_parent_span = self._create_remote_parent_span(
1001                        trace_id=trace_id, parent_span_id=parent_span_id
1002                    )
1003
1004                    return cast(
1005                        Union[
1006                            _AgnosticContextManager[LangfuseSpan],
1007                            _AgnosticContextManager[LangfuseAgent],
1008                            _AgnosticContextManager[LangfuseTool],
1009                            _AgnosticContextManager[LangfuseChain],
1010                            _AgnosticContextManager[LangfuseRetriever],
1011                            _AgnosticContextManager[LangfuseEvaluator],
1012                            _AgnosticContextManager[LangfuseGuardrail],
1013                        ],
1014                        self._create_span_with_parent_context(
1015                            as_type=as_type,
1016                            name=name,
1017                            remote_parent_span=remote_parent_span,
1018                            parent=None,
1019                            end_on_exit=end_on_exit,
1020                            input=input,
1021                            output=output,
1022                            metadata=metadata,
1023                            version=version,
1024                            level=level,
1025                            status_message=status_message,
1026                        ),
1027                    )
1028
1029            return cast(
1030                Union[
1031                    _AgnosticContextManager[LangfuseSpan],
1032                    _AgnosticContextManager[LangfuseAgent],
1033                    _AgnosticContextManager[LangfuseTool],
1034                    _AgnosticContextManager[LangfuseChain],
1035                    _AgnosticContextManager[LangfuseRetriever],
1036                    _AgnosticContextManager[LangfuseEvaluator],
1037                    _AgnosticContextManager[LangfuseGuardrail],
1038                ],
1039                self._start_as_current_otel_span_with_processed_media(
1040                    as_type=as_type,
1041                    name=name,
1042                    end_on_exit=end_on_exit,
1043                    input=input,
1044                    output=output,
1045                    metadata=metadata,
1046                    version=version,
1047                    level=level,
1048                    status_message=status_message,
1049                ),
1050            )
1051
1052        # This should never be reached since all valid types are handled above
1053        langfuse_logger.warning(
1054            f"Unknown observation type: {as_type}, falling back to span"
1055        )
1056        return self._start_as_current_otel_span_with_processed_media(
1057            as_type="span",
1058            name=name,
1059            end_on_exit=end_on_exit,
1060            input=input,
1061            output=output,
1062            metadata=metadata,
1063            version=version,
1064            level=level,
1065            status_message=status_message,
1066        )

Create a new observation and set it as the current span in a context manager.

This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.

The created observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation (e.g., function or operation name)
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the observation
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
  The following parameters apply only when as_type is "generation" or "embedding":
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A context manager that yields the appropriate observation type based on as_type

Example:
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_observation(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
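
A minimal sketch of end_on_exit=False, per the parameter docs above (the observation outlives the context manager and must be ended manually to avoid leaks; kick_off_background_work is a hypothetical helper):

with langfuse.start_as_current_observation(name="long-task", end_on_exit=False) as span:
    kick_off_background_work(span)  # hands the span to another worker

# The span is still open here; the worker must call span.end() when done.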
def update_current_generation( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> None:
1228    def update_current_generation(
1229        self,
1230        *,
1231        name: Optional[str] = None,
1232        input: Optional[Any] = None,
1233        output: Optional[Any] = None,
1234        metadata: Optional[Any] = None,
1235        version: Optional[str] = None,
1236        level: Optional[SpanLevel] = None,
1237        status_message: Optional[str] = None,
1238        completion_start_time: Optional[datetime] = None,
1239        model: Optional[str] = None,
1240        model_parameters: Optional[Dict[str, MapValue]] = None,
1241        usage_details: Optional[Dict[str, int]] = None,
1242        cost_details: Optional[Dict[str, float]] = None,
1243        prompt: Optional[PromptClient] = None,
1244    ) -> None:
1245        """Update the current active generation span with new information.
1246
1247        This method updates the current generation span in the active context with
1248        additional information. It's useful for adding output, usage stats, or other
1249        details that become available during or after model generation.
1250
1251        Args:
1252            name: The generation name
1253            input: Updated input data for the model
1254            output: Output from the model (e.g., completions)
1255            metadata: Additional metadata to associate with the generation
1256            version: Version identifier for the model or component
1257            level: Importance level of the generation (info, warning, error)
1258            status_message: Optional status message for the generation
1259            completion_start_time: When the model started generating the response
1260            model: Name/identifier of the AI model used (e.g., "gpt-4")
1261            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1262            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1263            cost_details: Cost information for the model call
1264            prompt: Associated prompt template from Langfuse prompt management
1265
1266        Example:
1267            ```python
1268            with langfuse.start_as_current_generation(name="answer-query") as generation:
1269                # Initial setup and API call
1270                response = llm.generate(...)
1271
1272                # Update with results that weren't available at creation time
1273                langfuse.update_current_generation(
1274                    output=response.text,
1275                    usage_details={
1276                        "prompt_tokens": response.usage.prompt_tokens,
1277                        "completion_tokens": response.usage.completion_tokens
1278                    }
1279                )
1280            ```
1281        """
1282        if not self._tracing_enabled:
1283            langfuse_logger.debug(
1284                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1285            )
1286            return
1287
1288        current_otel_span = self._get_current_otel_span()
1289
1290        if current_otel_span is not None:
1291            generation = LangfuseGeneration(
1292                otel_span=current_otel_span, langfuse_client=self
1293            )
1294
1295            if name:
1296                current_otel_span.update_name(name)
1297
1298            generation.update(
1299                input=input,
1300                output=output,
1301                metadata=metadata,
1302                version=version,
1303                level=level,
1304                status_message=status_message,
1305                completion_start_time=completion_start_time,
1306                model=model,
1307                model_parameters=model_parameters,
1308                usage_details=usage_details,
1309                cost_details=cost_details,
1310                prompt=prompt,
1311            )

Update the current active generation span with new information.

This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.

Arguments:
  • name: The generation name
  • input: Updated input data for the model
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
def update_current_span( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> None:
1313    def update_current_span(
1314        self,
1315        *,
1316        name: Optional[str] = None,
1317        input: Optional[Any] = None,
1318        output: Optional[Any] = None,
1319        metadata: Optional[Any] = None,
1320        version: Optional[str] = None,
1321        level: Optional[SpanLevel] = None,
1322        status_message: Optional[str] = None,
1323    ) -> None:
1324        """Update the current active span with new information.
1325
1326        This method updates the current span in the active context with
1327        additional information. It's useful for adding outputs or metadata
1328        that become available during execution.
1329
1330        Args:
1331            name: The span name
1332            input: Updated input data for the operation
1333            output: Output data from the operation
1334            metadata: Additional metadata to associate with the span
1335            version: Version identifier for the code or component
1336            level: Importance level of the span (info, warning, error)
1337            status_message: Optional status message for the span
1338
1339        Example:
1340            ```python
1341            with langfuse.start_as_current_observation(name="process-data") as span:
1342                # Initial processing
1343                result = process_first_part()
1344
1345                # Update with intermediate results
1346                langfuse.update_current_span(metadata={"intermediate_result": result})
1347
1348                # Continue processing
1349                final_result = process_second_part(result)
1350
1351                # Final update
1352                langfuse.update_current_span(output=final_result)
1353            ```
1354        """
1355        if not self._tracing_enabled:
1356            langfuse_logger.debug(
1357                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1358            )
1359            return
1360
1361        current_otel_span = self._get_current_otel_span()
1362
1363        if current_otel_span is not None:
1364            span = LangfuseSpan(
1365                otel_span=current_otel_span,
1366                langfuse_client=self,
1367                environment=self._environment,
1368                release=self._release,
1369            )
1370
1371            if name:
1372                current_otel_span.update_name(name)
1373
1374            span.update(
1375                input=input,
1376                output=output,
1377                metadata=metadata,
1378                version=version,
1379                level=level,
1380                status_message=status_message,
1381            )

Update the current active span with new information.

This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.

Arguments:
  • name: The span name
  • input: Updated input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the span
Example:
with langfuse.start_as_current_observation(name="process-data") as span:
    # Initial processing
    result = process_first_part()

    # Update with intermediate results
    langfuse.update_current_span(metadata={"intermediate_result": result})

    # Continue processing
    final_result = process_second_part(result)

    # Final update
    langfuse.update_current_span(output=final_result)
@deprecated('Trace-level input/output is deprecated. For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. This method will be removed in a future major version.')
def set_current_trace_io( self, *, input: Optional[Any] = None, output: Optional[Any] = None) -> None:
1383    @deprecated(
1384        "Trace-level input/output is deprecated. "
1385        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
1386        "This method will be removed in a future major version."
1387    )
1388    def set_current_trace_io(
1389        self,
1390        *,
1391        input: Optional[Any] = None,
1392        output: Optional[Any] = None,
1393    ) -> None:
1394        """Set trace-level input and output for the current span's trace.
1395
1396        .. deprecated::
1397            This is a legacy method for backward compatibility with Langfuse platform
1398            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
1399            evaluators). It will be removed in a future major version.
1400
1401            For setting other trace attributes (user_id, session_id, metadata, tags, version),
1402            use :meth:`propagate_attributes` instead.
1403
1404        Args:
1405            input: Input data to associate with the trace.
1406            output: Output data to associate with the trace.
1407        """
1408        if not self._tracing_enabled:
1409            langfuse_logger.debug(
1410                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
1411            )
1412            return
1413
1414        current_otel_span = self._get_current_otel_span()
1415
1416        if current_otel_span is not None and current_otel_span.is_recording():
1417            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1418                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1419            )
1420            # We need to preserve the class to keep the correct observation type
1421            span_class = self._get_span_class(existing_observation_type)
1422            span = span_class(
1423                otel_span=current_otel_span,
1424                langfuse_client=self,
1425                environment=self._environment,
1426                release=self._release,
1427            )
1428
1429            span.set_trace_io(
1430                input=input,
1431                output=output,
1432            )

Set trace-level input and output for the current span's trace.

Deprecated: This is a legacy method for backward compatibility with Langfuse platform features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future major version.

For setting other trace attributes (user_id, session_id, metadata, tags, version), use propagate_attributes() instead.

Arguments:
  • input: Input data to associate with the trace.
  • output: Output data to associate with the trace.
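
Example (a minimal sketch; this method remains only for legacy compatibility):

with langfuse.start_as_current_observation(name="answer-query") as span:
    answer = generate_answer()  # hypothetical helper
    langfuse.set_current_trace_io(
        input={"question": "What is Langfuse?"},
        output={"answer": answer},
    )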
def set_current_trace_as_public(self) -> None:
1434    def set_current_trace_as_public(self) -> None:
1435        """Make the current trace publicly accessible via its URL.
1436
1437        When a trace is published, anyone with the trace link can view the full trace
1438        without needing to be logged in to Langfuse. This action cannot be undone
1439        programmatically - once published, the entire trace becomes public.
1440
1441        This is a convenience method that publishes the trace from the currently
1442        active span context. Use this when you want to make a trace public from
1443        within a traced function without needing direct access to the span object.
1444        """
1445        if not self._tracing_enabled:
1446            langfuse_logger.debug(
1447                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
1448            )
1449            return
1450
1451        current_otel_span = self._get_current_otel_span()
1452
1453        if current_otel_span is not None and current_otel_span.is_recording():
1454            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1455                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1456            )
1457            # We need to preserve the class to keep the correct observation type
1458            span_class = self._get_span_class(existing_observation_type)
1459            span = span_class(
1460                otel_span=current_otel_span,
1461                langfuse_client=self,
1462                environment=self._environment,
1463            )
1464
1465            span.set_trace_as_public()

Make the current trace publicly accessible via its URL.

When a trace is published, anyone with the trace link can view the full trace without needing to be logged in to Langfuse. This action cannot be undone programmatically; once published, the entire trace becomes public.

This is a convenience method that publishes the trace from the currently active span context. Use this when you want to make a trace public from within a traced function without needing direct access to the span object.
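
Example (a minimal sketch):

with langfuse.start_as_current_observation(name="handle-support-ticket") as span:
    ...  # handle the request
    langfuse.set_current_trace_as_public()  # anyone with the trace URL can now view it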

def create_event( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseEvent:
1467    def create_event(
1468        self,
1469        *,
1470        trace_context: Optional[TraceContext] = None,
1471        name: str,
1472        input: Optional[Any] = None,
1473        output: Optional[Any] = None,
1474        metadata: Optional[Any] = None,
1475        version: Optional[str] = None,
1476        level: Optional[SpanLevel] = None,
1477        status_message: Optional[str] = None,
1478    ) -> LangfuseEvent:
1479        """Create a new Langfuse observation of type 'EVENT'.
1480
1481        The created Langfuse Event observation will be the child of the current span in the context.
1482
1483        Args:
1484            trace_context: Optional context for connecting to an existing trace
1485            name: Name of the span (e.g., function or operation name)
1486            input: Input data for the operation (can be any JSON-serializable object)
1487            output: Output data from the operation (can be any JSON-serializable object)
1488            metadata: Additional metadata to associate with the span
1489            version: Version identifier for the code or component
1490            level: Importance level of the span (info, warning, error)
1491            status_message: Optional status message for the span
1492
1493        Returns:
1494            The Langfuse Event object
1495
1496        Example:
1497            ```python
1498            event = langfuse.create_event(name="process-event")
1499            ```
1500        """
1501        timestamp = time_ns()
1502
1503        if trace_context:
1504            trace_id = trace_context.get("trace_id", None)
1505            parent_span_id = trace_context.get("parent_span_id", None)
1506
1507            if trace_id:
1508                remote_parent_span = self._create_remote_parent_span(
1509                    trace_id=trace_id, parent_span_id=parent_span_id
1510                )
1511
1512                with otel_trace_api.use_span(
1513                    cast(otel_trace_api.Span, remote_parent_span)
1514                ):
1515                    otel_span = self._otel_tracer.start_span(
1516                        name=name, start_time=timestamp
1517                    )
1518                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1519
1520                    return cast(
1521                        LangfuseEvent,
1522                        LangfuseEvent(
1523                            otel_span=otel_span,
1524                            langfuse_client=self,
1525                            environment=self._environment,
1526                            release=self._release,
1527                            input=input,
1528                            output=output,
1529                            metadata=metadata,
1530                            version=version,
1531                            level=level,
1532                            status_message=status_message,
1533                        ).end(end_time=timestamp),
1534                    )
1535
1536        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1537
1538        return cast(
1539            LangfuseEvent,
1540            LangfuseEvent(
1541                otel_span=otel_span,
1542                langfuse_client=self,
1543                environment=self._environment,
1544                release=self._release,
1545                input=input,
1546                output=output,
1547                metadata=metadata,
1548                version=version,
1549                level=level,
1550                status_message=status_message,
1551            ).end(end_time=timestamp),
1552        )

Create a new Langfuse observation of type 'EVENT'.

The created Langfuse Event observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the event (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the event
  • version: Version identifier for the code or component
  • level: Importance level of the event (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the event
Returns:

The Langfuse Event object

Example:
event = langfuse.create_event(name="process-event")
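
A sketch of attaching the event to an existing trace (the trace ID is a hypothetical 32-hex-character placeholder):

event = langfuse.create_event(
    name="cache-hit",
    trace_context={"trace_id": "abcdef1234567890abcdef1234567890"},
    metadata={"cache_key": "user:123"},
)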
@staticmethod
def create_trace_id(*, seed: Optional[str] = None) -> str:
1641    @staticmethod
1642    def create_trace_id(*, seed: Optional[str] = None) -> str:
1643        """Create a unique trace ID for use with Langfuse.
1644
1645        This method generates a unique trace ID for use with various Langfuse APIs.
1646        It can either generate a random ID or create a deterministic ID based on
1647        a seed string.
1648
1649        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1650        This method ensures the generated ID meets this requirement. If you need to
1651        correlate an external ID with a Langfuse trace ID, use the external ID as the
1652        seed to get a valid, deterministic Langfuse trace ID.
1653
1654        Args:
1655            seed: Optional string to use as a seed for deterministic ID generation.
1656                 If provided, the same seed will always produce the same ID.
1657                 If not provided, a random ID will be generated.
1658
1659        Returns:
1660            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1661
1662        Example:
1663            ```python
1664            # Generate a random trace ID
1665            trace_id = langfuse.create_trace_id()
1666
1667            # Generate a deterministic ID based on a seed
1668            session_trace_id = langfuse.create_trace_id(seed="session-456")
1669
1670            # Correlate an external ID with a Langfuse trace ID
1671            external_id = "external-system-123456"
1672            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1673
1674            # Use the ID with trace context
1675            with langfuse.start_as_current_observation(
1676                name="process-request",
1677                trace_context={"trace_id": trace_id}
1678            ) as span:
1679                # Operation will be part of the specific trace
1680                pass
1681            ```
1682        """
1683        if not seed:
1684            trace_id_int = RandomIdGenerator().generate_trace_id()
1685
1686            return Langfuse._format_otel_trace_id(trace_id_int)
1687
1688        return sha256(seed.encode("utf-8")).digest()[:16].hex()

Create a unique trace ID for use with Langfuse.

This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.

Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.

Arguments:
  • seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:

A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

Example:
# Generate a random trace ID
trace_id = langfuse.create_trace_id()

# Generate a deterministic ID based on a seed
session_trace_id = langfuse.create_trace_id(seed="session-456")

# Correlate an external ID with a Langfuse trace ID
external_id = "external-system-123456"
correlated_trace_id = langfuse.create_trace_id(seed=external_id)

# Use the ID with trace context
with langfuse.start_as_current_observation(
    name="process-request",
    trace_context={"trace_id": trace_id}
) as span:
    # Operation will be part of the specific trace
    pass
def create_score( self, *, name: str, value: Union[float, str], session_id: Optional[str] = None, dataset_run_id: Optional[str] = None, trace_id: Optional[str] = None, observation_id: Optional[str] = None, score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None, timestamp: Optional[datetime.datetime] = None) -> None:
1766    def create_score(
1767        self,
1768        *,
1769        name: str,
1770        value: Union[float, str],
1771        session_id: Optional[str] = None,
1772        dataset_run_id: Optional[str] = None,
1773        trace_id: Optional[str] = None,
1774        observation_id: Optional[str] = None,
1775        score_id: Optional[str] = None,
1776        data_type: Optional[ScoreDataType] = None,
1777        comment: Optional[str] = None,
1778        config_id: Optional[str] = None,
1779        metadata: Optional[Any] = None,
1780        timestamp: Optional[datetime] = None,
1781    ) -> None:
1782        """Create a score for a specific trace or observation.
1783
1784        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
1785        used to track quality metrics, user feedback, or automated evaluations.
1786
1787        Args:
1788            name: Name of the score (e.g., "relevance", "accuracy")
1789            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
1790            session_id: ID of the Langfuse session to associate the score with
1791            dataset_run_id: ID of the Langfuse dataset run to associate the score with
1792            trace_id: ID of the Langfuse trace to associate the score with
1793            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
1794            score_id: Optional custom ID for the score (auto-generated if not provided)
1795            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
1796            comment: Optional comment or explanation for the score
1797            config_id: Optional ID of a score config defined in Langfuse
1798            metadata: Optional metadata to be attached to the score
1799            timestamp: Optional timestamp for the score (defaults to current UTC time)
1800
1801        Example:
1802            ```python
1803            # Create a numeric score for accuracy
1804            langfuse.create_score(
1805                name="accuracy",
1806                value=0.92,
1807                trace_id="abcdef1234567890abcdef1234567890",
1808                data_type="NUMERIC",
1809                comment="High accuracy with minor irrelevant details"
1810            )
1811
1812            # Create a categorical score for sentiment
1813            langfuse.create_score(
1814                name="sentiment",
1815                value="positive",
1816                trace_id="abcdef1234567890abcdef1234567890",
1817                observation_id="abcdef1234567890",
1818                data_type="CATEGORICAL"
1819            )
1820            ```
1821        """
1822        if not self._tracing_enabled:
1823            return
1824
1825        score_id = score_id or self._create_observation_id()
1826
1827        try:
1828            new_body = ScoreBody(
1829                id=score_id,
1830                sessionId=session_id,
1831                datasetRunId=dataset_run_id,
1832                traceId=trace_id,
1833                observationId=observation_id,
1834                name=name,
1835                value=value,
1836                dataType=data_type,  # type: ignore
1837                comment=comment,
1838                configId=config_id,
1839                environment=self._environment,
1840                metadata=metadata,
1841            )
1842
1843            event = {
1844                "id": self.create_trace_id(),
1845                "type": "score-create",
1846                "timestamp": timestamp or _get_timestamp(),
1847                "body": new_body,
1848            }
1849
1850            if self._resources is not None:
1851                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
1852                force_sample = (
1853                    not self._is_valid_trace_id(trace_id) if trace_id else True
1854                )
1855
1856                self._resources.add_score_task(
1857                    event,
1858                    force_sample=force_sample,
1859                )
1860
1861        except Exception as e:
1862            langfuse_logger.exception(
1863                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
1864            )

Create a score for a specific trace or observation.

This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
  • session_id: ID of the Langfuse session to associate the score with
  • dataset_run_id: ID of the Langfuse dataset run to associate the score with
  • trace_id: ID of the Langfuse trace to associate the score with
  • observation_id: Optional ID of the specific observation to score. The trace_id must also be provided.
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
  • timestamp: Optional timestamp for the score (defaults to current UTC time)
Example:
# Create a numeric score for accuracy
langfuse.create_score(
    name="accuracy",
    value=0.92,
    trace_id="abcdef1234567890abcdef1234567890",
    data_type="NUMERIC",
    comment="High accuracy with minor irrelevant details"
)

# Create a categorical score for sentiment
langfuse.create_score(
    name="sentiment",
    value="positive",
    trace_id="abcdef1234567890abcdef1234567890",
    observation_id="abcdef1234567890",
    data_type="CATEGORICAL"
)
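
# Scores can also target a session instead of a trace. A minimal sketch;
# the session ID and score name are illustrative assumptions.
langfuse.create_score(
    name="escalated_to_human",
    value=1,
    session_id="session-456",
    data_type="BOOLEAN"
)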
def score_current_span( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
1925    def score_current_span(
1926        self,
1927        *,
1928        name: str,
1929        value: Union[float, str],
1930        score_id: Optional[str] = None,
1931        data_type: Optional[ScoreDataType] = None,
1932        comment: Optional[str] = None,
1933        config_id: Optional[str] = None,
1934        metadata: Optional[Any] = None,
1935    ) -> None:
1936        """Create a score for the current active span.
1937
1938        This method scores the currently active span in the context. It's a convenient
1939        way to score the current operation without needing to know its trace and span IDs.
1940
1941        Args:
1942            name: Name of the score (e.g., "relevance", "accuracy")
1943            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
1944            score_id: Optional custom ID for the score (auto-generated if not provided)
1945            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
1946            comment: Optional comment or explanation for the score
1947            config_id: Optional ID of a score config defined in Langfuse
1948            metadata: Optional metadata to be attached to the score
1949
1950        Example:
1951            ```python
1952            with langfuse.start_as_current_generation(name="answer-query") as generation:
1953                # Generate answer
1954                response = generate_answer(...)
1955                generation.update(output=response)
1956
1957                # Score the generation
1958                langfuse.score_current_span(
1959                    name="relevance",
1960                    value=0.85,
1961                    data_type="NUMERIC",
1962                    comment="Mostly relevant but contains some tangential information",
1963                    metadata={"model": "gpt-4", "prompt_version": "v2"}
1964                )
1965            ```
1966        """
1967        current_span = self._get_current_otel_span()
1968
1969        if current_span is not None:
1970            trace_id = self._get_otel_trace_id(current_span)
1971            observation_id = self._get_otel_span_id(current_span)
1972
1973            langfuse_logger.info(
1974                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
1975            )
1976
1977            self.create_score(
1978                trace_id=trace_id,
1979                observation_id=observation_id,
1980                name=name,
1981                value=cast(str, value),
1982                score_id=score_id,
1983                data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
1984                comment=comment,
1985                config_id=config_id,
1986                metadata=metadata,
1987            )

Create a score for the current active span.

This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Generate answer
    response = generate_answer(...)
    generation.update(output=response)

    # Score the generation
    langfuse.score_current_span(
        name="relevance",
        value=0.85,
        data_type="NUMERIC",
        comment="Mostly relevant but contains some tangential information",
        metadata={"model": "gpt-4", "prompt_version": "v2"}
    )
def score_current_trace( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
2015    def score_current_trace(
2016        self,
2017        *,
2018        name: str,
2019        value: Union[float, str],
2020        score_id: Optional[str] = None,
2021        data_type: Optional[ScoreDataType] = None,
2022        comment: Optional[str] = None,
2023        config_id: Optional[str] = None,
2024        metadata: Optional[Any] = None,
2025    ) -> None:
2026        """Create a score for the current trace.
2027
2028        This method scores the trace of the currently active span. Unlike score_current_span,
2029        this method associates the score with the entire trace rather than a specific span.
2030        It's useful for scoring overall performance or quality of the entire operation.
2031
2032        Args:
2033            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2034            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
2035            score_id: Optional custom ID for the score (auto-generated if not provided)
2036            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
2037            comment: Optional comment or explanation for the score
2038            config_id: Optional ID of a score config defined in Langfuse
2039            metadata: Optional metadata to be attached to the score
2040
2041        Example:
2042            ```python
2043            with langfuse.start_as_current_observation(name="process-user-request") as span:
2044                # Process request
2045                result = process_complete_request()
2046                span.update(output=result)
2047
2048                # Score the overall trace
2049                langfuse.score_current_trace(
2050                    name="overall_quality",
2051                    value=0.95,
2052                    data_type="NUMERIC",
2053                    comment="High quality end-to-end response",
2054                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2055                )
2056            ```
2057        """
2058        current_span = self._get_current_otel_span()
2059
2060        if current_span is not None:
2061            trace_id = self._get_otel_trace_id(current_span)
2062
2063            langfuse_logger.info(
2064                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2065            )
2066
2067            self.create_score(
2068                trace_id=trace_id,
2069                name=name,
2070                value=cast(str, value),
2071                score_id=score_id,
2072                data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
2073                comment=comment,
2074                config_id=config_id,
2075                metadata=metadata,
2076            )

Create a score for the current trace.

This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation.

Arguments:
  • name: Name of the score (e.g., "user_satisfaction", "overall_quality")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_observation(name="process-user-request") as span:
    # Process request
    result = process_complete_request()
    span.update(output=result)

    # Score the overall trace
    langfuse.score_current_trace(
        name="overall_quality",
        value=0.95,
        data_type="NUMERIC",
        comment="High quality end-to-end response",
        metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
    )
def flush(self) -> None:
2078    def flush(self) -> None:
2079        """Force flush all pending spans and events to the Langfuse API.
2080
2081        This method manually flushes any pending spans, scores, and other events to the
2082        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2083        before proceeding, without waiting for the automatic flush interval.
2084
2085        Example:
2086            ```python
2087            # Record some spans and scores
2088            with langfuse.start_as_current_observation(name="operation") as span:
2089                # Do work...
2090                pass
2091
2092            # Ensure all data is sent to Langfuse before proceeding
2093            langfuse.flush()
2094
2095            # Continue with other work
2096            ```
2097        """
2098        if self._resources is not None:
2099            self._resources.flush()

Force flush all pending spans and events to the Langfuse API.

This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.

Example:
# Record some spans and scores
with langfuse.start_as_current_observation(name="operation") as span:
    # Do work...
    pass

# Ensure all data is sent to Langfuse before proceeding
langfuse.flush()

# Continue with other work
def shutdown(self) -> None:
2101    def shutdown(self) -> None:
2102        """Shut down the Langfuse client and flush all pending data.
2103
2104        This method cleanly shuts down the Langfuse client, ensuring all pending data
2105        is flushed to the API and all background threads are properly terminated.
2106
2107        It's important to call this method when your application is shutting down to
2108        prevent data loss and resource leaks. For most applications, using the client
2109        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2110
2111        Example:
2112            ```python
2113            # Initialize Langfuse
2114            langfuse = Langfuse(public_key="...", secret_key="...")
2115
2116            # Use Langfuse throughout your application
2117            # ...
2118
2119            # When application is shutting down
2120            langfuse.shutdown()
2121            ```
2122        """
2123        if self._resources is not None:
2124            self._resources.shutdown()

Shut down the Langfuse client and flush all pending data.

This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.

It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.

Example:
# Initialize Langfuse
langfuse = Langfuse(public_key="...", secret_key="...")

# Use Langfuse throughout your application
# ...

# When application is shutting down
langfuse.shutdown()
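
# As an alternative to an explicit shutdown() call, a sketch of the
# context-manager usage mentioned above: pending data is flushed and
# background threads are stopped when the block exits.
with Langfuse(public_key="...", secret_key="...") as client:
    pass  # use client as usual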
def get_current_trace_id(self) -> Optional[str]:
2126    def get_current_trace_id(self) -> Optional[str]:
2127        """Get the trace ID of the current active span.
2128
2129        This method retrieves the trace ID from the currently active span in the context.
2130        It can be used to get the trace ID for referencing in logs, external systems,
2131        or for creating related operations.
2132
2133        Returns:
2134            The current trace ID as a 32-character lowercase hexadecimal string,
2135            or None if there is no active span.
2136
2137        Example:
2138            ```python
2139            with langfuse.start_as_current_observation(name="process-request") as span:
2140                # Get the current trace ID for reference
2141                trace_id = langfuse.get_current_trace_id()
2142
2143                # Use it for external correlation
2144                log.info(f"Processing request with trace_id: {trace_id}")
2145
2146                # Or pass to another system
2147                external_system.process(data, trace_id=trace_id)
2148            ```
2149        """
2150        if not self._tracing_enabled:
2151            langfuse_logger.debug(
2152                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2153            )
2154            return None
2155
2156        current_otel_span = self._get_current_otel_span()
2157
2158        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None

Get the trace ID of the current active span.

This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.

Returns:

The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_observation(name="process-request") as span:
    # Get the current trace ID for reference
    trace_id = langfuse.get_current_trace_id()

    # Use it for external correlation
    log.info(f"Processing request with trace_id: {trace_id}")

    # Or pass to another system
    external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
2160    def get_current_observation_id(self) -> Optional[str]:
2161        """Get the observation ID (span ID) of the current active span.
2162
2163        This method retrieves the observation ID from the currently active span in the context.
2164        It can be used to get the observation ID for referencing in logs, external systems,
2165        or for creating scores or other related operations.
2166
2167        Returns:
2168            The current observation ID as a 16-character lowercase hexadecimal string,
2169            or None if there is no active span.
2170
2171        Example:
2172            ```python
2173            with langfuse.start_as_current_observation(name="process-user-query") as span:
2174                # Get the current observation ID
2175                observation_id = langfuse.get_current_observation_id()
2176
2177                # Store it for later reference
2178                cache.set(f"query_{query_id}_observation", observation_id)
2179
2180                # Process the query...
2181            ```
2182        """
2183        if not self._tracing_enabled:
2184            langfuse_logger.debug(
2185                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2186            )
2187            return None
2188
2189        current_otel_span = self._get_current_otel_span()
2190
2191        return self._get_otel_span_id(current_otel_span) if current_otel_span else None

Get the observation ID (span ID) of the current active span.

This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.

Returns:

The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_observation(name="process-user-query") as span:
    # Get the current observation ID
    observation_id = langfuse.get_current_observation_id()

    # Store it for later reference
    cache.set(f"query_{query_id}_observation", observation_id)

    # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2204    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2205        """Get the URL to view a trace in the Langfuse UI.
2206
2207        This method generates a URL that links directly to a trace in the Langfuse UI.
2208        It's useful for providing links in logs, notifications, or debugging tools.
2209
2210        Args:
2211            trace_id: Optional trace ID to generate a URL for. If not provided,
2212                     the trace ID of the current active span will be used.
2213
2214        Returns:
2215            A URL string pointing to the trace in the Langfuse UI,
2216            or None if the project ID couldn't be retrieved or no trace ID is available.
2217
2218        Example:
2219            ```python
2220            # Get URL for the current trace
2221            with langfuse.start_as_current_observation(name="process-request") as span:
2222                trace_url = langfuse.get_trace_url()
2223                log.info(f"Processing trace: {trace_url}")
2224
2225            # Get URL for a specific trace
2226            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2227            send_notification(f"Review needed for trace: {specific_trace_url}")
2228            ```
2229        """
2230        final_trace_id = trace_id or self.get_current_trace_id()
2231        if not final_trace_id:
2232            return None
2233
2234        project_id = self._get_project_id()
2235
2236        return (
2237            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2238            if project_id and final_trace_id
2239            else None
2240        )

Get the URL to view a trace in the Langfuse UI.

This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.

Arguments:
  • trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:

A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.

Example:
# Get URL for the current trace
with langfuse.start_as_current_observation(name="process-request") as span:
    trace_url = langfuse.get_trace_url()
    log.info(f"Processing trace: {trace_url}")

# Get URL for a specific trace
specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
send_notification(f"Review needed for trace: {specific_trace_url}")
def get_dataset( self, name: str, *, fetch_items_page_size: Optional[int] = 50, version: Optional[datetime.datetime] = None) -> langfuse._client.datasets.DatasetClient:
2242    def get_dataset(
2243        self,
2244        name: str,
2245        *,
2246        fetch_items_page_size: Optional[int] = 50,
2247        version: Optional[datetime] = None,
2248    ) -> "DatasetClient":
2249        """Fetch a dataset by its name.
2250
2251        Args:
2252            name (str): The name of the dataset to fetch.
2253            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2254            version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC).
2255                If provided, returns the state of items at the specified UTC timestamp.
2256                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
2257
2258        Returns:
2259            DatasetClient: The dataset with the given name.
2260        """
2261        try:
2262            langfuse_logger.debug(f"Getting datasets {name}")
2263            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2264
2265            dataset_items = []
2266            page = 1
2267
2268            while True:
2269                new_items = self.api.dataset_items.list(
2270                    dataset_name=self._url_encode(name, is_url_param=True),
2271                    page=page,
2272                    limit=fetch_items_page_size,
2273                    version=version,
2274                )
2275                dataset_items.extend(new_items.data)
2276
2277                if new_items.meta.total_pages <= page:
2278                    break
2279
2280                page += 1
2281
2282            return DatasetClient(
2283                dataset=dataset,
2284                items=dataset_items,
2285                version=version,
2286                langfuse_client=self,
2287            )
2288
2289        except Error as e:
2290            handle_fern_exception(e)
2291            raise e

Fetch a dataset by its name.

Arguments:
  • name (str): The name of the dataset to fetch.
  • fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
  • version (Optional[datetime]): Retrieve dataset items as they existed at a specific point in time. Must be a timezone-aware datetime in UTC. If not provided, the latest version of each item is returned.
Returns:

DatasetClient: The dataset with the given name.
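
Example:

A minimal sketch of fetching a dataset and iterating its items; the dataset name "qa-eval" is an illustrative assumption, and items are DatasetItem objects as described under run_experiment.

dataset = langfuse.get_dataset("qa-eval")

for item in dataset.items:
    print(item.input, item.expected_output)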

def get_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DatasetRunWithItems:
2293    def get_dataset_run(
2294        self, *, dataset_name: str, run_name: str
2295    ) -> DatasetRunWithItems:
2296        """Fetch a dataset run by dataset name and run name.
2297
2298        Args:
2299            dataset_name (str): The name of the dataset.
2300            run_name (str): The name of the run.
2301
2302        Returns:
2303            DatasetRunWithItems: The dataset run with its items.
2304        """
2305        try:
2306            return cast(
2307                DatasetRunWithItems,
2308                self.api.datasets.get_run(
2309                    dataset_name=self._url_encode(dataset_name),
2310                    run_name=self._url_encode(run_name),
2311                    request_options=None,
2312                ),
2313            )
2314        except Error as e:
2315            handle_fern_exception(e)
2316            raise e

Fetch a dataset run by dataset name and run name.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DatasetRunWithItems: The dataset run with its items.
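
Example:

A minimal sketch; the dataset and run names, as well as the fields read from the result, are illustrative assumptions.

run = langfuse.get_dataset_run(
    dataset_name="qa-eval",
    run_name="gpt-4-baseline"
)
print(run.name, len(run.dataset_run_items))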

def get_dataset_runs( self, *, dataset_name: str, page: Optional[int] = None, limit: Optional[int] = None) -> langfuse.api.PaginatedDatasetRuns:
2318    def get_dataset_runs(
2319        self,
2320        *,
2321        dataset_name: str,
2322        page: Optional[int] = None,
2323        limit: Optional[int] = None,
2324    ) -> PaginatedDatasetRuns:
2325        """Fetch all runs for a dataset.
2326
2327        Args:
2328            dataset_name (str): The name of the dataset.
2329            page (Optional[int]): Page number, starts at 1.
2330            limit (Optional[int]): Limit of items per page.
2331
2332        Returns:
2333            PaginatedDatasetRuns: Paginated list of dataset runs.
2334        """
2335        try:
2336            return cast(
2337                PaginatedDatasetRuns,
2338                self.api.datasets.get_runs(
2339                    dataset_name=self._url_encode(dataset_name),
2340                    page=page,
2341                    limit=limit,
2342                    request_options=None,
2343                ),
2344            )
2345        except Error as e:
2346            handle_fern_exception(e)
2347            raise e

Fetch all runs for a dataset.

Arguments:
  • dataset_name (str): The name of the dataset.
  • page (Optional[int]): Page number, starts at 1.
  • limit (Optional[int]): Limit of items per page.
Returns:

PaginatedDatasetRuns: Paginated list of dataset runs.
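
Example:

A minimal sketch that pages through all runs of a dataset, assuming the paginated response exposes data and meta.total_pages like the pagination shape used in get_dataset above; the dataset name is illustrative.

page = 1
while True:
    runs = langfuse.get_dataset_runs(dataset_name="qa-eval", page=page, limit=50)
    for run in runs.data:
        print(run.name)
    if runs.meta.total_pages <= page:
        break
    page += 1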

def delete_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DeleteDatasetRunResponse:
2349    def delete_dataset_run(
2350        self, *, dataset_name: str, run_name: str
2351    ) -> DeleteDatasetRunResponse:
2352        """Delete a dataset run and all its run items. This action is irreversible.
2353
2354        Args:
2355            dataset_name (str): The name of the dataset.
2356            run_name (str): The name of the run.
2357
2358        Returns:
2359            DeleteDatasetRunResponse: Confirmation of deletion.
2360        """
2361        try:
2362            return cast(
2363                DeleteDatasetRunResponse,
2364                self.api.datasets.delete_run(
2365                    dataset_name=self._url_encode(dataset_name),
2366                    run_name=self._url_encode(run_name),
2367                    request_options=None,
2368                ),
2369            )
2370        except Error as e:
2371            handle_fern_exception(e)
2372            raise e

Delete a dataset run and all its run items. This action is irreversible.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DeleteDatasetRunResponse: Confirmation of deletion.
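
Example:

A minimal sketch; names are illustrative. The deletion is irreversible, so verify the run name before calling.

langfuse.delete_dataset_run(
    dataset_name="qa-eval",
    run_name="stale-run-to-remove"
)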

def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem]], task: langfuse.experiment.TaskFunction, evaluators: List[langfuse.experiment.EvaluatorFunction] = [], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, run_evaluators: List[langfuse.experiment.RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None, _dataset_version: Optional[datetime.datetime] = None) -> langfuse.experiment.ExperimentResult:
2374    def run_experiment(
2375        self,
2376        *,
2377        name: str,
2378        run_name: Optional[str] = None,
2379        description: Optional[str] = None,
2380        data: ExperimentData,
2381        task: TaskFunction,
2382        evaluators: List[EvaluatorFunction] = [],
2383        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2384        run_evaluators: List[RunEvaluatorFunction] = [],
2385        max_concurrency: int = 50,
2386        metadata: Optional[Dict[str, str]] = None,
2387        _dataset_version: Optional[datetime] = None,
2388    ) -> ExperimentResult:
2389        """Run an experiment on a dataset with automatic tracing and evaluation.
2390
2391        This method executes a task function on each item in the provided dataset,
2392        automatically traces all executions with Langfuse for observability, runs
2393        item-level and run-level evaluators on the outputs, and returns comprehensive
2394        results with evaluation metrics.
2395
2396        The experiment system provides:
2397        - Automatic tracing of all task executions
2398        - Concurrent processing with configurable limits
2399        - Comprehensive error handling that isolates failures
2400        - Integration with Langfuse datasets for experiment tracking
2401        - Flexible evaluation framework supporting both sync and async evaluators
2402
2403        Args:
2404            name: Human-readable name for the experiment. Used for identification
2405                in the Langfuse UI.
2406            run_name: Optional exact name for the experiment run. If provided, this will be
2407                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2408                If not provided, this will default to the experiment name appended with an ISO timestamp.
2409            description: Optional description explaining the experiment's purpose,
2410                methodology, or expected outcomes.
2411            data: Array of data items to process. Can be either:
2412                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2413                - List of Langfuse DatasetItem objects from dataset.items
2414            task: Function that processes each data item and returns output.
2415                Must accept 'item' as keyword argument and can return sync or async results.
2416                The task function signature should be: task(*, item, **kwargs) -> Any
2417            evaluators: List of functions to evaluate each item's output individually.
2418                Each evaluator receives input, output, expected_output, and metadata.
2419                Can return single Evaluation dict or list of Evaluation dicts.
2420            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2421                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2422                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2423                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2424            run_evaluators: List of functions to evaluate the entire experiment run.
2425                Each run evaluator receives all item_results and can compute aggregate metrics.
2426                Useful for calculating averages, distributions, or cross-item comparisons.
2427            max_concurrency: Maximum number of concurrent task executions (default: 50).
2428                Controls the number of items processed simultaneously. Adjust based on
2429                API rate limits and system resources.
2430            metadata: Optional metadata dictionary to attach to all experiment traces.
2431                This metadata will be included in every trace created during the experiment.
2432                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2433
2434        Returns:
2435            ExperimentResult containing:
2436            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2437            - item_results: List of results for each processed item with outputs and evaluations
2438            - run_evaluations: List of aggregate evaluation results for the entire run
2439            - experiment_id: Stable identifier for the experiment run across all items
2440            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2441            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2442
2443        Raises:
2444            ValueError: If required parameters are missing or invalid
2445            Exception: If experiment setup fails (individual item failures are handled gracefully)
2446
2447        Examples:
2448            Basic experiment with local data:
2449            ```python
2450            def summarize_text(*, item, **kwargs):
2451                return f"Summary: {item['input'][:50]}..."
2452
2453            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2454                return {
2455                    "name": "output_length",
2456                    "value": len(output),
2457                    "comment": f"Output contains {len(output)} characters"
2458                }
2459
2460            result = langfuse.run_experiment(
2461                name="Text Summarization Test",
2462                description="Evaluate summarization quality and length",
2463                data=[
2464                    {"input": "Long article text...", "expected_output": "Expected summary"},
2465                    {"input": "Another article...", "expected_output": "Another summary"}
2466                ],
2467                task=summarize_text,
2468                evaluators=[length_evaluator]
2469            )
2470
2471            print(f"Processed {len(result.item_results)} items")
2472            for item_result in result.item_results:
2473                print(f"Input: {item_result.item['input']}")
2474                print(f"Output: {item_result.output}")
2475                print(f"Evaluations: {item_result.evaluations}")
2476            ```
2477
2478            Advanced experiment with async task and multiple evaluators:
2479            ```python
2480            async def llm_task(*, item, **kwargs):
2481                # Simulate async LLM call
2482                response = await openai_client.chat.completions.create(
2483                    model="gpt-4",
2484                    messages=[{"role": "user", "content": item["input"]}]
2485                )
2486                return response.choices[0].message.content
2487
2488            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2489                if expected_output and expected_output.lower() in output.lower():
2490                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2491                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2492
2493            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2494                # Simulate toxicity check
2495                toxicity_score = check_toxicity(output)  # Your toxicity checker
2496                return {
2497                    "name": "toxicity",
2498                    "value": toxicity_score,
2499                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2500                }
2501
2502            def average_accuracy(*, item_results, **kwargs):
2503                accuracies = [
2504                    eval.value for result in item_results
2505                    for eval in result.evaluations
2506                    if eval.name == "accuracy"
2507                ]
2508                return {
2509                    "name": "average_accuracy",
2510                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2511                    "comment": f"Average accuracy across {len(accuracies)} items"
2512                }
2513
2514            result = langfuse.run_experiment(
2515                name="LLM Safety and Accuracy Test",
2516                description="Evaluate model accuracy and safety across diverse prompts",
2517                data=test_dataset,  # Your dataset items
2518                task=llm_task,
2519                evaluators=[accuracy_evaluator, toxicity_evaluator],
2520                run_evaluators=[average_accuracy],
2521                max_concurrency=5,  # Limit concurrent API calls
2522                metadata={"model": "gpt-4", "temperature": 0.7}
2523            )
2524            ```
2525
2526            Using with Langfuse datasets:
2527            ```python
2528            # Get dataset from Langfuse
2529            dataset = langfuse.get_dataset("my-eval-dataset")
2530
2531            result = dataset.run_experiment(
2532                name="Production Model Evaluation",
2533                description="Monthly evaluation of production model performance",
2534                task=my_production_task,
2535                evaluators=[accuracy_evaluator, latency_evaluator]
2536            )
2537
2538            # Results automatically linked to dataset in Langfuse UI
2539            print(f"View results: {result['dataset_run_url']}")
2540            ```
2541
2542        Note:
2543            - Task and evaluator functions can be either synchronous or asynchronous
2544            - Individual item failures are logged but don't stop the experiment
2545            - All executions are automatically traced and visible in Langfuse UI
2546            - When using Langfuse datasets, results are automatically linked for easy comparison
2547            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2548            - Async execution is handled automatically with smart event loop detection
2549        """
2550        return cast(
2551            ExperimentResult,
2552            run_async_safely(
2553                self._run_experiment_async(
2554                    name=name,
2555                    run_name=self._create_experiment_run_name(
2556                        name=name, run_name=run_name
2557                    ),
2558                    description=description,
2559                    data=data,
2560                    task=task,
2561                    evaluators=evaluators or [],
2562                    composite_evaluator=composite_evaluator,
2563                    run_evaluators=run_evaluators or [],
2564                    max_concurrency=max_concurrency,
2565                    metadata=metadata,
2566                    dataset_version=_dataset_version,
2567                ),
2568            ),
2569        )

Run an experiment on a dataset with automatic tracing and evaluation.

This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.

The experiment system provides:

  • Automatic tracing of all task executions
  • Concurrent processing with configurable limits
  • Comprehensive error handling that isolates failures
  • Integration with Langfuse datasets for experiment tracking
  • Flexible evaluation framework supporting both sync and async evaluators
Arguments:
  • name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
  • run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the data contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
  • description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
  • data: Array of data items to process. Can be either:
    • List of dict-like items with 'input', 'expected_output', 'metadata' keys
    • List of Langfuse DatasetItem objects from dataset.items
  • task: Function that processes each data item and returns output. Must accept 'item' as a keyword argument and may be synchronous or asynchronous. The task function signature should be: task(*, item, **kwargs) -> Any
  • evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
  • composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics (see the sketch after the notes below).
  • run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
  • max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
  • metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If the data items are Langfuse dataset items, the metadata is also attached to the dataset run.
Returns:

ExperimentResult containing:

  • run_name: The experiment run name. This equals the dataset run name if the experiment ran on a Langfuse dataset.
  • item_results: List of results for each processed item with outputs and evaluations
  • run_evaluations: List of aggregate evaluation results for the entire run
  • experiment_id: Stable identifier for the experiment run across all items
  • dataset_run_id: ID of the dataset run (if using Langfuse datasets)
  • dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
  • ValueError: If required parameters are missing or invalid
  • Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:

Basic experiment with local data:

def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")

Advanced experiment with async task and multiple evaluators:

async def llm_task(*, item, **kwargs):
    # Simulate async LLM call
    response = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": item["input"]}]
    )
    return response.choices[0].message.content

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output and expected_output.lower() in output.lower():
        return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
    return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
    # Simulate toxicity check
    toxicity_score = check_toxicity(output)  # Your toxicity checker
    return {
        "name": "toxicity",
        "value": toxicity_score,
        "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
    }

def average_accuracy(*, item_results, **kwargs):
    accuracies = [
        eval.value for result in item_results
        for eval in result.evaluations
        if eval.name == "accuracy"
    ]
    return {
        "name": "average_accuracy",
        "value": sum(accuracies) / len(accuracies) if accuracies else 0,
        "comment": f"Average accuracy across {len(accuracies)} items"
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    description="Evaluate model accuracy and safety across diverse prompts",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    run_evaluators=[average_accuracy],
    max_concurrency=5,  # Limit concurrent API calls
    metadata={"model": "gpt-4", "temperature": 0.7}
)

Using with Langfuse datasets:

# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result['dataset_run_url']}")
Note:
  • Task and evaluator functions can be either synchronous or asynchronous
  • Individual item failures are logged but don't stop the experiment
  • All executions are automatically traced and visible in Langfuse UI
  • When using Langfuse datasets, results are automatically linked for easy comparison
  • This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
  • Async execution is handled automatically with smart event loop detection
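Composite evaluator sketch:

The composite_evaluator argument is not covered by the examples above. A minimal sketch under the documented signature (input, output, expected_output, metadata, plus the item-level evaluations); the weights and score names are illustrative assumptions, and the evaluations are assumed to expose name/value attributes as in the batched-evaluation examples below.

def weighted_composite(*, input, output, expected_output=None, metadata=None, evaluations, **kwargs):
    # Combine the item-level scores from the advanced example above;
    # toxicity is inverted so that higher composite values are better.
    by_name = {e.name: e.value for e in evaluations if isinstance(e.value, (int, float))}
    value = 0.7 * by_name.get("accuracy", 0.0) + 0.3 * (1.0 - by_name.get("toxicity", 0.0))
    return {"name": "weighted_quality", "value": value, "comment": "0.7*accuracy + 0.3*(1-toxicity)"}

result = langfuse.run_experiment(
    name="Weighted Quality Evaluation",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    composite_evaluator=weighted_composite,
)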
def run_batched_evaluation( self, *, scope: Literal['traces', 'observations'], mapper: MapperFunction, filter: Optional[str] = None, fetch_batch_size: int = 50, fetch_trace_fields: Optional[str] = None, max_items: Optional[int] = None, max_retries: int = 3, evaluators: List[langfuse.experiment.EvaluatorFunction], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, max_concurrency: int = 5, metadata: Optional[Dict[str, Any]] = None, _add_observation_scores_to_trace: bool = False, _additional_trace_tags: Optional[List[str]] = None, resume_from: Optional[BatchEvaluationResumeToken] = None, verbose: bool = False) -> BatchEvaluationResult:
2931    def run_batched_evaluation(
2932        self,
2933        *,
2934        scope: Literal["traces", "observations"],
2935        mapper: MapperFunction,
2936        filter: Optional[str] = None,
2937        fetch_batch_size: int = 50,
2938        fetch_trace_fields: Optional[str] = None,
2939        max_items: Optional[int] = None,
2940        max_retries: int = 3,
2941        evaluators: List[EvaluatorFunction],
2942        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2943        max_concurrency: int = 5,
2944        metadata: Optional[Dict[str, Any]] = None,
2945        _add_observation_scores_to_trace: bool = False,
2946        _additional_trace_tags: Optional[List[str]] = None,
2947        resume_from: Optional[BatchEvaluationResumeToken] = None,
2948        verbose: bool = False,
2949    ) -> BatchEvaluationResult:
2950        """Fetch traces or observations and run evaluations on each item.
2951
2952        This method provides a powerful way to evaluate existing data in Langfuse at scale.
2953        It fetches items based on filters, transforms them using a mapper function, runs
2954        evaluators on each item, and creates scores that are linked back to the original
2955        entities. This is ideal for:
2956
2957        - Running evaluations on production traces after deployment
2958        - Backtesting new evaluation metrics on historical data
2959        - Batch scoring of observations for quality monitoring
2960        - Periodic evaluation runs on recent data
2961
2962        The method uses a streaming/pipeline approach to process items in batches, making
2963        it memory-efficient for large datasets. It includes comprehensive error handling,
2964        retry logic, and resume capability for long-running evaluations.
2965
2966        Args:
2967            scope: The type of items to evaluate. Must be one of:
2968                - "traces": Evaluate complete traces with all their observations
2969                - "observations": Evaluate individual observations (spans, generations, events)
2970            mapper: Function that transforms API response objects into evaluator inputs.
2971                Receives a trace/observation object and returns an EvaluatorInputs
2972                instance with input, output, expected_output, and metadata fields.
2973                Can be sync or async.
2974            evaluators: List of evaluation functions to run on each item. Each evaluator
2975                receives the mapped inputs and returns Evaluation object(s). Evaluator
2976                failures are logged but don't stop the batch evaluation.
2977            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
2978                - '{"tags": ["production"]}'
2979                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
2980                Default: None (fetches all items).
2981            fetch_batch_size: Number of items to fetch per API call and hold in memory.
2982                Larger values may be faster but use more memory. Default: 50.
2983            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
2984            max_items: Maximum total number of items to process. If None, processes all
2985                items matching the filter. Useful for testing or limiting evaluation runs.
2986                Default: None (process all).
2987            max_concurrency: Maximum number of items to evaluate concurrently. Controls
2988                parallelism and resource usage. Default: 5.
2989            composite_evaluator: Optional function that creates a composite score from
2990                item-level evaluations. Receives the original item and its evaluations,
2991                returns a single Evaluation. Useful for weighted averages or combined metrics.
2992                Default: None.
2993            metadata: Optional metadata dict to add to all created scores. Useful for
2994                tracking evaluation runs, versions, or other context. Default: None.
2995            max_retries: Maximum number of retry attempts for failed batch fetches.
2996                Uses exponential backoff (1s, 2s, 4s). Default: 3.
2997            verbose: If True, logs progress information to console. Useful for monitoring
2998                long-running evaluations. Default: False.
2999            resume_from: Optional resume token from a previous incomplete run. Allows
3000                continuing evaluation after interruption or failure. Default: None.
3001
3002
3003        Returns:
3004            BatchEvaluationResult containing:
3005                - total_items_fetched: Number of items fetched from API
3006                - total_items_processed: Number of items successfully evaluated
3007                - total_items_failed: Number of items that failed evaluation
3008                - total_scores_created: Scores created by item-level evaluators
3009                - total_composite_scores_created: Scores created by composite evaluator
3010                - total_evaluations_failed: Individual evaluator failures
3011                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3012                - resume_token: Token for resuming if incomplete (None if completed)
3013                - completed: True if all items processed
3014                - duration_seconds: Total execution time
3015                - failed_item_ids: IDs of items that failed
3016                - error_summary: Error types and counts
3017                - has_more_items: True if max_items reached but more exist
3018
3019        Raises:
3020            ValueError: If invalid scope is provided.
3021
3022        Examples:
3023            Basic trace evaluation:
3024            ```python
3025            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3026
3027            client = Langfuse()
3028
3029            # Define mapper to extract fields from traces
3030            def trace_mapper(trace):
3031                return EvaluatorInputs(
3032                    input=trace.input,
3033                    output=trace.output,
3034                    expected_output=None,
3035                    metadata={"trace_id": trace.id}
3036                )
3037
3038            # Define evaluator
3039            def length_evaluator(*, input, output, expected_output, metadata):
3040                return Evaluation(
3041                    name="output_length",
3042                    value=len(output) if output else 0
3043                )
3044
3045            # Run batch evaluation
3046            result = client.run_batched_evaluation(
3047                scope="traces",
3048                mapper=trace_mapper,
3049                evaluators=[length_evaluator],
3050                filter='{"tags": ["production"]}',
3051                max_items=1000,
3052                verbose=True
3053            )
3054
3055            print(f"Processed {result.total_items_processed} traces")
3056            print(f"Created {result.total_scores_created} scores")
3057            ```
3058
3059            Evaluation with composite scorer:
3060            ```python
3061            def accuracy_evaluator(*, input, output, expected_output, metadata):
3062                # ... evaluation logic
3063                return Evaluation(name="accuracy", value=0.85)
3064
3065            def relevance_evaluator(*, input, output, expected_output, metadata):
3066                # ... evaluation logic
3067                return Evaluation(name="relevance", value=0.92)
3068
3069            def composite_evaluator(*, item, evaluations):
3070                # Weighted average of evaluations
3071                weights = {"accuracy": 0.6, "relevance": 0.4}
3072                total = sum(
3073                    e.value * weights.get(e.name, 0)
3074                    for e in evaluations
3075                    if isinstance(e.value, (int, float))
3076                )
3077                return Evaluation(
3078                    name="composite_score",
3079                    value=total,
3080                    comment=f"Weighted average of {len(evaluations)} metrics"
3081                )
3082
3083            result = client.run_batched_evaluation(
3084                scope="traces",
3085                mapper=trace_mapper,
3086                evaluators=[accuracy_evaluator, relevance_evaluator],
3087                composite_evaluator=composite_evaluator,
3088                filter='{"user_id": "important_user"}',
3089                verbose=True
3090            )
3091            ```
3092
3093            Handling incomplete runs with resume:
3094            ```python
3095            # Initial run that may fail or timeout
3096            result = client.run_batched_evaluation(
3097                scope="observations",
3098                mapper=obs_mapper,
3099                evaluators=[my_evaluator],
3100                max_items=10000,
3101                verbose=True
3102            )
3103
3104            # Check if incomplete
3105            if not result.completed and result.resume_token:
3106                print(f"Processed {result.resume_token.items_processed} items before interruption")
3107
3108                # Resume from where it left off
3109                result = client.run_batched_evaluation(
3110                    scope="observations",
3111                    mapper=obs_mapper,
3112                    evaluators=[my_evaluator],
3113                    resume_from=result.resume_token,
3114                    verbose=True
3115                )
3116
3117            print(f"Total items processed: {result.total_items_processed}")
3118            ```
3119
3120            Monitoring evaluator performance:
3121            ```python
3122            result = client.run_batched_evaluation(...)
3123
3124            for stats in result.evaluator_stats:
3125                success_rate = stats.successful_runs / stats.total_runs
3126                print(f"{stats.name}:")
3127                print(f"  Success rate: {success_rate:.1%}")
3128                print(f"  Scores created: {stats.total_scores_created}")
3129
3130                if stats.failed_runs > 0:
3131                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3132            ```
3133
3134        Note:
3135            - Evaluator failures are logged but don't stop the batch evaluation
3136            - Individual item failures are tracked but don't stop processing
3137            - Fetch failures are retried with exponential backoff
3138            - All scores are automatically flushed to Langfuse at the end
3139            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3140        """
3141        runner = BatchEvaluationRunner(self)
3142
3143        return cast(
3144            BatchEvaluationResult,
3145            run_async_safely(
3146                runner.run_async(
3147                    scope=scope,
3148                    mapper=mapper,
3149                    evaluators=evaluators,
3150                    filter=filter,
3151                    fetch_batch_size=fetch_batch_size,
3152                    fetch_trace_fields=fetch_trace_fields,
3153                    max_items=max_items,
3154                    max_concurrency=max_concurrency,
3155                    composite_evaluator=composite_evaluator,
3156                    metadata=metadata,
3157                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
3158                    _additional_trace_tags=_additional_trace_tags,
3159                    max_retries=max_retries,
3160                    verbose=verbose,
3161                    resume_from=resume_from,
3162                )
3163            ),
3164        )

Fetch traces or observations and run evaluations on each item.

This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:

  • Running evaluations on production traces after deployment
  • Backtesting new evaluation metrics on historical data
  • Batch scoring of observations for quality monitoring
  • Periodic evaluation runs on recent data

The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.

Arguments:
  • scope: The type of items to evaluate. Must be one of:
    • "traces": Evaluate complete traces with all their observations
    • "observations": Evaluate individual observations (spans, generations, events)
  • mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
  • evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
  • filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
    • '{"tags": ["production"]}'
    • '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' Default: None (fetches all items).
  • fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
  • fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
  • max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
  • max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 5.
  • composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
  • metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
  • max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
  • verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
  • resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:

BatchEvaluationResult containing:
  • total_items_fetched: Number of items fetched from API
  • total_items_processed: Number of items successfully evaluated
  • total_items_failed: Number of items that failed evaluation
  • total_scores_created: Scores created by item-level evaluators
  • total_composite_scores_created: Scores created by composite evaluator
  • total_evaluations_failed: Individual evaluator failures
  • evaluator_stats: Per-evaluator statistics (success rate, scores created)
  • resume_token: Token for resuming if incomplete (None if completed)
  • completed: True if all items processed
  • duration_seconds: Total execution time
  • failed_item_ids: IDs of items that failed
  • error_summary: Error types and counts
  • has_more_items: True if max_items reached but more exist

Raises:
  • ValueError: If invalid scope is provided.
Examples:

Basic trace evaluation:

from langfuse import Langfuse, EvaluatorInputs, Evaluation

client = Langfuse()

# Define mapper to extract fields from traces
def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,
        metadata={"trace_id": trace.id}
    )

# Define evaluator
def length_evaluator(*, input, output, expected_output, metadata):
    return Evaluation(
        name="output_length",
        value=len(output) if output else 0
    )

# Run batch evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    filter='{"tags": ["production"]}',
    max_items=1000,
    verbose=True
)

print(f"Processed {result.total_items_processed} traces")
print(f"Created {result.total_scores_created} scores")

Evaluation with composite scorer:

def accuracy_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="accuracy", value=0.85)

def relevance_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="relevance", value=0.92)

def composite_evaluator(*, item, evaluations):
    # Weighted average of evaluations
    weights = {"accuracy": 0.6, "relevance": 0.4}
    total = sum(
        e.value * weights.get(e.name, 0)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(
        name="composite_score",
        value=total,
        comment=f"Weighted average of {len(evaluations)} metrics"
    )

result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[accuracy_evaluator, relevance_evaluator],
    composite_evaluator=composite_evaluator,
    filter='{"user_id": "important_user"}',
    verbose=True
)

Handling incomplete runs with resume:

# Initial run that may fail or timeout
result = client.run_batched_evaluation(
    scope="observations",
    mapper=obs_mapper,
    evaluators=[my_evaluator],
    max_items=10000,
    verbose=True
)

# Check if incomplete
if not result.completed and result.resume_token:
    print(f"Processed {result.resume_token.items_processed} items before interruption")

    # Resume from where it left off
    result = client.run_batched_evaluation(
        scope="observations",
        mapper=obs_mapper,
        evaluators=[my_evaluator],
        resume_from=result.resume_token,
        verbose=True
    )

print(f"Total items processed: {result.total_items_processed}")

Monitoring evaluator performance:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs
    print(f"{stats.name}:")
    print(f"  Success rate: {success_rate:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  âš ī¸  Failed {stats.failed_runs} times")
Note:
  • Evaluator failures are logged but don't stop the batch evaluation
  • Individual item failures are tracked but don't stop processing
  • Fetch failures are retried with exponential backoff
  • All scores are automatically flushed to Langfuse at the end
  • The resume mechanism uses timestamp-based filtering to avoid duplicates
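
For completeness, a brief sketch of inspecting failures after a run, assuming error_summary is a mapping of error type to count as suggested by the Returns section above:

result = client.run_batched_evaluation(...)

if result.total_items_failed > 0:
    print(f"Failed items: {result.failed_item_ids}")
    for error_type, count in result.error_summary.items():
        print(f"  {error_type}: {count}")

if result.has_more_items:
    print("max_items reached; more matching items exist.")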
def auth_check(self) -> bool:
3166    def auth_check(self) -> bool:
3167        """Check if the provided credentials (public and secret key) are valid.
3168
3169        Raises:
3170            Exception: If no projects were found for the provided credentials.
3171
3172        Note:
3173            This method is blocking. Its use in production code is discouraged.
3174        """
3175        try:
3176            projects = self.api.projects.get()
3177            langfuse_logger.debug(
3178                f"Auth check successful, found {len(projects.data)} projects"
3179            )
3180            if len(projects.data) == 0:
3181                raise Exception(
3182                    "Auth check failed, no project found for the keys provided."
3183                )
3184            return True
3185
3186        except AttributeError as e:
3187            langfuse_logger.warning(
3188                f"Auth check failed: Client not properly initialized. Error: {e}"
3189            )
3190            return False
3191
3192        except Error as e:
3193            handle_fern_exception(e)
3194            raise e

Check if the provided credentials (public and secret key) are valid.

Raises:
  • Exception: If no projects were found for the provided credentials.
Note:

This method is blocking. Its use in production code is discouraged.
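
A minimal usage sketch, e.g. for verifying credentials at startup:

from langfuse import Langfuse

langfuse = Langfuse()

# Returns True if the keys resolve to at least one project;
# raises if the credentials map to no project.
if langfuse.auth_check():
    print("Langfuse credentials are valid")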

def create_dataset( self, *, name: str, description: Optional[str] = None, metadata: Optional[Any] = None, input_schema: Optional[Any] = None, expected_output_schema: Optional[Any] = None) -> langfuse.api.Dataset:
3196    def create_dataset(
3197        self,
3198        *,
3199        name: str,
3200        description: Optional[str] = None,
3201        metadata: Optional[Any] = None,
3202        input_schema: Optional[Any] = None,
3203        expected_output_schema: Optional[Any] = None,
3204    ) -> Dataset:
3205        """Create a dataset with the given name on Langfuse.
3206
3207        Args:
3208            name: Name of the dataset to create.
3209            description: Description of the dataset. Defaults to None.
3210            metadata: Additional metadata. Defaults to None.
3211            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3212            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3213
3214        Returns:
3215            Dataset: The created dataset as returned by the Langfuse API.
3216        """
3217        try:
3218            langfuse_logger.debug(f"Creating dataset {name}")
3219
3220            result = self.api.datasets.create(
3221                name=name,
3222                description=description,
3223                metadata=metadata,
3224                input_schema=input_schema,
3225                expected_output_schema=expected_output_schema,
3226            )
3227
3228            return cast(Dataset, result)
3229
3230        except Error as e:
3231            handle_fern_exception(e)
3232            raise e

Create a dataset with the given name on Langfuse.

Arguments:
  • name: Name of the dataset to create.
  • description: Description of the dataset. Defaults to None.
  • metadata: Additional metadata. Defaults to None.
  • input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
  • expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:

Dataset: The created dataset as returned by the Langfuse API.
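
For illustration, a sketch of creating a schema-validated dataset; the name, metadata, and JSON Schemas below are made up:

from langfuse import Langfuse

langfuse = Langfuse()

dataset = langfuse.create_dataset(
    name="capital_cities",
    description="Country-to-capital QA pairs",
    metadata={"owner": "qa-team"},
    # New items will be validated against these schemas.
    input_schema={
        "type": "object",
        "properties": {"country": {"type": "string"}},
        "required": ["country"],
    },
    expected_output_schema={"type": "string"},
)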

def create_dataset_item( self, *, dataset_name: str, input: Optional[Any] = None, expected_output: Optional[Any] = None, metadata: Optional[Any] = None, source_trace_id: Optional[str] = None, source_observation_id: Optional[str] = None, status: Optional[langfuse.api.DatasetStatus] = None, id: Optional[str] = None) -> langfuse.api.DatasetItem:
3234    def create_dataset_item(
3235        self,
3236        *,
3237        dataset_name: str,
3238        input: Optional[Any] = None,
3239        expected_output: Optional[Any] = None,
3240        metadata: Optional[Any] = None,
3241        source_trace_id: Optional[str] = None,
3242        source_observation_id: Optional[str] = None,
3243        status: Optional[DatasetStatus] = None,
3244        id: Optional[str] = None,
3245    ) -> DatasetItem:
3246        """Create a dataset item.
3247
3248        Upserts if an item with id already exists.
3249
3250        Args:
3251            dataset_name: Name of the dataset in which the dataset item should be created.
3252            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3253            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3254            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3255            source_trace_id: Id of the source trace. Defaults to None.
3256            source_observation_id: Id of the source observation. Defaults to None.
3257            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3258            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3259
3260        Returns:
3261            DatasetItem: The created dataset item as returned by the Langfuse API.
3262
3263        Example:
3264            ```python
3265            from langfuse import Langfuse
3266
3267            langfuse = Langfuse()
3268
3269            # Uploading items to the Langfuse dataset named "capital_cities"
3270            langfuse.create_dataset_item(
3271                dataset_name="capital_cities",
3272                input={"input": {"country": "Italy"}},
3273                expected_output={"expected_output": "Rome"},
3274                metadata={"foo": "bar"}
3275            )
3276            ```
3277        """
3278        try:
3279            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
3280
3281            result = self.api.dataset_items.create(
3282                dataset_name=dataset_name,
3283                input=input,
3284                expected_output=expected_output,
3285                metadata=metadata,
3286                source_trace_id=source_trace_id,
3287                source_observation_id=source_observation_id,
3288                status=status,
3289                id=id,
3290            )
3291
3292            return cast(DatasetItem, result)
3293        except Error as e:
3294            handle_fern_exception(e)
3295            raise e

Create a dataset item.

Upserts if an item with id already exists.

Arguments:
  • dataset_name: Name of the dataset in which the dataset item should be created.
  • input: Input data. Defaults to None. Can contain any dict, list or scalar.
  • expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
  • metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
  • source_trace_id: Id of the source trace. Defaults to None.
  • source_observation_id: Id of the source observation. Defaults to None.
  • status: Status of the dataset item. Defaults to ACTIVE for newly created items.
  • id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:

DatasetItem: The created dataset item as returned by the Langfuse API.

Example:
from langfuse import Langfuse

langfuse = Langfuse()

# Uploading items to the Langfuse dataset named "capital_cities"
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"input": {"country": "Italy"}},
    expected_output={"expected_output": "Rome"},
    metadata={"foo": "bar"}
)
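
Because items are upserted by id, re-sending an item with the same id updates it in place. A sketch with illustrative ids, also linking the item to a source trace:

langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"country": "France"},
    expected_output="Paris",
    source_trace_id="trace-123",  # hypothetical trace id
    id="item-france",
)

# Same id: updates the existing item instead of creating a new one.
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"country": "France"},
    expected_output="Paris",
    metadata={"reviewed": True},
    id="item-france",
)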
def resolve_media_references( self, *, obj: Any, resolve_with: Literal['base64_data_uri'], max_depth: int = 10, content_fetch_timeout_seconds: int = 5) -> Any:
3297    def resolve_media_references(
3298        self,
3299        *,
3300        obj: Any,
3301        resolve_with: Literal["base64_data_uri"],
3302        max_depth: int = 10,
3303        content_fetch_timeout_seconds: int = 5,
3304    ) -> Any:
3305        """Replace media reference strings in an object with base64 data URIs.
3306
3307        This method recursively traverses an object (up to max_depth) looking for media reference strings
3308        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3309        the provided Langfuse client and replaces the reference string with a base64 data URI.
3310
3311        If fetching media content fails for a reference string, a warning is logged and the reference
3312        string is left unchanged.
3313
3314        Args:
3315            obj: The object to process. Can be a primitive value, array, or nested object.
3316                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3317            resolve_with: The representation of the media content to replace the media reference string with.
3318                Currently only "base64_data_uri" is supported.
3319            max_depth: int: The maximum depth to traverse the object. Default is 10.
3320            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3321
3322        Returns:
3323            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3324            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3325
3326        Example:
3327            obj = {
3328                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3329                "nested": {
3330                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3331                }
3332            }
3333
3334            result = LangfuseMedia.resolve_media_references(langfuse_client=langfuse_client, obj=obj, resolve_with="base64_data_uri")
3335
3336            # Result:
3337            # {
3338            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3339            #     "nested": {
3340            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3341            #     }
3342            # }
3343        """
3344        return LangfuseMedia.resolve_media_references(
3345            langfuse_client=self,
3346            obj=obj,
3347            resolve_with=resolve_with,
3348            max_depth=max_depth,
3349            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3350        )

Replace media reference strings in an object with base64 data URIs.

This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.

If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.

Arguments:
  • obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
  • resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
  • max_depth: int: The maximum depth to traverse the object. Default is 10.
  • content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:

A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

Example:

obj = {
    "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
    "nested": {
        "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
    }
}

result = LangfuseMedia.resolve_media_references(
    langfuse_client=langfuse_client,
    obj=obj,
    resolve_with="base64_data_uri",
)

# Result:
# {
#     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
#     "nested": {
#         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
#     }
# }

def get_prompt( self, name: str, *, version: Optional[int] = None, label: Optional[str] = None, type: Literal['chat', 'text'] = 'text', cache_ttl_seconds: Optional[int] = None, fallback: Union[List[langfuse.model.ChatMessageDict], NoneType, str] = None, max_retries: Optional[int] = None, fetch_timeout_seconds: Optional[int] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3380    def get_prompt(
3381        self,
3382        name: str,
3383        *,
3384        version: Optional[int] = None,
3385        label: Optional[str] = None,
3386        type: Literal["chat", "text"] = "text",
3387        cache_ttl_seconds: Optional[int] = None,
3388        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3389        max_retries: Optional[int] = None,
3390        fetch_timeout_seconds: Optional[int] = None,
3391    ) -> PromptClient:
3392        """Get a prompt.
3393
3394        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3395        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3396        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3397        return the expired prompt as a fallback.
3398
3399        Args:
3400            name (str): The name of the prompt to retrieve.
3401
3402        Keyword Args:
3403            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3404            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3405            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3406            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3407            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3408            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3409            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3410            fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the SDK's default timeout of 5 seconds.
3411
3412        Returns:
3413            The prompt object, retrieved from the cache or fetched directly if not cached or expired, of type
3414            - TextPromptClient, if type argument is 'text'.
3415            - ChatPromptClient, if type argument is 'chat'.
3416
3417        Raises:
3418            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3419            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3420        """
3421        if self._resources is None:
3422            raise Error(
3423                "SDK is not correctly initialized. Check the init logs for more details."
3424            )
3425        if version is not None and label is not None:
3426            raise ValueError("Cannot specify both version and label at the same time.")
3427
3428        if not name:
3429            raise ValueError("Prompt name cannot be empty.")
3430
3431        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3432        bounded_max_retries = self._get_bounded_max_retries(
3433            max_retries, default_max_retries=2, max_retries_upper_bound=4
3434        )
3435
3436        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3437        cached_prompt = self._resources.prompt_cache.get(cache_key)
3438
3439        if cached_prompt is None or cache_ttl_seconds == 0:
3440            langfuse_logger.debug(
3441                f"Prompt '{cache_key}' not found in cache or caching disabled."
3442            )
3443            try:
3444                return self._fetch_prompt_and_update_cache(
3445                    name,
3446                    version=version,
3447                    label=label,
3448                    ttl_seconds=cache_ttl_seconds,
3449                    max_retries=bounded_max_retries,
3450                    fetch_timeout_seconds=fetch_timeout_seconds,
3451                )
3452            except Exception as e:
3453                if fallback:
3454                    langfuse_logger.warning(
3455                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3456                    )
3457
3458                    fallback_client_args: Dict[str, Any] = {
3459                        "name": name,
3460                        "prompt": fallback,
3461                        "type": type,
3462                        "version": version or 0,
3463                        "config": {},
3464                        "labels": [label] if label else [],
3465                        "tags": [],
3466                    }
3467
3468                    if type == "text":
3469                        return TextPromptClient(
3470                            prompt=Prompt_Text(**fallback_client_args),
3471                            is_fallback=True,
3472                        )
3473
3474                    if type == "chat":
3475                        return ChatPromptClient(
3476                            prompt=Prompt_Chat(**fallback_client_args),
3477                            is_fallback=True,
3478                        )
3479
3480                raise e
3481
3482        if cached_prompt.is_expired():
3483            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3484            try:
3485                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3486                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3487
3488                def refresh_task() -> None:
3489                    self._fetch_prompt_and_update_cache(
3490                        name,
3491                        version=version,
3492                        label=label,
3493                        ttl_seconds=cache_ttl_seconds,
3494                        max_retries=bounded_max_retries,
3495                        fetch_timeout_seconds=fetch_timeout_seconds,
3496                    )
3497
3498                self._resources.prompt_cache.add_refresh_prompt_task_if_current(
3499                    cache_key,
3500                    cached_prompt,
3501                    refresh_task,
3502                )
3503                langfuse_logger.debug(
3504                    f"Returning stale prompt '{cache_key}' from cache."
3505                )
3506                # return stale prompt
3507                return cached_prompt.value
3508
3509            except Exception as e:
3510                langfuse_logger.warning(
3511                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3512                )
3513                # creation of refresh prompt task failed, return stale prompt
3514                return cached_prompt.value
3515
3516        return cached_prompt.value

Get a prompt.

This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.

Arguments:
  • name (str): The name of the prompt to retrieve.
Keyword Args:
  • version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
  • type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
  • fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
  • max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
  • fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the SDK's default timeout of 5 seconds.
Returns:

The prompt object, retrieved from the cache or fetched directly if not cached or expired, of type

  • TextPromptClient, if type argument is 'text'.
  • ChatPromptClient, if type argument is 'chat'.
Raises:
  • Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
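
A usage sketch combining caching and a fallback; the prompt name and template are illustrative:

from langfuse import Langfuse

langfuse = Langfuse()

prompt = langfuse.get_prompt(
    "movie-critic",          # hypothetical prompt name
    cache_ttl_seconds=300,   # cache for 5 minutes
    fallback="Critique the movie {{movie}} in one paragraph.",
)

# Compile the template; variables use double curly braces.
text = prompt.compile(movie="Dune")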
def create_prompt( self, *, name: str, prompt: Union[str, List[Union[langfuse.model.ChatMessageDict, langfuse.model.ChatMessageWithPlaceholdersDict_Message, langfuse.model.ChatMessageWithPlaceholdersDict_Placeholder]]], labels: List[str] = [], tags: Optional[List[str]] = None, type: Optional[Literal['chat', 'text']] = 'text', config: Optional[Any] = None, commit_message: Optional[str] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3618    def create_prompt(
3619        self,
3620        *,
3621        name: str,
3622        prompt: Union[
3623            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3624        ],
3625        labels: List[str] = [],
3626        tags: Optional[List[str]] = None,
3627        type: Optional[Literal["chat", "text"]] = "text",
3628        config: Optional[Any] = None,
3629        commit_message: Optional[str] = None,
3630    ) -> PromptClient:
3631        """Create a new prompt in Langfuse.
3632
3633        Keyword Args:
3634            name : The name of the prompt to be created.
3635            prompt : The content of the prompt to be created.
3636            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3637            labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
3638            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3639            config: Additional structured data to be saved with the prompt. Defaults to None.
3640            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3641            commit_message: Optional string describing the change.
3642
3643        Returns:
3644            TextPromptClient: The prompt if type argument is 'text'.
3645            ChatPromptClient: The prompt if type argument is 'chat'.
3646        """
3647        try:
3648            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3649
3650            if type == "chat":
3651                if not isinstance(prompt, list):
3652                    raise ValueError(
3653                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3654                    )
3655                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
3656                    CreateChatPromptRequest(
3657                        name=name,
3658                        prompt=cast(Any, prompt),
3659                        labels=labels,
3660                        tags=tags,
3661                        config=config or {},
3662                        commit_message=commit_message,
3663                        type=CreateChatPromptType.CHAT,
3664                    )
3665                )
3666                server_prompt = self.api.prompts.create(request=request)
3667
3668                if self._resources is not None:
3669                    self._resources.prompt_cache.invalidate(name)
3670
3671                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3672
3673            if not isinstance(prompt, str):
3674                raise ValueError("For 'text' type, 'prompt' must be a string.")
3675
3676            request = CreateTextPromptRequest(
3677                name=name,
3678                prompt=prompt,
3679                labels=labels,
3680                tags=tags,
3681                config=config or {},
3682                commit_message=commit_message,
3683            )
3684
3685            server_prompt = self.api.prompts.create(request=request)
3686
3687            if self._resources is not None:
3688                self._resources.prompt_cache.invalidate(name)
3689
3690            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3691
3692        except Error as e:
3693            handle_fern_exception(e)
3694            raise e

Create a new prompt in Langfuse.

Keyword Args:
  • name : The name of the prompt to be created.
  • prompt : The content of the prompt to be created.
  • is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
  • labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
  • tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
  • config: Additional structured data to be saved with the prompt. Defaults to None.
  • type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
  • commit_message: Optional string describing the change.
Returns:

TextPromptClient: The prompt if type argument is 'text'.
ChatPromptClient: The prompt if type argument is 'chat'.
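
A sketch of creating a chat prompt that is served by default via the 'production' label; the name, messages, and config are illustrative:

langfuse.create_prompt(
    name="movie-critic-chat",
    type="chat",
    prompt=[
        {"role": "system", "content": "You are an expert movie critic."},
        {"role": "user", "content": "Critique {{movie}}."},
    ],
    labels=["production"],  # serve this version by default
    config={"model": "gpt-4o", "temperature": 0.7},
    commit_message="Initial version",
)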

def update_prompt(self, *, name: str, version: int, new_labels: List[str] = []) -> Any:
3696    def update_prompt(
3697        self,
3698        *,
3699        name: str,
3700        version: int,
3701        new_labels: List[str] = [],
3702    ) -> Any:
3703        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
3704
3705        Args:
3706            name (str): The name of the prompt to update.
3707            version (int): The version number of the prompt to update.
3708            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3709
3710        Returns:
3711            Prompt: The updated prompt from the Langfuse API.
3712
3713        """
3714        updated_prompt = self.api.prompt_version.update(
3715            name=self._url_encode(name),
3716            version=version,
3717            new_labels=new_labels,
3718        )
3719
3720        if self._resources is not None:
3721            self._resources.prompt_cache.invalidate(name)
3722
3723        return updated_prompt

Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.

Arguments:
  • name (str): The name of the prompt to update.
  • version (int): The version number of the prompt to update.
  • new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:

Prompt: The updated prompt from the Langfuse API.
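
A sketch of promoting an existing version; the prompt name and version are illustrative:

langfuse.update_prompt(
    name="movie-critic",
    version=2,
    new_labels=["production"],  # assign the 'production' label to version 2
)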

def clear_prompt_cache(self) -> None:
3738    def clear_prompt_cache(self) -> None:
3739        """Clear the entire prompt cache, removing all cached prompts.
3740
3741        This method is useful when you want to force a complete refresh of all
3742        cached prompts, for example after major updates or when you need to
3743        ensure the latest versions are fetched from the server.
3744        """
3745        if self._resources is not None:
3746            self._resources.prompt_cache.clear()

Clear the entire prompt cache, removing all cached prompts.

This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.

def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 63def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 64    """Get or create a Langfuse client instance.
 65
 66    Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups,
 67    providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
 68
 69    Behavior:
 70    - Single project: Returns existing client or creates new one
 71    - Multi-project: Requires public_key to return specific client
 72    - No public_key in multi-project: Returns disabled client to prevent data leakage
 73
 74    The function uses a singleton pattern per public_key to conserve resources and maintain state.
 75
 76    Args:
 77        public_key (Optional[str]): Project identifier
 78            - With key: Returns client for that project
 79            - Without key: Returns single client or disabled client if multiple exist
 80
 81    Returns:
 82        Langfuse: Client instance in one of three states:
 83            1. Client for specified public_key
 84            2. Default client for single-project setup
 85            3. Disabled client when multiple projects exist without key
 86
 87    Security:
 88        Disables tracing when multiple projects exist without explicit key to prevent
 89        cross-project data leakage. Multi-project setups are experimental.
 90
 91    Example:
 92        ```python
 93        # Single project
 94        client = get_client()  # Default client
 95
 96        # In multi-project usage:
 97        client_a = get_client(public_key="project_a_key")  # Returns project A's client
 98        client_b = get_client(public_key="project_b_key")  # Returns project B's client
 99
100        # Without specific key in multi-project setup:
101        client = get_client()  # Returns disabled client for safety
102        ```
103    """
104    with LangfuseResourceManager._lock:
105        active_instances = LangfuseResourceManager._instances
106
107        # If no explicit public_key provided, check execution context
108        if not public_key:
109            public_key = _current_public_key.get(None)
110
111        if not public_key:
112            if len(active_instances) == 0:
113                # No clients initialized yet, create default instance
114                return Langfuse()
115
116            if len(active_instances) == 1:
117                # Only one client exists, safe to use without specifying key
118                instance = list(active_instances.values())[0]
119
120                # Initialize with the credentials bound to the instance
121                # This is important if the original instance was instantiated
122                # via constructor arguments
123                return _create_client_from_instance(instance)
124
125            else:
126                # Multiple clients exist but no key specified - disable tracing
127                # to prevent cross-project data leakage
128                langfuse_logger.warning(
129                    "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
130                )
131                return Langfuse(
132                    tracing_enabled=False, public_key="fake", secret_key="fake"
133                )
134
135        else:
136            # Specific key provided, look up existing instance
137            target_instance: Optional[LangfuseResourceManager] = active_instances.get(
138                public_key, None
139            )
140
141            if target_instance is None:
142                # No instance found with this key - client not initialized properly
143                langfuse_logger.warning(
144                    f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function."
145                )
146                return Langfuse(
147                    tracing_enabled=False, public_key="fake", secret_key="fake"
148                )
149
150            # target_instance is guaranteed to be not None at this point
151            return _create_client_from_instance(target_instance, public_key)

Get or create a Langfuse client instance.

Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.

Behavior:

  • Single project: Returns existing client or creates new one
  • Multi-project: Requires public_key to return specific client
  • No public_key in multi-project: Returns disabled client to prevent data leakage

The function uses a singleton pattern per public_key to conserve resources and maintain state.

Arguments:
  • public_key (Optional[str]): Project identifier
    • With key: Returns client for that project
    • Without key: Returns single client or disabled client if multiple exist
Returns:

Langfuse: Client instance in one of three states:
  1. Client for specified public_key
  2. Default client for single-project setup
  3. Disabled client when multiple projects exist without key

Security:

Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.

Example:
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
def observe( func: Optional[~F] = None, *, name: Optional[str] = None, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], NoneType] = None, capture_input: Optional[bool] = None, capture_output: Optional[bool] = None, transform_to_string: Optional[Callable[[Iterable], str]] = None) -> Union[~F, Callable[[~F], ~F]]:
 88    def observe(
 89        self,
 90        func: Optional[F] = None,
 91        *,
 92        name: Optional[str] = None,
 93        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
 94        capture_input: Optional[bool] = None,
 95        capture_output: Optional[bool] = None,
 96        transform_to_string: Optional[Callable[[Iterable], str]] = None,
 97    ) -> Union[F, Callable[[F], F]]:
 98        """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
 99
100        This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates
101        spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator
102        intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
103
104        Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application,
105        enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
106
107        Args:
108            func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
109            name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
110            as_type (Optional[Literal]): Set the observation type. Supported values:
111                    "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail".
112                    Observation types are highlighted in the Langfuse UI for filtering and visualization.
113                    The types "generation" and "embedding" create a span on which additional attributes such as model metrics
114                    can be set.
115
116        Returns:
117            Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
118
119        Example:
120            For general function tracing with automatic naming:
121            ```python
122            @observe()
123            def process_user_request(user_id, query):
124                # Function is automatically traced with name "process_user_request"
125                return get_response(query)
126            ```
127
128            For language model generation tracking:
129            ```python
130            @observe(name="answer-generation", as_type="generation")
131            async def generate_answer(query):
132                # Creates a generation-type span with extended LLM metrics
133                response = await openai.chat.completions.create(
134                    model="gpt-4",
135                    messages=[{"role": "user", "content": query}]
136                )
137                return response.choices[0].message.content
138            ```
139
140            For trace context propagation between functions:
141            ```python
142            @observe()
143            def main_process():
144                # Parent span is created
145                return sub_process()  # Child span automatically connected to parent
146
147            @observe()
148            def sub_process():
149                # Automatically becomes a child span of main_process
150                return "result"
151            ```
152
153        Raises:
154            Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
155
156        Notes:
157            - The decorator preserves the original function's signature, docstring, and return type.
158            - Proper parent-child relationships between spans are automatically maintained.
159            - Special keyword arguments can be passed to control tracing:
160              - langfuse_trace_id: Explicitly set the trace ID for this function call
161              - langfuse_parent_observation_id: Explicitly set the parent span ID
162              - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
163            - For async functions, the decorator returns an async function wrapper.
164            - For sync functions, the decorator returns a synchronous wrapper.
165        """
166        valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
167        if as_type is not None and as_type not in valid_types:
168            logger.warning(
169                f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'."
170            )
171            as_type = "span"
172
173        function_io_capture_enabled = os.environ.get(
174            LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True"
175        ).lower() not in ("false", "0")
176
177        should_capture_input = (
178            capture_input if capture_input is not None else function_io_capture_enabled
179        )
180
181        should_capture_output = (
182            capture_output
183            if capture_output is not None
184            else function_io_capture_enabled
185        )
186
187        def decorator(func: F) -> F:
188            return (
189                self._async_observe(
190                    func,
191                    name=name,
192                    as_type=as_type,
193                    capture_input=should_capture_input,
194                    capture_output=should_capture_output,
195                    transform_to_string=transform_to_string,
196                )
197                if asyncio.iscoroutinefunction(func)
198                else self._sync_observe(
199                    func,
200                    name=name,
201                    as_type=as_type,
202                    capture_input=should_capture_input,
203                    capture_output=should_capture_output,
204                    transform_to_string=transform_to_string,
205                )
206            )
207
208        """Handle decorator with or without parentheses.
209
210        This logic enables the decorator to work both with and without parentheses:
211        - @observe - Python passes the function directly to the decorator
212        - @observe() - Python calls the decorator first, which must return a function decorator
213
214        When called without arguments (@observe), the func parameter contains the function to decorate,
215        so we directly apply the decorator to it. When called with parentheses (@observe()),
216        func is None, so we return the decorator function itself for Python to apply in the next step.
217        """
218        if func is None:
219            return decorator
220        else:
221            return decorator(func)

Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.

This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.

Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.

Arguments:
  • func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
  • name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
  • as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:

Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.

Example:

For general function tracing with automatic naming:

@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)

For language model generation tracking:

@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content

For trace context propagation between functions:

@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
Raises:
  • Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
  • The decorator preserves the original function's signature, docstring, and return type.
  • Proper parent-child relationships between spans are automatically maintained.
  • Special keyword arguments can be passed to control tracing (see the sketch after these notes):
    • langfuse_trace_id: Explicitly set the trace ID for this function call
    • langfuse_parent_observation_id: Explicitly set the parent span ID
    • langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
  • For async functions, the decorator returns an async function wrapper.
  • For sync functions, the decorator returns a synchronous wrapper.
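
A hedged sketch of the special keyword arguments noted above, which also shows both decorator forms from the listing. The IDs are placeholders in W3C trace-context hex format (32 hex characters for trace IDs, 16 for observation IDs), and the functions are illustrative, not part of the SDK:

```python
from langfuse import observe

@observe  # bare form: Python passes the function directly to the decorator
def fetch_profile(user_id):
    return {"user_id": user_id}

@observe(name="handle-request")  # parenthesized form returns a decorator
def handle_request(user_id):
    return fetch_profile(user_id)

# Route this call into a known trace via the special keyword arguments.
handle_request(
    "user_123",
    langfuse_trace_id="0123456789abcdef0123456789abcdef",
    langfuse_parent_observation_id="0123456789abcdef",
)
```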
def propagate_attributes( *, user_id: Optional[str] = None, session_id: Optional[str] = None, metadata: Optional[Dict[str, str]] = None, version: Optional[str] = None, tags: Optional[List[str]] = None, trace_name: Optional[str] = None, as_baggage: bool = False) -> opentelemetry.util._decorator._AgnosticContextManager[typing.Any]:
 95def propagate_attributes(
 96    *,
 97    user_id: Optional[str] = None,
 98    session_id: Optional[str] = None,
 99    metadata: Optional[Dict[str, str]] = None,
100    version: Optional[str] = None,
101    tags: Optional[List[str]] = None,
102    trace_name: Optional[str] = None,
103    as_baggage: bool = False,
104) -> _AgnosticContextManager[Any]:
105    """Propagate trace-level attributes to all spans created within this context.
106
107    This context manager sets attributes on the currently active span AND automatically
108    propagates them to all new child spans created within the context. This is the
109    recommended way to set trace-level attributes like user_id, session_id, and metadata
110    dimensions that should be consistently applied across all observations in a trace.
111
112    **IMPORTANT**: Call this as early as possible within your trace/workflow. Only the
113    currently active span and spans created after entering this context will have these
114    attributes. Pre-existing spans will NOT be retroactively updated.
115
116    **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id,
117    filtering by session_id) only include observations that have the attribute set.
118    If you call `propagate_attributes` late in your workflow, earlier spans won't be
119    included in aggregations for that attribute.
120
121    Args:
122        user_id: User identifier to associate with all spans in this context.
123            Must be US-ASCII string, ≤200 characters. Use this to track which user
124            generated each trace and enable e.g. per-user cost/performance analysis.
125        session_id: Session identifier to associate with all spans in this context.
126            Must be US-ASCII string, ≤200 characters. Use this to group related traces
127            within a user session (e.g., a conversation thread, multi-turn interaction).
128        metadata: Additional key-value metadata to propagate to all spans.
129            - Keys and values must be US-ASCII strings
130            - All values must be ≤200 characters
131            - Use for dimensions like internal correlation identifiers
132            - AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
133        version: Version identifier for parts of your application that are independently versioned, e.g. agents
134        tags: List of tags to categorize the group of observations
135        trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters.
136            Use this to set a consistent trace name for all spans created within this context.
137        as_baggage: If True, propagates attributes using OpenTelemetry baggage for
138            cross-process/service propagation. **Security warning**: When enabled,
139            attribute values are added to HTTP headers on ALL outbound requests.
140            Only enable if values are safe to transmit via HTTP headers and you need
141            cross-service tracing. Default: False.
142
143    Returns:
144        Context manager that propagates attributes to all child spans.
145
146    Example:
147        Basic usage with user and session tracking:
148
149        ```python
150        from langfuse import Langfuse
151
152        langfuse = Langfuse()
153
154        # Set attributes early in the trace
155        with langfuse.start_as_current_observation(name="user_workflow") as span:
156            with langfuse.propagate_attributes(
157                user_id="user_123",
158                session_id="session_abc",
159                metadata={"experiment": "variant_a", "environment": "production"}
160            ):
161                # All spans created here will have user_id, session_id, and metadata
162                with langfuse.start_observation(name="llm_call") as llm_span:
163                    # This span inherits: user_id, session_id, experiment, environment
164                    ...
165
166                with langfuse.start_generation(name="completion") as gen:
167                    # This span also inherits all attributes
168                    ...
169        ```
170
171        Late propagation (anti-pattern):
172
173        ```python
174        with langfuse.start_as_current_observation(name="workflow") as span:
175            # These spans WON'T have user_id
176            early_span = langfuse.start_observation(name="early_work")
177            early_span.end()
178
179            # Set attributes in the middle
180            with langfuse.propagate_attributes(user_id="user_123"):
181                # Only spans created AFTER this point will have user_id
182                late_span = langfuse.start_observation(name="late_work")
183                late_span.end()
184
185            # Result: Aggregations by user_id will miss "early_work" span
186        ```
187
188        Cross-service propagation with baggage (advanced):
189
190        ```python
191        # Service A - originating service
192        with langfuse.start_as_current_observation(name="api_request"):
193            with langfuse.propagate_attributes(
194                user_id="user_123",
195                session_id="session_abc",
196                as_baggage=True  # Propagate via HTTP headers
197            ):
198                # Make HTTP request to Service B
199                response = requests.get("https://service-b.example.com/api")
200                # user_id and session_id are now in HTTP headers
201
202        # Service B - downstream service
203        # OpenTelemetry will automatically extract baggage from HTTP headers
204        # and propagate to spans in Service B
205        ```
206
207    Note:
208        - **Validation**: All attribute values (user_id, session_id, metadata values)
209          must be strings ≤200 characters. Invalid values will be dropped with a
210          warning logged. Ensure values meet constraints before calling.
211        - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood,
212          making it compatible with other OTel-instrumented libraries.
213
214    Raises:
215        No exceptions are raised. Invalid values are logged as warnings and dropped.
216    """
217    return _propagate_attributes(
218        user_id=user_id,
219        session_id=session_id,
220        metadata=metadata,
221        version=version,
222        tags=tags,
223        trace_name=trace_name,
224        as_baggage=as_baggage,
225    )

Propagate trace-level attributes to all spans created within this context.

This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, and metadata dimensions that should be consistently applied across all observations in a trace.

IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.

Why this matters: Langfuse aggregation queries (e.g., total cost by user_id, filtering by session_id) only include observations that have the attribute set. If you call propagate_attributes late in your workflow, earlier spans won't be included in aggregations for that attribute.

Arguments:
  • user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
  • session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
  • metadata: Additional key-value metadata to propagate to all spans.
    • Keys and values must be US-ASCII strings
    • All values must be ≤200 characters
    • Use for dimensions like internal correlation identifiers
    • AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
  • version: Version identifier for parts of your application that are independently versioned, e.g. agents
  • tags: List of tags to categorize the group of observations
  • trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
  • as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:

Context manager that propagates attributes to all child spans.

Example:

Basic usage with user and session tracking:

from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_observation(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        metadata={"experiment": "variant_a", "environment": "production"}
    ):
        # All spans created here will have user_id, session_id, and metadata
        with langfuse.start_observation(name="llm_call") as llm_span:
            # This span inherits: user_id, session_id, experiment, environment
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...

Late propagation (anti-pattern):

with langfuse.start_as_current_observation(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_observation(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_observation(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span

Cross-service propagation with baggage (advanced):

# Service A - originating service
with langfuse.start_as_current_observation(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id and session_id are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate to spans in Service B
Note:
  • Validation: All attribute values (user_id, session_id, metadata values) must be strings ≤200 characters. Invalid values will be dropped with a warning logged. Ensure values meet constraints before calling (see the pre-validation sketch below).
  • OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
  • No exceptions are raised. Invalid values are logged as warnings and dropped.
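
Because out-of-spec values are dropped rather than raising, it can help to pre-validate before entering the context. A minimal sketch, assuming a configured client; clamp and the sample metadata are illustrative, not SDK APIs:

```python
from langfuse import get_client

langfuse = get_client()

def clamp(value: str, limit: int = 200) -> str:
    # Values longer than 200 characters would be dropped with a warning,
    # so truncate proactively.
    return value[:limit]

raw_metadata = {"correlation_id": "req-" + "x" * 300}  # too long as-is

with langfuse.propagate_attributes(
    user_id=clamp("user_123"),
    metadata={k: clamp(v) for k, v in raw_metadata.items()},
):
    ...
```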
ObservationTypeLiteral = typing.Union[typing.Literal['generation', 'embedding'], typing.Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], typing.Literal['event']]
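
The alias nests three Literal unions, so a runtime check needs two levels of typing.get_args. A small sketch; the helper and constant names are ours:

```python
from typing import get_args

from langfuse import ObservationTypeLiteral

# Flatten the nested Literal unions into a plain set of allowed strings.
VALID_OBSERVATION_TYPES = {
    member
    for literal in get_args(ObservationTypeLiteral)
    for member in get_args(literal)
}

def is_valid_observation_type(value: str) -> bool:
    return value in VALID_OBSERVATION_TYPES

assert is_valid_observation_type("generation")
assert not is_valid_observation_type("unknown")
```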
class LangfuseSpan(langfuse._client.span.LangfuseObservationWrapper):
1247class LangfuseSpan(LangfuseObservationWrapper):
1248    """Standard span implementation for general operations in Langfuse.
1249
1250    This class represents a general-purpose span that can be used to trace
1251    any operation in your application. It extends the base LangfuseObservationWrapper
1252    with specific methods for creating child spans, generations, and updating
1253    span-specific attributes. If possible, use a more specific type for
1254    better observability and insights.
1255    """
1256
1257    def __init__(
1258        self,
1259        *,
1260        otel_span: otel_trace_api.Span,
1261        langfuse_client: "Langfuse",
1262        input: Optional[Any] = None,
1263        output: Optional[Any] = None,
1264        metadata: Optional[Any] = None,
1265        environment: Optional[str] = None,
1266        release: Optional[str] = None,
1267        version: Optional[str] = None,
1268        level: Optional[SpanLevel] = None,
1269        status_message: Optional[str] = None,
1270    ):
1271        """Initialize a new LangfuseSpan.
1272
1273        Args:
1274            otel_span: The OpenTelemetry span to wrap
1275            langfuse_client: Reference to the parent Langfuse client
1276            input: Input data for the span (any JSON-serializable object)
1277            output: Output data from the span (any JSON-serializable object)
1278            metadata: Additional metadata to associate with the span
1279            environment: The tracing environment
1280            release: Release identifier for the application
1281            version: Version identifier for the code or component
1282            level: Importance level of the span (info, warning, error)
1283            status_message: Optional status message for the span
1284        """
1285        super().__init__(
1286            otel_span=otel_span,
1287            as_type="span",
1288            langfuse_client=langfuse_client,
1289            input=input,
1290            output=output,
1291            metadata=metadata,
1292            environment=environment,
1293            release=release,
1294            version=version,
1295            level=level,
1296            status_message=status_message,
1297        )

Standard span implementation for general operations in Langfuse.

This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.

LangfuseSpan( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1257    def __init__(
1258        self,
1259        *,
1260        otel_span: otel_trace_api.Span,
1261        langfuse_client: "Langfuse",
1262        input: Optional[Any] = None,
1263        output: Optional[Any] = None,
1264        metadata: Optional[Any] = None,
1265        environment: Optional[str] = None,
1266        release: Optional[str] = None,
1267        version: Optional[str] = None,
1268        level: Optional[SpanLevel] = None,
1269        status_message: Optional[str] = None,
1270    ):
1271        """Initialize a new LangfuseSpan.
1272
1273        Args:
1274            otel_span: The OpenTelemetry span to wrap
1275            langfuse_client: Reference to the parent Langfuse client
1276            input: Input data for the span (any JSON-serializable object)
1277            output: Output data from the span (any JSON-serializable object)
1278            metadata: Additional metadata to associate with the span
1279            environment: The tracing environment
1280            release: Release identifier for the application
1281            version: Version identifier for the code or component
1282            level: Importance level of the span (info, warning, error)
1283            status_message: Optional status message for the span
1284        """
1285        super().__init__(
1286            otel_span=otel_span,
1287            as_type="span",
1288            langfuse_client=langfuse_client,
1289            input=input,
1290            output=output,
1291            metadata=metadata,
1292            environment=environment,
1293            release=release,
1294            version=version,
1295            level=level,
1296            status_message=status_message,
1297        )

Initialize a new LangfuseSpan.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the span (any JSON-serializable object)
  • output: Output data from the span (any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
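
In practice, LangfuseSpan instances come from the client's factory methods (as in the propagate_attributes examples above) rather than direct construction. A brief sketch, assuming start_as_current_observation defaults to the span type; update() accepts the same fields shown in the signatures here:

```python
from langfuse import get_client

langfuse = get_client()

# The client manages the underlying OTel span; direct __init__ calls are
# rarely needed in application code.
with langfuse.start_as_current_observation(name="load-data") as span:
    span.update(input={"source": "orders.csv"}, metadata={"rows": 128})
```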
class LangfuseGeneration(langfuse._client.span.LangfuseObservationWrapper):
1300class LangfuseGeneration(LangfuseObservationWrapper):
1301    """Specialized span implementation for AI model generations in Langfuse.
1302
1303    This class represents a generation span specifically designed for tracking
1304    AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized
1305    attributes for model details, token usage, and costs.
1306    """
1307
1308    def __init__(
1309        self,
1310        *,
1311        otel_span: otel_trace_api.Span,
1312        langfuse_client: "Langfuse",
1313        input: Optional[Any] = None,
1314        output: Optional[Any] = None,
1315        metadata: Optional[Any] = None,
1316        environment: Optional[str] = None,
1317        release: Optional[str] = None,
1318        version: Optional[str] = None,
1319        level: Optional[SpanLevel] = None,
1320        status_message: Optional[str] = None,
1321        completion_start_time: Optional[datetime] = None,
1322        model: Optional[str] = None,
1323        model_parameters: Optional[Dict[str, MapValue]] = None,
1324        usage_details: Optional[Dict[str, int]] = None,
1325        cost_details: Optional[Dict[str, float]] = None,
1326        prompt: Optional[PromptClient] = None,
1327    ):
1328        """Initialize a new LangfuseGeneration span.
1329
1330        Args:
1331            otel_span: The OpenTelemetry span to wrap
1332            langfuse_client: Reference to the parent Langfuse client
1333            input: Input data for the generation (e.g., prompts)
1334            output: Output from the generation (e.g., completions)
1335            metadata: Additional metadata to associate with the generation
1336            environment: The tracing environment
1337            release: Release identifier for the application
1338            version: Version identifier for the model or component
1339            level: Importance level of the generation (info, warning, error)
1340            status_message: Optional status message for the generation
1341            completion_start_time: When the model started generating the response
1342            model: Name/identifier of the AI model used (e.g., "gpt-4")
1343            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1344            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1345            cost_details: Cost information for the model call
1346            prompt: Associated prompt template from Langfuse prompt management
1347        """
1348        super().__init__(
1349            as_type="generation",
1350            otel_span=otel_span,
1351            langfuse_client=langfuse_client,
1352            input=input,
1353            output=output,
1354            metadata=metadata,
1355            environment=environment,
1356            release=release,
1357            version=version,
1358            level=level,
1359            status_message=status_message,
1360            completion_start_time=completion_start_time,
1361            model=model,
1362            model_parameters=model_parameters,
1363            usage_details=usage_details,
1364            cost_details=cost_details,
1365            prompt=prompt,
1366        )

Specialized span implementation for AI model generations in Langfuse.

This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.

LangfuseGeneration( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None)
1308    def __init__(
1309        self,
1310        *,
1311        otel_span: otel_trace_api.Span,
1312        langfuse_client: "Langfuse",
1313        input: Optional[Any] = None,
1314        output: Optional[Any] = None,
1315        metadata: Optional[Any] = None,
1316        environment: Optional[str] = None,
1317        release: Optional[str] = None,
1318        version: Optional[str] = None,
1319        level: Optional[SpanLevel] = None,
1320        status_message: Optional[str] = None,
1321        completion_start_time: Optional[datetime] = None,
1322        model: Optional[str] = None,
1323        model_parameters: Optional[Dict[str, MapValue]] = None,
1324        usage_details: Optional[Dict[str, int]] = None,
1325        cost_details: Optional[Dict[str, float]] = None,
1326        prompt: Optional[PromptClient] = None,
1327    ):
1328        """Initialize a new LangfuseGeneration span.
1329
1330        Args:
1331            otel_span: The OpenTelemetry span to wrap
1332            langfuse_client: Reference to the parent Langfuse client
1333            input: Input data for the generation (e.g., prompts)
1334            output: Output from the generation (e.g., completions)
1335            metadata: Additional metadata to associate with the generation
1336            environment: The tracing environment
1337            release: Release identifier for the application
1338            version: Version identifier for the model or component
1339            level: Importance level of the generation (info, warning, error)
1340            status_message: Optional status message for the generation
1341            completion_start_time: When the model started generating the response
1342            model: Name/identifier of the AI model used (e.g., "gpt-4")
1343            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1344            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1345            cost_details: Cost information for the model call
1346            prompt: Associated prompt template from Langfuse prompt management
1347        """
1348        super().__init__(
1349            as_type="generation",
1350            otel_span=otel_span,
1351            langfuse_client=langfuse_client,
1352            input=input,
1353            output=output,
1354            metadata=metadata,
1355            environment=environment,
1356            release=release,
1357            version=version,
1358            level=level,
1359            status_message=status_message,
1360            completion_start_time=completion_start_time,
1361            model=model,
1362            model_parameters=model_parameters,
1363            usage_details=usage_details,
1364            cost_details=cost_details,
1365            prompt=prompt,
1366        )

Initialize a new LangfuseGeneration span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the generation (e.g., prompts)
  • output: Output from the generation (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the model or component
  • level: Importance level of the generation (info, warning, error)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
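
A sketch of setting the generation-specific fields at runtime, using the start_generation factory seen in the propagate_attributes examples above; the model name, token counts, and cost are illustrative:

```python
from langfuse import get_client

langfuse = get_client()

with langfuse.start_generation(name="completion") as gen:
    # These update fields mirror the constructor parameters documented above.
    gen.update(
        model="gpt-4",
        model_parameters={"temperature": 0.2, "max_tokens": 256},
        usage_details={"prompt_tokens": 12, "completion_tokens": 48},
        cost_details={"total": 0.0021},
        output="The answer is 42.",
    )
```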
class LangfuseEvent(langfuse._client.span.LangfuseObservationWrapper):
1369class LangfuseEvent(LangfuseObservationWrapper):
1370    """Specialized span implementation for Langfuse Events."""
1371
1372    def __init__(
1373        self,
1374        *,
1375        otel_span: otel_trace_api.Span,
1376        langfuse_client: "Langfuse",
1377        input: Optional[Any] = None,
1378        output: Optional[Any] = None,
1379        metadata: Optional[Any] = None,
1380        environment: Optional[str] = None,
1381        release: Optional[str] = None,
1382        version: Optional[str] = None,
1383        level: Optional[SpanLevel] = None,
1384        status_message: Optional[str] = None,
1385    ):
1386        """Initialize a new LangfuseEvent span.
1387
1388        Args:
1389            otel_span: The OpenTelemetry span to wrap
1390            langfuse_client: Reference to the parent Langfuse client
1391            input: Input data for the event
1392            output: Output from the event
1393            metadata: Additional metadata to associate with the event
1394            environment: The tracing environment
1395            release: Release identifier for the application
1396            version: Version identifier for the model or component
1397            level: Importance level of the event (info, warning, error)
1398            status_message: Optional status message for the event
1399        """
1400        super().__init__(
1401            otel_span=otel_span,
1402            as_type="event",
1403            langfuse_client=langfuse_client,
1404            input=input,
1405            output=output,
1406            metadata=metadata,
1407            environment=environment,
1408            release=release,
1409            version=version,
1410            level=level,
1411            status_message=status_message,
1412        )
1413
1414    def update(
1415        self,
1416        *,
1417        name: Optional[str] = None,
1418        input: Optional[Any] = None,
1419        output: Optional[Any] = None,
1420        metadata: Optional[Any] = None,
1421        version: Optional[str] = None,
1422        level: Optional[SpanLevel] = None,
1423        status_message: Optional[str] = None,
1424        completion_start_time: Optional[datetime] = None,
1425        model: Optional[str] = None,
1426        model_parameters: Optional[Dict[str, MapValue]] = None,
1427        usage_details: Optional[Dict[str, int]] = None,
1428        cost_details: Optional[Dict[str, float]] = None,
1429        prompt: Optional[PromptClient] = None,
1430        **kwargs: Any,
1431    ) -> "LangfuseEvent":
1432        """Update is not allowed for LangfuseEvent because events cannot be updated.
1433
1434        This method logs a warning and returns self without making changes.
1435
1436        Returns:
1437            self: Returns the unchanged LangfuseEvent instance
1438        """
1439        langfuse_logger.warning(
1440            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1441        )
1442        return self

Specialized span implementation for Langfuse Events.

LangfuseEvent( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1372    def __init__(
1373        self,
1374        *,
1375        otel_span: otel_trace_api.Span,
1376        langfuse_client: "Langfuse",
1377        input: Optional[Any] = None,
1378        output: Optional[Any] = None,
1379        metadata: Optional[Any] = None,
1380        environment: Optional[str] = None,
1381        release: Optional[str] = None,
1382        version: Optional[str] = None,
1383        level: Optional[SpanLevel] = None,
1384        status_message: Optional[str] = None,
1385    ):
1386        """Initialize a new LangfuseEvent span.
1387
1388        Args:
1389            otel_span: The OpenTelemetry span to wrap
1390            langfuse_client: Reference to the parent Langfuse client
1391            input: Input data for the event
1392            output: Output from the event
1393            metadata: Additional metadata to associate with the event
1394            environment: The tracing environment
1395            release: Release identifier for the application
1396            version: Version identifier for the model or component
1397            level: Importance level of the event (info, warning, error)
1398            status_message: Optional status message for the event
1399        """
1400        super().__init__(
1401            otel_span=otel_span,
1402            as_type="event",
1403            langfuse_client=langfuse_client,
1404            input=input,
1405            output=output,
1406            metadata=metadata,
1407            environment=environment,
1408            release=release,
1409            version=version,
1410            level=level,
1411            status_message=status_message,
1412        )

Initialize a new LangfuseEvent span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the event
  • output: Output from the event
  • metadata: Additional metadata to associate with the event
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the model or component
  • level: Importance level of the event (info, warning, error)
  • status_message: Optional status message for the event
def update( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, **kwargs: Any) -> LangfuseEvent:
1414    def update(
1415        self,
1416        *,
1417        name: Optional[str] = None,
1418        input: Optional[Any] = None,
1419        output: Optional[Any] = None,
1420        metadata: Optional[Any] = None,
1421        version: Optional[str] = None,
1422        level: Optional[SpanLevel] = None,
1423        status_message: Optional[str] = None,
1424        completion_start_time: Optional[datetime] = None,
1425        model: Optional[str] = None,
1426        model_parameters: Optional[Dict[str, MapValue]] = None,
1427        usage_details: Optional[Dict[str, int]] = None,
1428        cost_details: Optional[Dict[str, float]] = None,
1429        prompt: Optional[PromptClient] = None,
1430        **kwargs: Any,
1431    ) -> "LangfuseEvent":
1432        """Update is not allowed for LangfuseEvent because events cannot be updated.
1433
1434        This method logs a warning and returns self without making changes.
1435
1436        Returns:
1437            self: Returns the unchanged LangfuseEvent instance
1438        """
1439        langfuse_logger.warning(
1440            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1441        )
1442        return self

Update is not allowed for LangfuseEvent because events cannot be updated.

This method logs a warning and returns self without making changes.

Returns:

self: Returns the unchanged LangfuseEvent instance

class LangfuseOtelSpanAttributes:
28class LangfuseOtelSpanAttributes:
29    # Langfuse-Trace attributes
30    TRACE_NAME = "langfuse.trace.name"
31    TRACE_USER_ID = "user.id"
32    TRACE_SESSION_ID = "session.id"
33    TRACE_TAGS = "langfuse.trace.tags"
34    TRACE_PUBLIC = "langfuse.trace.public"
35    TRACE_METADATA = "langfuse.trace.metadata"
36    TRACE_INPUT = "langfuse.trace.input"
37    TRACE_OUTPUT = "langfuse.trace.output"
38
39    # Langfuse-observation attributes
40    OBSERVATION_TYPE = "langfuse.observation.type"
41    OBSERVATION_METADATA = "langfuse.observation.metadata"
42    OBSERVATION_LEVEL = "langfuse.observation.level"
43    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
44    OBSERVATION_INPUT = "langfuse.observation.input"
45    OBSERVATION_OUTPUT = "langfuse.observation.output"
46
47    # Langfuse-observation of type Generation attributes
48    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
49    OBSERVATION_MODEL = "langfuse.observation.model.name"
50    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
51    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
52    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
53    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
54    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"
55
56    # General
57    ENVIRONMENT = "langfuse.environment"
58    RELEASE = "langfuse.release"
59    VERSION = "langfuse.version"
60
61    # Internal
62    AS_ROOT = "langfuse.internal.as_root"
63
64    # Experiments
65    EXPERIMENT_ID = "langfuse.experiment.id"
66    EXPERIMENT_NAME = "langfuse.experiment.name"
67    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
68    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
69    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
70    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
71    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
72    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
73    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
TRACE_NAME = 'langfuse.trace.name'
TRACE_USER_ID = 'user.id'
TRACE_SESSION_ID = 'session.id'
TRACE_TAGS = 'langfuse.trace.tags'
TRACE_PUBLIC = 'langfuse.trace.public'
TRACE_METADATA = 'langfuse.trace.metadata'
TRACE_INPUT = 'langfuse.trace.input'
TRACE_OUTPUT = 'langfuse.trace.output'
OBSERVATION_TYPE = 'langfuse.observation.type'
OBSERVATION_METADATA = 'langfuse.observation.metadata'
OBSERVATION_LEVEL = 'langfuse.observation.level'
OBSERVATION_STATUS_MESSAGE = 'langfuse.observation.status_message'
OBSERVATION_INPUT = 'langfuse.observation.input'
OBSERVATION_OUTPUT = 'langfuse.observation.output'
OBSERVATION_COMPLETION_START_TIME = 'langfuse.observation.completion_start_time'
OBSERVATION_MODEL = 'langfuse.observation.model.name'
OBSERVATION_MODEL_PARAMETERS = 'langfuse.observation.model.parameters'
OBSERVATION_USAGE_DETAILS = 'langfuse.observation.usage_details'
OBSERVATION_COST_DETAILS = 'langfuse.observation.cost_details'
OBSERVATION_PROMPT_NAME = 'langfuse.observation.prompt.name'
OBSERVATION_PROMPT_VERSION = 'langfuse.observation.prompt.version'
ENVIRONMENT = 'langfuse.environment'
RELEASE = 'langfuse.release'
VERSION = 'langfuse.version'
AS_ROOT = 'langfuse.internal.as_root'
EXPERIMENT_ID = 'langfuse.experiment.id'
EXPERIMENT_NAME = 'langfuse.experiment.name'
EXPERIMENT_DESCRIPTION = 'langfuse.experiment.description'
EXPERIMENT_METADATA = 'langfuse.experiment.metadata'
EXPERIMENT_DATASET_ID = 'langfuse.experiment.dataset.id'
EXPERIMENT_ITEM_ID = 'langfuse.experiment.item.id'
EXPERIMENT_ITEM_EXPECTED_OUTPUT = 'langfuse.experiment.item.expected_output'
EXPERIMENT_ITEM_METADATA = 'langfuse.experiment.item.metadata'
EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = 'langfuse.experiment.item.root_observation_id'
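
A hedged sketch of using these constants with the raw OpenTelemetry API, e.g. when instrumenting code outside the Langfuse wrappers. That the exporter maps attributes set this way is an assumption here, not shown by the listing:

```python
from opentelemetry import trace

from langfuse import LangfuseOtelSpanAttributes

tracer = trace.get_tracer(__name__)

with tracer.start_as_current_span("manual-step") as span:
    # Tag a raw OTel span with Langfuse semantics via the constants above.
    span.set_attribute(LangfuseOtelSpanAttributes.TRACE_SESSION_ID, "session_abc")
    span.set_attribute(LangfuseOtelSpanAttributes.OBSERVATION_LEVEL, "WARNING")
    span.set_attribute(LangfuseOtelSpanAttributes.ENVIRONMENT, "production")
```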
class LangfuseAgent(langfuse._client.span.LangfuseObservationWrapper):
1445class LangfuseAgent(LangfuseObservationWrapper):
1446    """Agent observation for reasoning blocks that act on tools using LLM guidance."""
1447
1448    def __init__(self, **kwargs: Any) -> None:
1449        """Initialize a new LangfuseAgent span."""
1450        kwargs["as_type"] = "agent"
1451        super().__init__(**kwargs)

Agent observation for reasoning blocks that act on tools using LLM guidance.

LangfuseAgent(**kwargs: Any)
1448    def __init__(self, **kwargs: Any) -> None:
1449        """Initialize a new LangfuseAgent span."""
1450        kwargs["as_type"] = "agent"
1451        super().__init__(**kwargs)

Initialize a new LangfuseAgent span.

class LangfuseTool(langfuse._client.span.LangfuseObservationWrapper):
1454class LangfuseTool(LangfuseObservationWrapper):
1455    """Tool observation representing external tool calls, e.g., calling a weather API."""
1456
1457    def __init__(self, **kwargs: Any) -> None:
1458        """Initialize a new LangfuseTool span."""
1459        kwargs["as_type"] = "tool"
1460        super().__init__(**kwargs)

Tool observation representing external tool calls, e.g., calling a weather API.

LangfuseTool(**kwargs: Any)
1457    def __init__(self, **kwargs: Any) -> None:
1458        """Initialize a new LangfuseTool span."""
1459        kwargs["as_type"] = "tool"
1460        super().__init__(**kwargs)

Initialize a new LangfuseTool span.

class LangfuseChain(langfuse._client.span.LangfuseObservationWrapper):
1463class LangfuseChain(LangfuseObservationWrapper):
1464    """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM."""
1465
1466    def __init__(self, **kwargs: Any) -> None:
1467        """Initialize a new LangfuseChain span."""
1468        kwargs["as_type"] = "chain"
1469        super().__init__(**kwargs)

Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.

LangfuseChain(**kwargs: Any)
1466    def __init__(self, **kwargs: Any) -> None:
1467        """Initialize a new LangfuseChain span."""
1468        kwargs["as_type"] = "chain"
1469        super().__init__(**kwargs)

Initialize a new LangfuseChain span.

class LangfuseEmbedding(langfuse._client.span.LangfuseObservationWrapper):
1481class LangfuseEmbedding(LangfuseObservationWrapper):
1482    """Embedding observation for LLM embedding calls, typically used before retrieval."""
1483
1484    def __init__(self, **kwargs: Any) -> None:
1485        """Initialize a new LangfuseEmbedding span."""
1486        kwargs["as_type"] = "embedding"
1487        super().__init__(**kwargs)

Embedding observation for LLM embedding calls, typically used before retrieval.

LangfuseEmbedding(**kwargs: Any)
1484    def __init__(self, **kwargs: Any) -> None:
1485        """Initialize a new LangfuseEmbedding span."""
1486        kwargs["as_type"] = "embedding"
1487        super().__init__(**kwargs)

Initialize a new LangfuseEmbedding span.

class LangfuseEvaluator(langfuse._client.span.LangfuseObservationWrapper):
1490class LangfuseEvaluator(LangfuseObservationWrapper):
1491    """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs."""
1492
1493    def __init__(self, **kwargs: Any) -> None:
1494        """Initialize a new LangfuseEvaluator span."""
1495        kwargs["as_type"] = "evaluator"
1496        super().__init__(**kwargs)

Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.

LangfuseEvaluator(**kwargs: Any)
1493    def __init__(self, **kwargs: Any) -> None:
1494        """Initialize a new LangfuseEvaluator span."""
1495        kwargs["as_type"] = "evaluator"
1496        super().__init__(**kwargs)

Initialize a new LangfuseEvaluator span.

class LangfuseRetriever(langfuse._client.span.LangfuseObservationWrapper):
1472class LangfuseRetriever(LangfuseObservationWrapper):
1473    """Retriever observation for data retrieval steps, e.g. vector store or database queries."""
1474
1475    def __init__(self, **kwargs: Any) -> None:
1476        """Initialize a new LangfuseRetriever span."""
1477        kwargs["as_type"] = "retriever"
1478        super().__init__(**kwargs)

Retriever observation for data retrieval steps, e.g. vector store or database queries.

LangfuseRetriever(**kwargs: Any)
1475    def __init__(self, **kwargs: Any) -> None:
1476        """Initialize a new LangfuseRetriever span."""
1477        kwargs["as_type"] = "retriever"
1478        super().__init__(**kwargs)

Initialize a new LangfuseRetriever span.

class LangfuseGuardrail(langfuse._client.span.LangfuseObservationWrapper):
1499class LangfuseGuardrail(LangfuseObservationWrapper):
1500    """Guardrail observation for protection e.g. against jailbreaks or offensive content."""
1501
1502    def __init__(self, **kwargs: Any) -> None:
1503        """Initialize a new LangfuseGuardrail span."""
1504        kwargs["as_type"] = "guardrail"
1505        super().__init__(**kwargs)

Guardrail observation for protection e.g. against jailbreaks or offensive content.

LangfuseGuardrail(**kwargs: Any)
1502    def __init__(self, **kwargs: Any) -> None:
1503        """Initialize a new LangfuseGuardrail span."""
1504        kwargs["as_type"] = "guardrail"
1505        super().__init__(**kwargs)

Initialize a new LangfuseGuardrail span.
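
These typed wrappers are usually created indirectly, for example via the observe decorator's as_type parameter documented earlier. A short sketch; the functions are illustrative:

```python
from langfuse import observe

@observe(as_type="retriever")
def fetch_documents(query: str) -> list:
    # Would query a vector store in a real application.
    return ["doc-1", "doc-2"]

@observe(as_type="guardrail")
def check_output(text: str) -> bool:
    return "forbidden" not in text.lower()
```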

class Evaluation:
 94class Evaluation:
 95    """Represents an evaluation result for an experiment item or an entire experiment run.
 96
 97    This class provides a strongly-typed way to create evaluation results in evaluator functions.
 98    Users must use keyword arguments when instantiating this class.
 99
100    Attributes:
101        name: Unique identifier for the evaluation metric. Should be descriptive
102            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
103            Used for aggregation and comparison across experiment runs.
104        value: The evaluation score or result. Can be:
105            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
106            - String: For categorical results like "positive", "negative", "neutral"
107            - Boolean: For binary assessments like "passes_safety_check"
108        comment: Optional human-readable explanation of the evaluation result.
109            Useful for providing context, explaining scoring rationale, or noting
110            special conditions. Displayed in Langfuse UI for interpretability.
111        metadata: Optional structured metadata about the evaluation process.
112            Can include confidence scores, intermediate calculations, model versions,
113            or any other relevant technical details.
114        data_type: Optional score data type. Required if value is not NUMERIC.
115            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
116        config_id: Optional Langfuse score config ID.
117
118    Examples:
119        Basic accuracy evaluation:
120        ```python
121        from langfuse import Evaluation
122
123        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
124            if not expected_output:
125                return Evaluation(name="accuracy", value=0, comment="No expected output")
126
127            is_correct = output.strip().lower() == expected_output.strip().lower()
128            return Evaluation(
129                name="accuracy",
130                value=1.0 if is_correct else 0.0,
131                comment="Correct answer" if is_correct else "Incorrect answer"
132            )
133        ```
134
135        Multi-metric evaluator:
136        ```python
137        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
138            return [
139                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
140                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
141                Evaluation(
142                    name="quality",
143                    value=0.85,
144                    comment="High quality response",
145                    metadata={"confidence": 0.92, "model": "gpt-4"}
146                )
147            ]
148        ```
149
150        Categorical evaluation:
151        ```python
152        def sentiment_evaluator(*, input, output, **kwargs):
153            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
154            return Evaluation(
155                name="sentiment",
156                value=sentiment,
157                comment=f"Response expresses {sentiment} sentiment",
158                data_type="CATEGORICAL"
159            )
160        ```
161
162        Failed evaluation with error handling:
163        ```python
164        def external_api_evaluator(*, input, output, **kwargs):
165            try:
166                score = external_api.evaluate(output)
167                return Evaluation(name="external_score", value=score)
168            except Exception as e:
169                return Evaluation(
170                    name="external_score",
171                    value=0,
172                    comment=f"API unavailable: {e}",
173                    metadata={"error": str(e), "retry_count": 3}
174                )
175        ```
176
177    Note:
178        All arguments must be passed as keywords. Positional arguments are not allowed
179        to ensure code clarity and prevent errors from argument reordering.
180    """
181
182    def __init__(
183        self,
184        *,
185        name: str,
186        value: Union[int, float, str, bool],
187        comment: Optional[str] = None,
188        metadata: Optional[Dict[str, Any]] = None,
189        data_type: Optional[ExperimentScoreType] = None,
190        config_id: Optional[str] = None,
191    ):
192        """Initialize an Evaluation with the provided data.
193
194        Args:
195            name: Unique identifier for the evaluation metric.
196            value: The evaluation score or result.
197            comment: Optional human-readable explanation of the result.
198            metadata: Optional structured metadata about the evaluation process.
199            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
200            config_id: Optional Langfuse score config ID.
201
202        Note:
203            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
204        """
205        self.name = name
206        self.value = value
207        self.comment = comment
208        self.metadata = metadata
209        self.data_type = data_type
210        self.config_id = config_id

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=0, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=0,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, config_id: Optional[str] = None)
182    def __init__(
183        self,
184        *,
185        name: str,
186        value: Union[int, float, str, bool],
187        comment: Optional[str] = None,
188        metadata: Optional[Dict[str, Any]] = None,
189        data_type: Optional[ExperimentScoreType] = None,
190        config_id: Optional[str] = None,
191    ):
192        """Initialize an Evaluation with the provided data.
193
194        Args:
195            name: Unique identifier for the evaluation metric.
196            value: The evaluation score or result.
197            comment: Optional human-readable explanation of the result.
198            metadata: Optional structured metadata about the evaluation process.
199            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
200            config_id: Optional Langfuse score config ID.
201
202        Note:
203            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
204        """
205        self.name = name
206        self.value = value
207        self.comment = comment
208        self.metadata = metadata
209        self.data_type = data_type
210        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
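
Since data_type is required whenever the value is not numeric, boolean scores must declare it explicitly. A sketch complementing the categorical example above; the evaluator logic is illustrative:

```python
from langfuse import Evaluation

def safety_evaluator(*, input, output, **kwargs):
    passed = "unsafe" not in output.lower()
    return Evaluation(
        name="passes_safety_check",
        value=passed,
        data_type="BOOLEAN",  # required: value is a bool, not numeric
        comment="Passed" if passed else "Flagged content",
    )
```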
class EvaluatorInputs:
 38class EvaluatorInputs:
 39    """Input data structure for evaluators, returned by mapper functions.
 40
 41    This class provides a strongly-typed container for transforming API response
 42    objects (traces, observations) into the standardized format expected
 43    by evaluator functions. It ensures consistent access to input, output, expected
 44    output, and metadata regardless of the source entity type.
 45
 46    Attributes:
 47        input: The input data that was provided to generate the output being evaluated.
 48            For traces, this might be the initial prompt or request. For observations,
 49            this could be the span's input. The exact meaning depends on your use case.
 50        output: The actual output that was produced and needs to be evaluated.
 51            For traces, this is typically the final response. For observations,
 52            this might be the generation output or span result.
 53        expected_output: Optional ground truth or expected result for comparison.
 54            Used by evaluators to assess correctness. May be None if no ground truth
 55            is available for the entity being evaluated.
 56        metadata: Optional structured metadata providing additional context for evaluation.
 57            Can include information about the entity, execution context, user attributes,
 58            or any other relevant data that evaluators might use.
 59
 60    Examples:
 61        Simple mapper for traces:
 62        ```python
 63        from langfuse import EvaluatorInputs
 64
 65        def trace_mapper(trace):
 66            return EvaluatorInputs(
 67                input=trace.input,
 68                output=trace.output,
 69                expected_output=None,  # No ground truth available
 70                metadata={"user_id": trace.user_id, "tags": trace.tags}
 71            )
 72        ```
 73
 74        Mapper for observations extracting specific fields:
 75        ```python
 76        def observation_mapper(observation):
 77            # Extract input/output from observation's data
 78            input_data = observation.input if hasattr(observation, 'input') else None
 79            output_data = observation.output if hasattr(observation, 'output') else None
 80
 81            return EvaluatorInputs(
 82                input=input_data,
 83                output=output_data,
 84                expected_output=None,
 85                metadata={
 86                    "observation_type": observation.type,
 87                    "model": observation.model,
 88                    "latency_ms": observation.end_time - observation.start_time
 89                }
 90            )
 91        ```
 92
 93
 94    Note:
 95        All arguments must be passed as keywords when instantiating this class.
 96    """
 97
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Input data structure for evaluators, returned by mapper functions.

This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.

Attributes:
  • input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
  • output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
  • expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
  • metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:

Simple mapper for traces:

from langfuse import EvaluatorInputs

def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,  # No ground truth available
        metadata={"user_id": trace.user_id, "tags": trace.tags}
    )

Mapper for observations extracting specific fields:

def observation_mapper(observation):
    # Extract input/output from observation's data
    input_data = observation.input if hasattr(observation, 'input') else None
    output_data = observation.output if hasattr(observation, 'output') else None

    return EvaluatorInputs(
        input=input_data,
        output=output_data,
        expected_output=None,
        metadata={
            "observation_type": observation.type,
            "model": observation.model,
            "latency_ms": observation.end_time - observation.start_time
        }
    )


Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorInputs( *, input: Any, output: Any, expected_output: Any = None, metadata: Optional[Dict[str, Any]] = None)
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Initialize EvaluatorInputs with the provided data.

Arguments:
  • input: The input data for evaluation.
  • output: The output data to be evaluated.
  • expected_output: Optional ground truth for comparison.
  • metadata: Optional additional context for evaluation.
Note:

All arguments must be provided as keywords.

input
output
expected_output
metadata
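
A small sketch of the keyword-only constructor and its defaults, following the signature shown above (values illustrative):

```python
from langfuse import EvaluatorInputs

# expected_output and metadata default to None when omitted.
inputs = EvaluatorInputs(input="What is 2 + 2?", output="4")
assert inputs.expected_output is None
assert inputs.metadata is None
```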
class MapperFunction(typing.Protocol):
123class MapperFunction(Protocol):
124    """Protocol defining the interface for mapper functions in batch evaluation.
125
126    Mapper functions transform API response objects (traces or observations)
127    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
128    allows you to define how to extract and structure evaluation data from different
129    entity types.
130
131    Mapper functions:
132    - Accept a single item parameter (a trace or an observation)
133    - Return an EvaluatorInputs instance with input, output, expected_output, and metadata
134    - May be either synchronous or asynchronous
135    - Should handle missing or malformed data gracefully
136    """
137
138    def __call__(
139        self,
140        *,
141        item: Union["TraceWithFullDetails", "ObservationsView"],
142        **kwargs: Dict[str, Any],
143    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
144        """Transform an API response object into evaluator inputs.
145
146        This method defines how to extract evaluation-relevant data from the raw
147        API response object. The implementation should map entity-specific fields
148        to the standardized input/output/expected_output/metadata structure.
149
150        Args:
151            item: The API response object to transform. The type depends on the scope:
152                - TraceWithFullDetails: When evaluating traces
153                - ObservationsView: When evaluating observations
154
155        Returns:
156            EvaluatorInputs: A structured container with:
157                - input: The input data that generated the output
158                - output: The output to be evaluated
159                - expected_output: Optional ground truth for comparison
160                - metadata: Optional additional context
161
162            Can return either a direct EvaluatorInputs instance or an awaitable
163            (for async mappers that need to fetch additional data).
164
165        Examples:
166            Basic trace mapper:
167            ```python
168            def map_trace(trace):
169                return EvaluatorInputs(
170                    input=trace.input,
171                    output=trace.output,
172                    expected_output=None,
173                    metadata={"trace_id": trace.id, "user": trace.user_id}
174                )
175            ```
176
177            Observation mapper with conditional logic:
178            ```python
179            def map_observation(observation):
180                # Extract fields based on observation type
181                if observation.type == "GENERATION":
182                    input_data = observation.input
183                    output_data = observation.output
184                else:
185                    # For other types, use different fields
186                    input_data = observation.metadata.get("input")
187                    output_data = observation.metadata.get("output")
188
189                return EvaluatorInputs(
190                    input=input_data,
191                    output=output_data,
192                    expected_output=None,
193                    metadata={"obs_id": observation.id, "type": observation.type}
194                )
195            ```
196
197            Async mapper (if additional processing needed):
198            ```python
199            async def map_trace_async(trace):
200                # Could do async processing here if needed
201                processed_output = await some_async_transformation(trace.output)
202
203                return EvaluatorInputs(
204                    input=trace.input,
205                    output=processed_output,
206                    expected_output=None,
207                    metadata={"trace_id": trace.id}
208                )
209            ```
210        """
211        ...

Protocol defining the interface for mapper functions in batch evaluation.

Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.

Mapper functions:

  • Accept a single item parameter (a trace or an observation)
  • Return an EvaluatorInputs instance with input, output, expected_output, and metadata
  • May be either synchronous or asynchronous
  • Should handle missing or malformed data gracefully
MapperFunction(*args, **kwargs)
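
Note that the protocol declares a keyword-only `item` parameter, so a strictly conforming mapper looks like the following sketch (assuming the batch runner invokes mappers as `mapper(item=...)`, which is what the signature implies):

```python
from langfuse import EvaluatorInputs

def strict_mapper(*, item, **kwargs):
    """Mapper conforming to the keyword-only MapperFunction signature."""
    return EvaluatorInputs(
        input=getattr(item, "input", None),
        output=getattr(item, "output", None),
        expected_output=None,
        metadata={"id": getattr(item, "id", None)},
    )
```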
class CompositeEvaluatorFunction(typing.Protocol):
214class CompositeEvaluatorFunction(Protocol):
215    """Protocol defining the interface for composite evaluator functions.
216
217    Composite evaluators create aggregate scores from multiple item-level evaluations.
218    This is commonly used to compute weighted averages, combined metrics, or other
219    composite assessments based on individual evaluation results.
220
221    Composite evaluators:
222    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
223      plus the list of evaluations
224    - Return either a single Evaluation, a list of Evaluations, or a dict
225    - Can be either synchronous or asynchronous
226    - Have access to both raw item data and evaluation results
227    """
228
229    def __call__(
230        self,
231        *,
232        input: Optional[Any] = None,
233        output: Optional[Any] = None,
234        expected_output: Optional[Any] = None,
235        metadata: Optional[Dict[str, Any]] = None,
236        evaluations: List[Evaluation],
237        **kwargs: Dict[str, Any],
238    ) -> Union[
239        Evaluation,
240        List[Evaluation],
241        Dict[str, Any],
242        Awaitable[Evaluation],
243        Awaitable[List[Evaluation]],
244        Awaitable[Dict[str, Any]],
245    ]:
246        r"""Create a composite evaluation from item-level evaluation results.
247
248        This method combines multiple evaluation scores into a single composite metric.
249        Common use cases include weighted averages, pass/fail decisions based on multiple
250        criteria, or custom scoring logic that considers multiple dimensions.
251
252        Args:
253            input: The input data that was provided to the system being evaluated.
254            output: The output generated by the system being evaluated.
255            expected_output: The expected/reference output for comparison (if available).
256            metadata: Additional metadata about the evaluation context.
257            evaluations: List of evaluation results from item-level evaluators.
258                Each evaluation contains name, value, comment, and metadata.
259
260        Returns:
261            Can return any of:
262            - Evaluation: A single composite evaluation result
263            - List[Evaluation]: Multiple composite evaluations
264            - Dict: A dict that will be converted to an Evaluation
265                - name: Identifier for the composite metric (e.g., "composite_score")
266                - value: The computed composite value
267                - comment: Optional explanation of how the score was computed
268                - metadata: Optional details about the composition logic
269
270            Can return either a direct Evaluation instance or an awaitable
271            (for async composite evaluators).
272
273        Examples:
274            Simple weighted average:
275            ```python
276            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
277                weights = {
278                    "accuracy": 0.5,
279                    "relevance": 0.3,
280                    "safety": 0.2
281                }
282
283                total_score = 0.0
284                total_weight = 0.0
285
286                for eval in evaluations:
287                    if eval.name in weights and isinstance(eval.value, (int, float)):
288                        total_score += eval.value * weights[eval.name]
289                        total_weight += weights[eval.name]
290
291                final_score = total_score / total_weight if total_weight > 0 else 0.0
292
293                return Evaluation(
294                    name="composite_score",
295                    value=final_score,
296                    comment=f"Weighted average of {len(evaluations)} metrics"
297                )
298            ```
299
300            Pass/fail composite based on thresholds:
301            ```python
302            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
303                # Must pass all criteria
304                thresholds = {
305                    "accuracy": 0.7,
306                    "safety": 0.9,
307                    "relevance": 0.6
308                }
309
310                passes = True
311                failing_metrics = []
312
313                for metric, threshold in thresholds.items():
314                    eval_result = next((e for e in evaluations if e.name == metric), None)
315                    if eval_result and isinstance(eval_result.value, (int, float)):
316                        if eval_result.value < threshold:
317                            passes = False
318                            failing_metrics.append(metric)
319
320                return Evaluation(
321                    name="passes_all_checks",
322                    value=passes,
323                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
324                    data_type="BOOLEAN"
325                )
326            ```
327
328            Async composite with external scoring:
329            ```python
330            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
331                # Use LLM to synthesize multiple evaluation results
332                eval_summary = "\n".join(
333                    f"- {e.name}: {e.value}" for e in evaluations
334                )
335
336                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
337                prompt += f"For the output: {output}\n"
338                prompt += "Provide an overall quality score from 0-1."
339
340                response = await openai.chat.completions.create(
341                    model="gpt-4",
342                    messages=[{"role": "user", "content": prompt}]
343                )
344
345                score = float(response.choices[0].message.content.strip())
346
347                return Evaluation(
348                    name="llm_composite_score",
349                    value=score,
350                    comment="LLM-synthesized composite score"
351                )
352            ```
353
354            Context-aware composite:
355            ```python
356            def context_composite(*, input, output, expected_output, metadata, evaluations):
357                # Adjust weighting based on metadata
358                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}
359
360                # If metadata indicates high importance, prioritize accuracy
361                if metadata and metadata.get('importance') == 'high':
362                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
363                else:
364                    weights = base_weights
365
366                total = sum(
367                    e.value * weights.get(e.name, 0)
368                    for e in evaluations
369                    if isinstance(e.value, (int, float))
370                )
371
372                return Evaluation(
373                    name="weighted_composite",
374                    value=total,
375                    comment="Context-aware weighted composite"
376                )
377            ```
378        """
379        ...

Protocol defining the interface for composite evaluator functions.

Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.

Composite evaluators:

  • Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
  • Return either a single Evaluation, a list of Evaluations, or a dict
  • Can be either synchronous or asynchronous
  • Have access to both raw item data and evaluation results
CompositeEvaluatorFunction(*args, **kwargs)
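
As a complement to the examples above, a minimal sketch of the dict-return form, which the docstring says is converted to an Evaluation (the keys follow the documented name/value/comment fields; the metric name is illustrative):

```python
def mean_composite(*, input=None, output=None, expected_output=None,
                   metadata=None, evaluations, **kwargs):
    # Average all numeric item-level scores into one composite metric.
    numeric = [e.value for e in evaluations if isinstance(e.value, (int, float))]
    return {
        "name": "mean_score",
        "value": sum(numeric) / len(numeric) if numeric else 0.0,
        "comment": f"Mean of {len(numeric)} numeric scores",
    }
```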
class EvaluatorStats:
382class EvaluatorStats:
383    """Statistics for a single evaluator's performance during batch evaluation.
384
385    This class tracks detailed metrics about how a specific evaluator performed
386    across all items in a batch evaluation run. It helps identify evaluator issues,
387    understand reliability, and optimize evaluation pipelines.
388
389    Attributes:
390        name: The name of the evaluator function (extracted from __name__).
391        total_runs: Total number of times the evaluator was invoked.
392        successful_runs: Number of times the evaluator completed successfully.
393        failed_runs: Number of times the evaluator raised an exception or failed.
394        total_scores_created: Total number of evaluation scores created by this evaluator.
395            Can be higher than successful_runs if the evaluator returns multiple scores.
396
397    Examples:
398        Accessing evaluator stats from batch evaluation result:
399        ```python
400        result = client.run_batched_evaluation(...)
401
402        for stats in result.evaluator_stats:
403            print(f"Evaluator: {stats.name}")
404            print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
405            print(f"  Scores created: {stats.total_scores_created}")
406
407            if stats.failed_runs > 0:
408                print(f"  ⚠️  Failed {stats.failed_runs} times")
409        ```
410
411        Identifying problematic evaluators:
412        ```python
413        result = client.run_batched_evaluation(...)
414
415        # Find evaluators with high failure rates
416        for stats in result.evaluator_stats:
417            failure_rate = stats.failed_runs / stats.total_runs if stats.total_runs else 0.0
418            if failure_rate > 0.1:  # More than 10% failures
419                print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
420                print(f"    Consider debugging or removing this evaluator")
421        ```
422
423    Note:
424        All arguments must be passed as keywords when instantiating this class.
425    """
426
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Statistics for a single evaluator's performance during batch evaluation.

This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.

Attributes:
  • name: The name of the evaluator function (extracted from __name__).
  • total_runs: Total number of times the evaluator was invoked.
  • successful_runs: Number of times the evaluator completed successfully.
  • failed_runs: Number of times the evaluator raised an exception or failed.
  • total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:

Accessing evaluator stats from batch evaluation result:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    print(f"Evaluator: {stats.name}")
    print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️  Failed {stats.failed_runs} times")

Identifying problematic evaluators:

result = client.run_batched_evaluation(...)

# Find evaluators with high failure rates
for stats in result.evaluator_stats:
    failure_rate = stats.failed_runs / stats.total_runs if stats.total_runs else 0.0
    if failure_rate > 0.1:  # More than 10% failures
        print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
        print(f"    Consider debugging or removing this evaluator")
Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorStats( *, name: str, total_runs: int = 0, successful_runs: int = 0, failed_runs: int = 0, total_scores_created: int = 0)
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Initialize EvaluatorStats with the provided metrics.

Arguments:
  • name: The evaluator function name.
  • total_runs: Total number of evaluator invocations.
  • successful_runs: Number of successful completions.
  • failed_runs: Number of failures.
  • total_scores_created: Total scores created by this evaluator.
Note:

All arguments must be provided as keywords.

name
total_runs
successful_runs
failed_runs
total_scores_created
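
A small sketch that aggregates these fields into a reliability report, guarding against evaluators that never ran:

```python
from typing import List

from langfuse import EvaluatorStats

def reliability_report(stats_list: List[EvaluatorStats]) -> None:
    for stats in stats_list:
        # Guard the division: total_runs defaults to 0.
        rate = stats.successful_runs / stats.total_runs if stats.total_runs else 0.0
        print(f"{stats.name}: {rate:.1%} success, "
              f"{stats.total_scores_created} scores, {stats.failed_runs} failures")
```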
class BatchEvaluationResumeToken:
455class BatchEvaluationResumeToken:
456    """Token for resuming a failed batch evaluation run.
457
458    This class encapsulates all the information needed to resume a batch evaluation
459    that was interrupted or failed partway through. It uses timestamp-based filtering
460    to avoid re-processing items that were already evaluated, even if the underlying
461    dataset changed between runs.
462
463    Attributes:
464        scope: The type of items being evaluated ("traces", "observations").
465        filter: The original JSON filter string used to query items.
466        last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item.
467            Used to construct a filter that only fetches items after this timestamp.
468        last_processed_id: The ID of the last successfully processed item, for reference.
469        items_processed: Count of items successfully processed before interruption.
470
471    Examples:
472        Resuming a failed batch evaluation:
473        ```python
474        # Initial run that may stop partway through
475        result = client.run_batched_evaluation(
476            scope="traces",
477            mapper=my_mapper,
478            evaluators=[evaluator1, evaluator2],
479            filter='{"tags": ["production"]}',
480            max_items=10000
481        )
482
483        # resume_token is set whenever the run did not complete
484        if not result.completed and result.resume_token:
485            print(f"Stopped after {result.resume_token.items_processed} items")
486
487            # Store resume token for later (e.g., in a file or database)
488            import json
489            with open("resume_token.json", "w") as f:
490                json.dump({
491                    "scope": result.resume_token.scope,
492                    "filter": result.resume_token.filter,
493                    "last_timestamp": result.resume_token.last_processed_timestamp,
494                    "last_id": result.resume_token.last_processed_id,
495                    "items_done": result.resume_token.items_processed
496                }, f)
497
498
499        # Later, resume from where it left off
500        with open("resume_token.json") as f:
501            token_data = json.load(f)
502
503        resume_token = BatchEvaluationResumeToken(
504            scope=token_data["scope"],
505            filter=token_data["filter"],
506            last_processed_timestamp=token_data["last_timestamp"],
507            last_processed_id=token_data["last_id"],
508            items_processed=token_data["items_done"]
509        )
510
511        # Resume the evaluation
512        result = client.run_batched_evaluation(
513            scope="traces",
514            mapper=my_mapper,
515            evaluators=[evaluator1, evaluator2],
516            resume_from=resume_token
517        )
518
519        print(f"Processed {result.total_items_processed} additional items")
520        ```
521
522        Handling partial completion:
523        ```python
524        result = client.run_batched_evaluation(...)
525
526        if not result.completed:
527            print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
528            print(f"Last item: {result.resume_token.last_processed_id}")
529            print(f"Resume from: {result.resume_token.last_processed_timestamp}")
530
531            # Optionally retry automatically
532            if result.resume_token:
533                print("Retrying...")
534                result = client.run_batched_evaluation(
535                    scope=result.resume_token.scope,
536                    mapper=my_mapper,
537                    evaluators=my_evaluators,
538                    resume_from=result.resume_token
539                )
540        ```
541
542    Note:
543        All arguments must be passed as keywords when instantiating this class.
544        The timestamp-based approach means that items created after the initial run
545        but before the timestamp will be skipped. This is intentional to avoid
546        duplicates and ensure consistent evaluation.
547    """
548
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Token for resuming a failed batch evaluation run.

This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.

Attributes:
  • scope: The type of items being evaluated ("traces", "observations").
  • filter: The original JSON filter string used to query items.
  • last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
  • last_processed_id: The ID of the last successfully processed item, for reference.
  • items_processed: Count of items successfully processed before interruption.
Examples:

Resuming a failed batch evaluation:

# Initial run that may stop partway through
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    filter='{"tags": ["production"]}',
    max_items=10000
)

# resume_token is set whenever the run did not complete
if not result.completed and result.resume_token:
    print(f"Stopped after {result.resume_token.items_processed} items")

    # Store resume token for later (e.g., in a file or database)
    import json
    with open("resume_token.json", "w") as f:
        json.dump({
            "scope": result.resume_token.scope,
            "filter": result.resume_token.filter,
            "last_timestamp": result.resume_token.last_processed_timestamp,
            "last_id": result.resume_token.last_processed_id,
            "items_done": result.resume_token.items_processed
        }, f)

# Later, resume from where it left off
with open("resume_token.json") as f:
    token_data = json.load(f)

resume_token = BatchEvaluationResumeToken(
    scope=token_data["scope"],
    filter=token_data["filter"],
    last_processed_timestamp=token_data["last_timestamp"],
    last_processed_id=token_data["last_id"],
    items_processed=token_data["items_done"]
)

# Resume the evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    resume_from=resume_token
)

print(f"Processed {result.total_items_processed} additional items")

Handling partial completion:

result = client.run_batched_evaluation(...)

if not result.completed:
    print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
    print(f"Last item: {result.resume_token.last_processed_id}")
    print(f"Resume from: {result.resume_token.last_processed_timestamp}")

    # Optionally retry automatically
    if result.resume_token:
        print("Retrying...")
        result = client.run_batched_evaluation(
            scope=result.resume_token.scope,
            mapper=my_mapper,
            evaluators=my_evaluators,
            resume_from=result.resume_token
        )
Note:

All arguments must be passed as keywords when instantiating this class. The timestamp-based approach means that items created after the initial run but before the timestamp will be skipped. This is intentional to avoid duplicates and ensure consistent evaluation.

BatchEvaluationResumeToken( *, scope: str, filter: Optional[str], last_processed_timestamp: str, last_processed_id: str, items_processed: int)
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Initialize BatchEvaluationResumeToken with the provided state.

Arguments:
  • scope: The scope type ("traces", "observations").
  • filter: The original JSON filter string.
  • last_processed_timestamp: ISO 8601 timestamp of last processed item.
  • last_processed_id: ID of last processed item.
  • items_processed: Count of items processed before interruption.
Note:

All arguments must be provided as keywords.

scope
filter
last_processed_timestamp
last_processed_id
items_processed
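
Because the attribute names match the keyword-only constructor parameters, a token can be round-tripped through JSON with `vars()`; a compact sketch:

```python
import json

from langfuse import BatchEvaluationResumeToken

def save_token(token: BatchEvaluationResumeToken, path: str) -> None:
    with open(path, "w") as f:
        json.dump(vars(token), f)  # attribute names mirror the __init__ kwargs

def load_token(path: str) -> BatchEvaluationResumeToken:
    with open(path) as f:
        return BatchEvaluationResumeToken(**json.load(f))
```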
class BatchEvaluationResult:
577class BatchEvaluationResult:
578    r"""Complete result structure for batch evaluation execution.
579
580    This class encapsulates comprehensive statistics and metadata about a batch
581    evaluation run, including counts, evaluator-specific metrics, timing information,
582    error details, and resume capability.
583
584    Attributes:
585        total_items_fetched: Total number of items fetched from the API.
586        total_items_processed: Number of items successfully evaluated.
587        total_items_failed: Number of items that failed during evaluation.
588        total_scores_created: Total scores created by all item-level evaluators.
589        total_composite_scores_created: Scores created by the composite evaluator.
590        total_evaluations_failed: Number of individual evaluator failures across all items.
591        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
592        resume_token: Token for resuming if evaluation was interrupted (None if completed).
593        completed: True if all items were processed, False if stopped early or failed.
594        duration_seconds: Total time taken to execute the batch evaluation.
595        failed_item_ids: List of IDs for items that failed evaluation.
596        error_summary: Dictionary mapping error types to occurrence counts.
597        has_more_items: True if max_items limit was reached but more items exist.
598        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
599
600    Examples:
601        Basic result inspection:
602        ```python
603        result = client.run_batched_evaluation(...)
604
605        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
606        print(f"Scores created: {result.total_scores_created}")
607        print(f"Duration: {result.duration_seconds:.2f}s")
608        print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
609        ```
610
611        Detailed analysis with evaluator stats:
612        ```python
613        result = client.run_batched_evaluation(...)
614
615        print(f"\n📊 Batch Evaluation Results")
616        print(f"{'='*50}")
617        print(f"Items processed: {result.total_items_processed}")
618        print(f"Items failed: {result.total_items_failed}")
619        print(f"Scores created: {result.total_scores_created}")
620
621        if result.total_composite_scores_created > 0:
622            print(f"Composite scores: {result.total_composite_scores_created}")
623
624        print(f"\n📈 Evaluator Performance:")
625        for stats in result.evaluator_stats:
626            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
627            print(f"\n  {stats.name}:")
628            print(f"    Success rate: {success_rate:.1%}")
629            print(f"    Scores created: {stats.total_scores_created}")
630            if stats.failed_runs > 0:
631                print(f"    ⚠️  Failures: {stats.failed_runs}")
632
633        if result.error_summary:
634            print(f"\n⚠️  Errors encountered:")
635            for error_type, count in result.error_summary.items():
636                print(f"    {error_type}: {count}")
637        ```
638
639        Handling incomplete runs:
640        ```python
641        result = client.run_batched_evaluation(...)
642
643        if not result.completed:
644            print("⚠️  Evaluation incomplete!")
645
646            if result.resume_token:
647                print(f"Processed {result.resume_token.items_processed} items before failure")
648                print(f"Use resume_from parameter to continue from:")
649                print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
650                print(f"  Last ID: {result.resume_token.last_processed_id}")
651
652        if result.has_more_items:
653            print(f"ℹ️  More items available beyond max_items limit")
654        ```
655
656        Performance monitoring:
657        ```python
658        result = client.run_batched_evaluation(...)
659
660        items_per_second = result.total_items_processed / result.duration_seconds
661        avg_scores_per_item = result.total_scores_created / result.total_items_processed
662
663        print(f"Performance metrics:")
664        print(f"  Throughput: {items_per_second:.2f} items/second")
665        print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
666        print(f"  Total duration: {result.duration_seconds:.2f}s")
667
668        if result.total_evaluations_failed > 0:
669            failure_rate = result.total_evaluations_failed / (
670                result.total_items_processed * len(result.evaluator_stats)
671            )
672            print(f"  Evaluation failure rate: {failure_rate:.1%}")
673        ```
674
675    Note:
676        All arguments must be passed as keywords when instantiating this class.
677    """
678
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations
732
733    def __str__(self) -> str:
734        """Return a formatted string representation of the batch evaluation results.
735
736        Returns:
737            A multi-line string with a summary of the evaluation results.
738        """
739        lines = []
740        lines.append("=" * 60)
741        lines.append("Batch Evaluation Results")
742        lines.append("=" * 60)
743
744        # Summary statistics
745        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
746        lines.append(f"Duration: {self.duration_seconds:.2f}s")
747        lines.append(f"\nItems fetched: {self.total_items_fetched}")
748        lines.append(f"Items processed: {self.total_items_processed}")
749
750        if self.total_items_failed > 0:
751            lines.append(f"Items failed: {self.total_items_failed}")
752
753        # Success rate
754        if self.total_items_fetched > 0:
755            success_rate = self.total_items_processed / self.total_items_fetched * 100
756            lines.append(f"Success rate: {success_rate:.1f}%")
757
758        # Scores created
759        lines.append(f"\nScores created: {self.total_scores_created}")
760        if self.total_composite_scores_created > 0:
761            lines.append(f"Composite scores: {self.total_composite_scores_created}")
762
763        total_scores = self.total_scores_created + self.total_composite_scores_created
764        lines.append(f"Total scores: {total_scores}")
765
766        # Evaluator statistics
767        if self.evaluator_stats:
768            lines.append("\nEvaluator Performance:")
769            for stats in self.evaluator_stats:
770                lines.append(f"  {stats.name}:")
771                if stats.total_runs > 0:
772                    success_rate = (
773                        stats.successful_runs / stats.total_runs * 100
774                        if stats.total_runs > 0
775                        else 0
776                    )
777                    lines.append(
778                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
779                        f"({success_rate:.1f}% success)"
780                    )
781                    lines.append(f"    Scores created: {stats.total_scores_created}")
782                    if stats.failed_runs > 0:
783                        lines.append(f"    Failed runs: {stats.failed_runs}")
784
785        # Performance metrics
786        if self.total_items_processed > 0 and self.duration_seconds > 0:
787            items_per_sec = self.total_items_processed / self.duration_seconds
788            lines.append("\nPerformance:")
789            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
790            if self.total_scores_created > 0:
791                avg_scores = self.total_scores_created / self.total_items_processed
792                lines.append(f"  Avg scores per item: {avg_scores:.2f}")
793
794        # Errors and warnings
795        if self.error_summary:
796            lines.append("\nErrors encountered:")
797            for error_type, count in self.error_summary.items():
798                lines.append(f"  {error_type}: {count}")
799
800        # Incomplete run information
801        if not self.completed:
802            lines.append("\nWarning: Evaluation incomplete")
803            if self.resume_token:
804                lines.append(
805                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
806                )
807                lines.append(f"  Items processed: {self.resume_token.items_processed}")
808                lines.append("  Use resume_from parameter to continue")
809
810        if self.has_more_items:
811            lines.append("\nNote: More items available beyond max_items limit")
812
813        lines.append("=" * 60)
814        return "\n".join(lines)

Complete result structure for batch evaluation execution.

This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.

Attributes:
  • total_items_fetched: Total number of items fetched from the API.
  • total_items_processed: Number of items successfully evaluated.
  • total_items_failed: Number of items that failed during evaluation.
  • total_scores_created: Total scores created by all item-level evaluators.
  • total_composite_scores_created: Scores created by the composite evaluator.
  • total_evaluations_failed: Number of individual evaluator failures across all items.
  • evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
  • resume_token: Token for resuming if evaluation was interrupted (None if completed).
  • completed: True if all items were processed, False if stopped early or failed.
  • duration_seconds: Total time taken to execute the batch evaluation.
  • failed_item_ids: List of IDs for items that failed evaluation.
  • error_summary: Dictionary mapping error types to occurrence counts.
  • has_more_items: True if max_items limit was reached but more items exist.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:

Basic result inspection:

result = client.run_batched_evaluation(...)

print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
print(f"Scores created: {result.total_scores_created}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")

Detailed analysis with evaluator stats:

result = client.run_batched_evaluation(...)

print(f"\n📊 Batch Evaluation Results")
print(f"{'='*50}")
print(f"Items processed: {result.total_items_processed}")
print(f"Items failed: {result.total_items_failed}")
print(f"Scores created: {result.total_scores_created}")

if result.total_composite_scores_created > 0:
    print(f"Composite scores: {result.total_composite_scores_created}")

print(f"\n📈 Evaluator Performance:")
for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
    print(f"\n  {stats.name}:")
    print(f"    Success rate: {success_rate:.1%}")
    print(f"    Scores created: {stats.total_scores_created}")
    if stats.failed_runs > 0:
        print(f"    ⚠️  Failures: {stats.failed_runs}")

if result.error_summary:
    print(f"\n⚠️  Errors encountered:")
    for error_type, count in result.error_summary.items():
        print(f"    {error_type}: {count}")

Handling incomplete runs:

result = client.run_batched_evaluation(...)

if not result.completed:
    print("⚠️  Evaluation incomplete!")

    if result.resume_token:
        print(f"Processed {result.resume_token.items_processed} items before failure")
        print(f"Use resume_from parameter to continue from:")
        print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
        print(f"  Last ID: {result.resume_token.last_processed_id}")

if result.has_more_items:
    print(f"ℹ️  More items available beyond max_items limit")

Performance monitoring:

result = client.run_batched_evaluation(...)

items_per_second = result.total_items_processed / result.duration_seconds
avg_scores_per_item = result.total_scores_created / result.total_items_processed

print(f"Performance metrics:")
print(f"  Throughput: {items_per_second:.2f} items/second")
print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
print(f"  Total duration: {result.duration_seconds:.2f}s")

if result.total_evaluations_failed > 0:
    failure_rate = result.total_evaluations_failed / (
        result.total_items_processed * len(result.evaluator_stats)
    )
    print(f"  Evaluation failure rate: {failure_rate:.1%}")
Note:

All arguments must be passed as keywords when instantiating this class.

BatchEvaluationResult( *, total_items_fetched: int, total_items_processed: int, total_items_failed: int, total_scores_created: int, total_composite_scores_created: int, total_evaluations_failed: int, evaluator_stats: List[EvaluatorStats], resume_token: Optional[BatchEvaluationResumeToken], completed: bool, duration_seconds: float, failed_item_ids: List[str], error_summary: Dict[str, int], has_more_items: bool, item_evaluations: Dict[str, List[Evaluation]])
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations

Initialize BatchEvaluationResult with comprehensive statistics.

Arguments:
  • total_items_fetched: Total items fetched from API.
  • total_items_processed: Items successfully evaluated.
  • total_items_failed: Items that failed evaluation.
  • total_scores_created: Scores from item-level evaluators.
  • total_composite_scores_created: Scores from composite evaluator.
  • total_evaluations_failed: Individual evaluator failures.
  • evaluator_stats: Per-evaluator statistics.
  • resume_token: Token for resuming (None if completed).
  • completed: Whether all items were processed.
  • duration_seconds: Total execution time.
  • failed_item_ids: IDs of failed items.
  • error_summary: Error types and counts.
  • has_more_items: Whether more items exist beyond max_items.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:

All arguments must be provided as keywords.

total_items_fetched
total_items_processed
total_items_failed
total_scores_created
total_composite_scores_created
total_evaluations_failed
evaluator_stats
resume_token
completed
duration_seconds
failed_item_ids
error_summary
has_more_items
item_evaluations
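
Since the class defines `__str__` (shown above), the whole report can be printed directly; a one-line sketch, assuming a configured Langfuse client:

```python
result = client.run_batched_evaluation(...)  # any batch evaluation run
print(result)  # renders the multi-line summary produced by __str__
```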
__version__ = '4.5.1'
def is_default_export_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
73def is_default_export_span(span: ReadableSpan) -> bool:
74    """Return whether a span should be exported by default."""
75    return (
76        is_langfuse_span(span) or is_genai_span(span) or is_known_llm_instrumentor(span)
77    )

Return whether a span should be exported by default.
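
These predicates are plain functions over OTel `ReadableSpan` objects, so they can back a custom filtering exporter. A minimal sketch (the wrapper class is hypothetical, not part of langfuse):

```python
from typing import Sequence

from opentelemetry.sdk.trace import ReadableSpan
from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult

from langfuse import is_default_export_span


class FilteringExporter(SpanExporter):
    """Hypothetical wrapper: forwards only spans that pass the default filter."""

    def __init__(self, inner: SpanExporter) -> None:
        self._inner = inner

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        kept = [s for s in spans if is_default_export_span(s)]
        return self._inner.export(kept) if kept else SpanExportResult.SUCCESS

    def shutdown(self) -> None:
        self._inner.shutdown()
```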

def is_langfuse_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
36def is_langfuse_span(span: ReadableSpan) -> bool:
37    """Return whether the span was created by the Langfuse SDK tracer."""
38    return (
39        span.instrumentation_scope is not None
40        and span.instrumentation_scope.name == LANGFUSE_TRACER_NAME
41    )

Return whether the span was created by the Langfuse SDK tracer.

def is_genai_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
44def is_genai_span(span: ReadableSpan) -> bool:
45    """Return whether the span has any ``gen_ai.*`` semantic convention attribute."""
46    if span.attributes is None:
47        return False
48
49    return any(
50        isinstance(key, str) and key.startswith("gen_ai")
51        for key in span.attributes.keys()
52    )

Return whether the span has any gen_ai.* semantic convention attribute.

def is_known_llm_instrumentor(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
60def is_known_llm_instrumentor(span: ReadableSpan) -> bool:
61    """Return whether the span comes from a known LLM instrumentation scope."""
62    if span.instrumentation_scope is None:
63        return False
64
65    scope_name = span.instrumentation_scope.name
66
67    return any(
68        _matches_scope_prefix(scope_name, prefix)
69        for prefix in KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES
70    )

Return whether the span comes from a known LLM instrumentation scope.

KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES = frozenset({'strands-agents', 'langsmith', 'vllm', 'agent_framework', 'litellm', 'langfuse-sdk', 'openinference', 'ai', 'opentelemetry.instrumentation.anthropic', 'haystack'})