# langfuse

Langfuse GitHub Banner

Langfuse Python SDK

MIT License CI test status PyPI Version GitHub Repo stars Discord YC W23

## Installation

Important

The SDK was rewritten in v3 and released in June 2025. Refer to the v3 migration guide for instructions on updating your code.

pip install langfuse

## Docs

Please see our docs for detailed information on this SDK.

 1""".. include:: ../README.md"""
 2
 3from langfuse.batch_evaluation import (
 4    BatchEvaluationResult,
 5    BatchEvaluationResumeToken,
 6    CompositeEvaluatorFunction,
 7    EvaluatorInputs,
 8    EvaluatorStats,
 9    MapperFunction,
10)
11from langfuse.experiment import Evaluation
12
13from ._client import client as _client_module
14from ._client.attributes import LangfuseOtelSpanAttributes
15from ._client.constants import ObservationTypeLiteral
16from ._client.get_client import get_client
17from ._client.observe import observe
18from ._client.propagation import propagate_attributes
19from ._client.span import (
20    LangfuseAgent,
21    LangfuseChain,
22    LangfuseEmbedding,
23    LangfuseEvaluator,
24    LangfuseEvent,
25    LangfuseGeneration,
26    LangfuseGuardrail,
27    LangfuseRetriever,
28    LangfuseSpan,
29    LangfuseTool,
30)
31from .span_filter import (
32    KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES,
33    is_default_export_span,
34    is_genai_span,
35    is_known_llm_instrumentor,
36    is_langfuse_span,
37)
38
39Langfuse = _client_module.Langfuse
40
41__all__ = [
42    "Langfuse",
43    "get_client",
44    "observe",
45    "propagate_attributes",
46    "ObservationTypeLiteral",
47    "LangfuseSpan",
48    "LangfuseGeneration",
49    "LangfuseEvent",
50    "LangfuseOtelSpanAttributes",
51    "LangfuseAgent",
52    "LangfuseTool",
53    "LangfuseChain",
54    "LangfuseEmbedding",
55    "LangfuseEvaluator",
56    "LangfuseRetriever",
57    "LangfuseGuardrail",
58    "Evaluation",
59    "EvaluatorInputs",
60    "MapperFunction",
61    "CompositeEvaluatorFunction",
62    "EvaluatorStats",
63    "BatchEvaluationResumeToken",
64    "BatchEvaluationResult",
65    "is_default_export_span",
66    "is_langfuse_span",
67    "is_genai_span",
68    "is_known_llm_instrumentor",
69    "KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES",
70    "experiment",
71    "api",
72]
class Langfuse:
    """Main client for Langfuse tracing and platform features.

    This class provides an interface for creating and managing traces, spans,
    and generations in Langfuse as well as interacting with the Langfuse API.

    The client features a thread-safe singleton pattern for each unique public API key,
    ensuring consistent trace context propagation across your application. It implements
    efficient batching of spans with configurable flush settings and includes background
    thread management for media uploads and score ingestion.

    Configuration is flexible through either direct parameters or environment variables,
    with graceful fallbacks and runtime configuration updates.

    Attributes:
        api: Synchronous API client for Langfuse backend communication
        async_api: Asynchronous API client for Langfuse backend communication
        _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components

    Parameters:
        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
        mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
        blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:
            ```python
            from langfuse.span_filter import is_default_export_span
            blocked = {"sqlite", "requests"}

            should_export_span = lambda span: (
                is_default_export_span(span)
                and (
                    span.instrumentation_scope is None
                    or span.instrumentation_scope.name not in blocked
                )
            )
            ```
        should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well.
        tracer_provider(Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.

    Example:
        ```python
        from langfuse import Langfuse

        # Initialize the client (reads from env vars if not provided)
        langfuse = Langfuse(
            public_key="your-public-key",
            secret_key="your-secret-key",
            base_url="https://cloud.langfuse.com",  # Optional, default shown
        )

        # Create a trace span
        with langfuse.start_as_current_observation(name="process-query") as span:
            # Your application code here

            # Create a nested generation span for an LLM call
            with span.start_as_current_generation(
                name="generate-response",
                model="gpt-4",
                input={"query": "Tell me about AI"},
                model_parameters={"temperature": 0.7, "max_tokens": 500}
            ) as generation:
                # Generate response here
                response = "AI is a field of computer science..."

                generation.update(
                    output=response,
                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
                    cost_details={"total_cost": 0.0023}
                )

                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
        ```
    """

    # Shared resource manager (tracer, processors, upload threads); stays None
    # when the client is disabled, e.g. because credentials are missing.
    _resources: Optional[LangfuseResourceManager] = None
    # Masking callback applied to trace payloads before export; None if unset.
    _mask: Optional[MaskFunction] = None
    # OTEL tracer used for span creation; a NoOpTracer when disabled.
    _otel_tracer: otel_trace_api.Tracer
 224    def __init__(
 225        self,
 226        *,
 227        public_key: Optional[str] = None,
 228        secret_key: Optional[str] = None,
 229        base_url: Optional[str] = None,
 230        host: Optional[str] = None,
 231        timeout: Optional[int] = None,
 232        httpx_client: Optional[httpx.Client] = None,
 233        debug: bool = False,
 234        tracing_enabled: Optional[bool] = True,
 235        flush_at: Optional[int] = None,
 236        flush_interval: Optional[float] = None,
 237        environment: Optional[str] = None,
 238        release: Optional[str] = None,
 239        media_upload_thread_count: Optional[int] = None,
 240        sample_rate: Optional[float] = None,
 241        mask: Optional[MaskFunction] = None,
 242        blocked_instrumentation_scopes: Optional[List[str]] = None,
 243        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
 244        additional_headers: Optional[Dict[str, str]] = None,
 245        tracer_provider: Optional[TracerProvider] = None,
 246    ):
 247        self._base_url = (
 248            base_url
 249            or os.environ.get(LANGFUSE_BASE_URL)
 250            or host
 251            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
 252        )
 253        self._environment = environment or cast(
 254            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
 255        )
 256        self._project_id: Optional[str] = None
 257        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
 258        if not 0.0 <= sample_rate <= 1.0:
 259            raise ValueError(
 260                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
 261            )
 262
 263        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
 264
 265        self._tracing_enabled = (
 266            tracing_enabled
 267            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
 268        )
 269        if not self._tracing_enabled:
 270            langfuse_logger.info(
 271                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
 272            )
 273
 274        debug = (
 275            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
 276        )
 277        if debug:
 278            logging.basicConfig(
 279                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 280            )
 281            langfuse_logger.setLevel(logging.DEBUG)
 282
 283        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
 284        if public_key is None:
 285            langfuse_logger.warning(
 286                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
 287                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
 288            )
 289            self._otel_tracer = otel_trace_api.NoOpTracer()
 290            return
 291
 292        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
 293        if secret_key is None:
 294            langfuse_logger.warning(
 295                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
 296                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
 297            )
 298            self._otel_tracer = otel_trace_api.NoOpTracer()
 299            return
 300
 301        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
 302            langfuse_logger.warning(
 303                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
 304            )
 305
 306        if blocked_instrumentation_scopes is not None:
 307            warnings.warn(
 308                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
 309                "Use `should_export_span` instead. Example: "
 310                "from langfuse.span_filter import is_default_export_span; "
 311                'blocked={"scope"}; should_export_span=lambda span: '
 312                "is_default_export_span(span) and (span.instrumentation_scope is None or "
 313                "span.instrumentation_scope.name not in blocked).",
 314                DeprecationWarning,
 315                stacklevel=2,
 316            )
 317
 318        # Initialize api and tracer if requirements are met
 319        self._resources = LangfuseResourceManager(
 320            public_key=public_key,
 321            secret_key=secret_key,
 322            base_url=self._base_url,
 323            timeout=timeout,
 324            environment=self._environment,
 325            release=release,
 326            flush_at=flush_at,
 327            flush_interval=flush_interval,
 328            httpx_client=httpx_client,
 329            media_upload_thread_count=media_upload_thread_count,
 330            sample_rate=sample_rate,
 331            mask=mask,
 332            tracing_enabled=self._tracing_enabled,
 333            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
 334            should_export_span=should_export_span,
 335            additional_headers=additional_headers,
 336            tracer_provider=tracer_provider,
 337        )
 338        self._mask = self._resources.mask
 339
 340        self._otel_tracer = (
 341            self._resources.tracer
 342            if self._tracing_enabled and self._resources.tracer is not None
 343            else otel_trace_api.NoOpTracer()
 344        )
 345        self.api = self._resources.api
 346        self.async_api = self._resources.async_api
 347
    # --- start_observation overloads ------------------------------------
    # One @overload per observation type so static type checkers can narrow
    # the return type from the `as_type` literal. "generation" and
    # "embedding" additionally accept the model/usage/cost/prompt fields;
    # the runtime implementation follows after the last overload.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseGeneration: ...

    # Default overload: omitting `as_type` yields a plain span.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseSpan: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseAgent: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["tool"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseTool: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseChain: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseRetriever: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseEvaluator: ...

    # "embedding" is generation-like: it accepts model/usage/cost fields.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseEmbedding: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseGuardrail: ...
 494
 495    def start_observation(
 496        self,
 497        *,
 498        trace_context: Optional[TraceContext] = None,
 499        name: str,
 500        as_type: ObservationTypeLiteralNoEvent = "span",
 501        input: Optional[Any] = None,
 502        output: Optional[Any] = None,
 503        metadata: Optional[Any] = None,
 504        version: Optional[str] = None,
 505        level: Optional[SpanLevel] = None,
 506        status_message: Optional[str] = None,
 507        completion_start_time: Optional[datetime] = None,
 508        model: Optional[str] = None,
 509        model_parameters: Optional[Dict[str, MapValue]] = None,
 510        usage_details: Optional[Dict[str, int]] = None,
 511        cost_details: Optional[Dict[str, float]] = None,
 512        prompt: Optional[PromptClient] = None,
 513    ) -> Union[
 514        LangfuseSpan,
 515        LangfuseGeneration,
 516        LangfuseAgent,
 517        LangfuseTool,
 518        LangfuseChain,
 519        LangfuseRetriever,
 520        LangfuseEvaluator,
 521        LangfuseEmbedding,
 522        LangfuseGuardrail,
 523    ]:
 524        """Create a new observation of the specified type.
 525
 526        This method creates a new observation but does not set it as the current span in the
 527        context. To create and use an observation within a context, use start_as_current_observation().
 528
 529        Args:
 530            trace_context: Optional context for connecting to an existing trace
 531            name: Name of the observation
 532            as_type: Type of observation to create (defaults to "span")
 533            input: Input data for the operation
 534            output: Output data from the operation
 535            metadata: Additional metadata to associate with the observation
 536            version: Version identifier for the code or component
 537            level: Importance level of the observation
 538            status_message: Optional status message for the observation
 539            completion_start_time: When the model started generating (for generation types)
 540            model: Name/identifier of the AI model used (for generation types)
 541            model_parameters: Parameters used for the model (for generation types)
 542            usage_details: Token usage information (for generation types)
 543            cost_details: Cost information (for generation types)
 544            prompt: Associated prompt template (for generation types)
 545
 546        Returns:
 547            An observation object of the appropriate type that must be ended with .end()
 548        """
 549        if trace_context:
 550            trace_id = trace_context.get("trace_id", None)
 551            parent_span_id = trace_context.get("parent_span_id", None)
 552
 553            if trace_id:
 554                remote_parent_span = self._create_remote_parent_span(
 555                    trace_id=trace_id, parent_span_id=parent_span_id
 556                )
 557
 558                with otel_trace_api.use_span(
 559                    cast(otel_trace_api.Span, remote_parent_span)
 560                ):
 561                    otel_span = self._otel_tracer.start_span(name=name)
 562                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
 563
 564                    return self._create_observation_from_otel_span(
 565                        otel_span=otel_span,
 566                        as_type=as_type,
 567                        input=input,
 568                        output=output,
 569                        metadata=metadata,
 570                        version=version,
 571                        level=level,
 572                        status_message=status_message,
 573                        completion_start_time=completion_start_time,
 574                        model=model,
 575                        model_parameters=model_parameters,
 576                        usage_details=usage_details,
 577                        cost_details=cost_details,
 578                        prompt=prompt,
 579                    )
 580
 581        otel_span = self._otel_tracer.start_span(name=name)
 582
 583        return self._create_observation_from_otel_span(
 584            otel_span=otel_span,
 585            as_type=as_type,
 586            input=input,
 587            output=output,
 588            metadata=metadata,
 589            version=version,
 590            level=level,
 591            status_message=status_message,
 592            completion_start_time=completion_start_time,
 593            model=model,
 594            model_parameters=model_parameters,
 595            usage_details=usage_details,
 596            cost_details=cost_details,
 597            prompt=prompt,
 598        )
 599
 600    def _create_observation_from_otel_span(
 601        self,
 602        *,
 603        otel_span: otel_trace_api.Span,
 604        as_type: ObservationTypeLiteralNoEvent,
 605        input: Optional[Any] = None,
 606        output: Optional[Any] = None,
 607        metadata: Optional[Any] = None,
 608        version: Optional[str] = None,
 609        level: Optional[SpanLevel] = None,
 610        status_message: Optional[str] = None,
 611        completion_start_time: Optional[datetime] = None,
 612        model: Optional[str] = None,
 613        model_parameters: Optional[Dict[str, MapValue]] = None,
 614        usage_details: Optional[Dict[str, int]] = None,
 615        cost_details: Optional[Dict[str, float]] = None,
 616        prompt: Optional[PromptClient] = None,
 617    ) -> Union[
 618        LangfuseSpan,
 619        LangfuseGeneration,
 620        LangfuseAgent,
 621        LangfuseTool,
 622        LangfuseChain,
 623        LangfuseRetriever,
 624        LangfuseEvaluator,
 625        LangfuseEmbedding,
 626        LangfuseGuardrail,
 627    ]:
 628        """Create the appropriate observation type from an OTEL span."""
 629        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 630            observation_class = self._get_span_class(as_type)
 631            # Type ignore to prevent overloads of internal _get_span_class function,
 632            # issue is that LangfuseEvent could be returned and that classes have diff. args
 633            return observation_class(  # type: ignore[return-value,call-arg]
 634                otel_span=otel_span,
 635                langfuse_client=self,
 636                environment=self._environment,
 637                input=input,
 638                output=output,
 639                metadata=metadata,
 640                version=version,
 641                level=level,
 642                status_message=status_message,
 643                completion_start_time=completion_start_time,
 644                model=model,
 645                model_parameters=model_parameters,
 646                usage_details=usage_details,
 647                cost_details=cost_details,
 648                prompt=prompt,
 649            )
 650        else:
 651            # For other types (e.g. span, guardrail), create appropriate class without generation properties
 652            observation_class = self._get_span_class(as_type)
 653            # Type ignore to prevent overloads of internal _get_span_class function,
 654            # issue is that LangfuseEvent could be returned and that classes have diff. args
 655            return observation_class(  # type: ignore[return-value,call-arg]
 656                otel_span=otel_span,
 657                langfuse_client=self,
 658                environment=self._environment,
 659                input=input,
 660                output=output,
 661                metadata=metadata,
 662                version=version,
 663                level=level,
 664                status_message=status_message,
 665            )
 666            # span._observation_type = as_type
 667            # span._otel_span.set_attribute("langfuse.observation.type", as_type)
 668            # return span
 669
    # --- start_as_current_observation overloads -------------------------
    # Mirrors the start_observation overload set, but each variant returns a
    # context manager that also installs the observation as the current span.
    # `end_on_exit` controls whether the observation is ended automatically
    # when the context manager exits. The implementation follows the last
    # overload.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGeneration]: ...

    # Default overload: omitting `as_type` yields a plain span.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseSpan]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseAgent]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["tool"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseTool]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseChain]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseRetriever]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEvaluator]: ...

    # "embedding" is generation-like: it accepts model/usage/cost fields.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEmbedding]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGuardrail]: ...
 825
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: ObservationTypeLiteralNoEvent = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> Union[
        _AgnosticContextManager[LangfuseGeneration],
        _AgnosticContextManager[LangfuseSpan],
        _AgnosticContextManager[LangfuseAgent],
        _AgnosticContextManager[LangfuseTool],
        _AgnosticContextManager[LangfuseChain],
        _AgnosticContextManager[LangfuseRetriever],
        _AgnosticContextManager[LangfuseEvaluator],
        _AgnosticContextManager[LangfuseEmbedding],
        _AgnosticContextManager[LangfuseGuardrail],
    ]:
        """Create a new observation and set it as the current span in a context manager.

        This method creates a new observation of the specified type and sets it as the
        current span within a context manager. Use this method with a 'with' statement to
        automatically handle the observation lifecycle within a code block.

        The created observation will be the child of the current span in the context.

        Args:
            trace_context: Optional context for connecting to an existing trace
            name: Name of the observation (e.g., function or operation name)
            as_type: Type of observation to create (defaults to "span")
            input: Input data for the operation (can be any JSON-serializable object)
            output: Output data from the operation (can be any JSON-serializable object)
            metadata: Additional metadata to associate with the observation
            version: Version identifier for the code or component
            level: Importance level of the observation (info, warning, error)
            status_message: Optional status message for the observation
            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.

            The following parameters are available when as_type is: "generation" or "embedding".
            completion_start_time: When the model started generating the response
            model: Name/identifier of the AI model used (e.g., "gpt-4")
            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
            cost_details: Cost information for the model call
            prompt: Associated prompt template from Langfuse prompt management

        Returns:
            A context manager that yields the appropriate observation type based on as_type

        Example:
            ```python
            # Create a span
            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
                # Do work
                result = process_data()
                span.update(output=result)

                # Create a child span automatically
                with span.start_as_current_observation(name="sub-operation") as child_span:
                    # Do sub-operation work
                    child_span.update(output="sub-result")

            # Create a tool observation
            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
                # Do tool work
                results = search_web(query)
                tool.update(output=results)

            # Create a generation observation
            with langfuse.start_as_current_observation(
                name="answer-generation",
                as_type="generation",
                model="gpt-4"
            ) as generation:
                # Generate answer
                response = llm.generate(...)
                generation.update(output=response)
            ```
        """
        # Generation-like types ("generation", "embedding") carry the extra
        # model/usage/cost keyword arguments through to the wrapper class.
        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
            if trace_context:
                trace_id = trace_context.get("trace_id", None)
                parent_span_id = trace_context.get("parent_span_id", None)

                # A provided trace_id means: attach to an existing (remote)
                # trace via a synthetic remote parent span.
                if trace_id:
                    remote_parent_span = self._create_remote_parent_span(
                        trace_id=trace_id, parent_span_id=parent_span_id
                    )

                    return cast(
                        Union[
                            _AgnosticContextManager[LangfuseGeneration],
                            _AgnosticContextManager[LangfuseEmbedding],
                        ],
                        self._create_span_with_parent_context(
                            as_type=as_type,
                            name=name,
                            remote_parent_span=remote_parent_span,
                            parent=None,
                            end_on_exit=end_on_exit,
                            input=input,
                            output=output,
                            metadata=metadata,
                            version=version,
                            level=level,
                            status_message=status_message,
                            completion_start_time=completion_start_time,
                            model=model,
                            model_parameters=model_parameters,
                            usage_details=usage_details,
                            cost_details=cost_details,
                            prompt=prompt,
                        ),
                    )

            # No usable trace context: start the observation as a child of the
            # currently active span (if any).
            return cast(
                Union[
                    _AgnosticContextManager[LangfuseGeneration],
                    _AgnosticContextManager[LangfuseEmbedding],
                ],
                self._start_as_current_otel_span_with_processed_media(
                    as_type=as_type,
                    name=name,
                    end_on_exit=end_on_exit,
                    input=input,
                    output=output,
                    metadata=metadata,
                    version=version,
                    level=level,
                    status_message=status_message,
                    completion_start_time=completion_start_time,
                    model=model,
                    model_parameters=model_parameters,
                    usage_details=usage_details,
                    cost_details=cost_details,
                    prompt=prompt,
                ),
            )

        # Span-like types (span, agent, tool, chain, retriever, evaluator,
        # guardrail) take only the common observation arguments; the
        # model-related kwargs are intentionally dropped here.
        if as_type in get_observation_types_list(ObservationTypeSpanLike):
            if trace_context:
                trace_id = trace_context.get("trace_id", None)
                parent_span_id = trace_context.get("parent_span_id", None)

                if trace_id:
                    remote_parent_span = self._create_remote_parent_span(
                        trace_id=trace_id, parent_span_id=parent_span_id
                    )

                    return cast(
                        Union[
                            _AgnosticContextManager[LangfuseSpan],
                            _AgnosticContextManager[LangfuseAgent],
                            _AgnosticContextManager[LangfuseTool],
                            _AgnosticContextManager[LangfuseChain],
                            _AgnosticContextManager[LangfuseRetriever],
                            _AgnosticContextManager[LangfuseEvaluator],
                            _AgnosticContextManager[LangfuseGuardrail],
                        ],
                        self._create_span_with_parent_context(
                            as_type=as_type,
                            name=name,
                            remote_parent_span=remote_parent_span,
                            parent=None,
                            end_on_exit=end_on_exit,
                            input=input,
                            output=output,
                            metadata=metadata,
                            version=version,
                            level=level,
                            status_message=status_message,
                        ),
                    )

            return cast(
                Union[
                    _AgnosticContextManager[LangfuseSpan],
                    _AgnosticContextManager[LangfuseAgent],
                    _AgnosticContextManager[LangfuseTool],
                    _AgnosticContextManager[LangfuseChain],
                    _AgnosticContextManager[LangfuseRetriever],
                    _AgnosticContextManager[LangfuseEvaluator],
                    _AgnosticContextManager[LangfuseGuardrail],
                ],
                self._start_as_current_otel_span_with_processed_media(
                    as_type=as_type,
                    name=name,
                    end_on_exit=end_on_exit,
                    input=input,
                    output=output,
                    metadata=metadata,
                    version=version,
                    level=level,
                    status_message=status_message,
                ),
            )

        # This should never be reached since all valid types are handled above
        langfuse_logger.warning(
            f"Unknown observation type: {as_type}, falling back to span"
        )
        return self._start_as_current_otel_span_with_processed_media(
            as_type="span",
            name=name,
            end_on_exit=end_on_exit,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
        )
1050
1051    def _get_span_class(
1052        self,
1053        as_type: ObservationTypeLiteral,
1054    ) -> Union[
1055        Type[LangfuseAgent],
1056        Type[LangfuseTool],
1057        Type[LangfuseChain],
1058        Type[LangfuseRetriever],
1059        Type[LangfuseEvaluator],
1060        Type[LangfuseEmbedding],
1061        Type[LangfuseGuardrail],
1062        Type[LangfuseGeneration],
1063        Type[LangfuseEvent],
1064        Type[LangfuseSpan],
1065    ]:
1066        """Get the appropriate span class based on as_type."""
1067        normalized_type = as_type.lower()
1068
1069        if normalized_type == "agent":
1070            return LangfuseAgent
1071        elif normalized_type == "tool":
1072            return LangfuseTool
1073        elif normalized_type == "chain":
1074            return LangfuseChain
1075        elif normalized_type == "retriever":
1076            return LangfuseRetriever
1077        elif normalized_type == "evaluator":
1078            return LangfuseEvaluator
1079        elif normalized_type == "embedding":
1080            return LangfuseEmbedding
1081        elif normalized_type == "guardrail":
1082            return LangfuseGuardrail
1083        elif normalized_type == "generation":
1084            return LangfuseGeneration
1085        elif normalized_type == "event":
1086            return LangfuseEvent
1087        elif normalized_type == "span":
1088            return LangfuseSpan
1089        else:
1090            return LangfuseSpan
1091
1092    @_agnosticcontextmanager
1093    def _create_span_with_parent_context(
1094        self,
1095        *,
1096        name: str,
1097        parent: Optional[otel_trace_api.Span] = None,
1098        remote_parent_span: Optional[otel_trace_api.Span] = None,
1099        as_type: ObservationTypeLiteralNoEvent,
1100        end_on_exit: Optional[bool] = None,
1101        input: Optional[Any] = None,
1102        output: Optional[Any] = None,
1103        metadata: Optional[Any] = None,
1104        version: Optional[str] = None,
1105        level: Optional[SpanLevel] = None,
1106        status_message: Optional[str] = None,
1107        completion_start_time: Optional[datetime] = None,
1108        model: Optional[str] = None,
1109        model_parameters: Optional[Dict[str, MapValue]] = None,
1110        usage_details: Optional[Dict[str, int]] = None,
1111        cost_details: Optional[Dict[str, float]] = None,
1112        prompt: Optional[PromptClient] = None,
1113    ) -> Any:
1114        parent_span = parent or cast(otel_trace_api.Span, remote_parent_span)
1115
1116        with otel_trace_api.use_span(parent_span):
1117            with self._start_as_current_otel_span_with_processed_media(
1118                name=name,
1119                as_type=as_type,
1120                end_on_exit=end_on_exit,
1121                input=input,
1122                output=output,
1123                metadata=metadata,
1124                version=version,
1125                level=level,
1126                status_message=status_message,
1127                completion_start_time=completion_start_time,
1128                model=model,
1129                model_parameters=model_parameters,
1130                usage_details=usage_details,
1131                cost_details=cost_details,
1132                prompt=prompt,
1133            ) as langfuse_span:
1134                if remote_parent_span is not None:
1135                    langfuse_span._otel_span.set_attribute(
1136                        LangfuseOtelSpanAttributes.AS_ROOT, True
1137                    )
1138
1139                yield langfuse_span
1140
1141    @_agnosticcontextmanager
1142    def _start_as_current_otel_span_with_processed_media(
1143        self,
1144        *,
1145        name: str,
1146        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
1147        end_on_exit: Optional[bool] = None,
1148        input: Optional[Any] = None,
1149        output: Optional[Any] = None,
1150        metadata: Optional[Any] = None,
1151        version: Optional[str] = None,
1152        level: Optional[SpanLevel] = None,
1153        status_message: Optional[str] = None,
1154        completion_start_time: Optional[datetime] = None,
1155        model: Optional[str] = None,
1156        model_parameters: Optional[Dict[str, MapValue]] = None,
1157        usage_details: Optional[Dict[str, int]] = None,
1158        cost_details: Optional[Dict[str, float]] = None,
1159        prompt: Optional[PromptClient] = None,
1160    ) -> Any:
1161        with self._otel_tracer.start_as_current_span(
1162            name=name,
1163            end_on_exit=end_on_exit if end_on_exit is not None else True,
1164        ) as otel_span:
1165            span_class = self._get_span_class(
1166                as_type or "generation"
1167            )  # default was "generation"
1168            common_args = {
1169                "otel_span": otel_span,
1170                "langfuse_client": self,
1171                "environment": self._environment,
1172                "input": input,
1173                "output": output,
1174                "metadata": metadata,
1175                "version": version,
1176                "level": level,
1177                "status_message": status_message,
1178            }
1179
1180            if span_class in [
1181                LangfuseGeneration,
1182                LangfuseEmbedding,
1183            ]:
1184                common_args.update(
1185                    {
1186                        "completion_start_time": completion_start_time,
1187                        "model": model,
1188                        "model_parameters": model_parameters,
1189                        "usage_details": usage_details,
1190                        "cost_details": cost_details,
1191                        "prompt": prompt,
1192                    }
1193                )
1194            # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed
1195
1196            yield span_class(**common_args)  # type: ignore[arg-type]
1197
1198    def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]:
1199        current_span = otel_trace_api.get_current_span()
1200
1201        if current_span is otel_trace_api.INVALID_SPAN:
1202            langfuse_logger.warning(
1203                "Context error: No active span in current context. Operations that depend on an active span will be skipped. "
1204                "Ensure spans are created with start_as_current_observation() or that you're operating within an active span context."
1205            )
1206            return None
1207
1208        return current_span
1209
1210    def update_current_generation(
1211        self,
1212        *,
1213        name: Optional[str] = None,
1214        input: Optional[Any] = None,
1215        output: Optional[Any] = None,
1216        metadata: Optional[Any] = None,
1217        version: Optional[str] = None,
1218        level: Optional[SpanLevel] = None,
1219        status_message: Optional[str] = None,
1220        completion_start_time: Optional[datetime] = None,
1221        model: Optional[str] = None,
1222        model_parameters: Optional[Dict[str, MapValue]] = None,
1223        usage_details: Optional[Dict[str, int]] = None,
1224        cost_details: Optional[Dict[str, float]] = None,
1225        prompt: Optional[PromptClient] = None,
1226    ) -> None:
1227        """Update the current active generation span with new information.
1228
1229        This method updates the current generation span in the active context with
1230        additional information. It's useful for adding output, usage stats, or other
1231        details that become available during or after model generation.
1232
1233        Args:
1234            name: The generation name
1235            input: Updated input data for the model
1236            output: Output from the model (e.g., completions)
1237            metadata: Additional metadata to associate with the generation
1238            version: Version identifier for the model or component
1239            level: Importance level of the generation (info, warning, error)
1240            status_message: Optional status message for the generation
1241            completion_start_time: When the model started generating the response
1242            model: Name/identifier of the AI model used (e.g., "gpt-4")
1243            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1244            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1245            cost_details: Cost information for the model call
1246            prompt: Associated prompt template from Langfuse prompt management
1247
1248        Example:
1249            ```python
1250            with langfuse.start_as_current_generation(name="answer-query") as generation:
1251                # Initial setup and API call
1252                response = llm.generate(...)
1253
1254                # Update with results that weren't available at creation time
1255                langfuse.update_current_generation(
1256                    output=response.text,
1257                    usage_details={
1258                        "prompt_tokens": response.usage.prompt_tokens,
1259                        "completion_tokens": response.usage.completion_tokens
1260                    }
1261                )
1262            ```
1263        """
1264        if not self._tracing_enabled:
1265            langfuse_logger.debug(
1266                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1267            )
1268            return
1269
1270        current_otel_span = self._get_current_otel_span()
1271
1272        if current_otel_span is not None:
1273            generation = LangfuseGeneration(
1274                otel_span=current_otel_span, langfuse_client=self
1275            )
1276
1277            if name:
1278                current_otel_span.update_name(name)
1279
1280            generation.update(
1281                input=input,
1282                output=output,
1283                metadata=metadata,
1284                version=version,
1285                level=level,
1286                status_message=status_message,
1287                completion_start_time=completion_start_time,
1288                model=model,
1289                model_parameters=model_parameters,
1290                usage_details=usage_details,
1291                cost_details=cost_details,
1292                prompt=prompt,
1293            )
1294
1295    def update_current_span(
1296        self,
1297        *,
1298        name: Optional[str] = None,
1299        input: Optional[Any] = None,
1300        output: Optional[Any] = None,
1301        metadata: Optional[Any] = None,
1302        version: Optional[str] = None,
1303        level: Optional[SpanLevel] = None,
1304        status_message: Optional[str] = None,
1305    ) -> None:
1306        """Update the current active span with new information.
1307
1308        This method updates the current span in the active context with
1309        additional information. It's useful for adding outputs or metadata
1310        that become available during execution.
1311
1312        Args:
1313            name: The span name
1314            input: Updated input data for the operation
1315            output: Output data from the operation
1316            metadata: Additional metadata to associate with the span
1317            version: Version identifier for the code or component
1318            level: Importance level of the span (info, warning, error)
1319            status_message: Optional status message for the span
1320
1321        Example:
1322            ```python
1323            with langfuse.start_as_current_observation(name="process-data") as span:
1324                # Initial processing
1325                result = process_first_part()
1326
1327                # Update with intermediate results
1328                langfuse.update_current_span(metadata={"intermediate_result": result})
1329
1330                # Continue processing
1331                final_result = process_second_part(result)
1332
1333                # Final update
1334                langfuse.update_current_span(output=final_result)
1335            ```
1336        """
1337        if not self._tracing_enabled:
1338            langfuse_logger.debug(
1339                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1340            )
1341            return
1342
1343        current_otel_span = self._get_current_otel_span()
1344
1345        if current_otel_span is not None:
1346            span = LangfuseSpan(
1347                otel_span=current_otel_span,
1348                langfuse_client=self,
1349                environment=self._environment,
1350            )
1351
1352            if name:
1353                current_otel_span.update_name(name)
1354
1355            span.update(
1356                input=input,
1357                output=output,
1358                metadata=metadata,
1359                version=version,
1360                level=level,
1361                status_message=status_message,
1362            )
1363
1364    @deprecated(
1365        "Trace-level input/output is deprecated. "
1366        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
1367        "This method will be removed in a future major version."
1368    )
1369    def set_current_trace_io(
1370        self,
1371        *,
1372        input: Optional[Any] = None,
1373        output: Optional[Any] = None,
1374    ) -> None:
1375        """Set trace-level input and output for the current span's trace.
1376
1377        .. deprecated::
1378            This is a legacy method for backward compatibility with Langfuse platform
1379            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
1380            evaluators). It will be removed in a future major version.
1381
1382            For setting other trace attributes (user_id, session_id, metadata, tags, version),
1383            use :meth:`propagate_attributes` instead.
1384
1385        Args:
1386            input: Input data to associate with the trace.
1387            output: Output data to associate with the trace.
1388        """
1389        if not self._tracing_enabled:
1390            langfuse_logger.debug(
1391                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
1392            )
1393            return
1394
1395        current_otel_span = self._get_current_otel_span()
1396
1397        if current_otel_span is not None and current_otel_span.is_recording():
1398            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1399                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1400            )
1401            # We need to preserve the class to keep the correct observation type
1402            span_class = self._get_span_class(existing_observation_type)
1403            span = span_class(
1404                otel_span=current_otel_span,
1405                langfuse_client=self,
1406                environment=self._environment,
1407            )
1408
1409            span.set_trace_io(
1410                input=input,
1411                output=output,
1412            )
1413
1414    def set_current_trace_as_public(self) -> None:
1415        """Make the current trace publicly accessible via its URL.
1416
1417        When a trace is published, anyone with the trace link can view the full trace
1418        without needing to be logged in to Langfuse. This action cannot be undone
1419        programmatically - once published, the entire trace becomes public.
1420
1421        This is a convenience method that publishes the trace from the currently
1422        active span context. Use this when you want to make a trace public from
1423        within a traced function without needing direct access to the span object.
1424        """
1425        if not self._tracing_enabled:
1426            langfuse_logger.debug(
1427                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
1428            )
1429            return
1430
1431        current_otel_span = self._get_current_otel_span()
1432
1433        if current_otel_span is not None and current_otel_span.is_recording():
1434            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1435                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1436            )
1437            # We need to preserve the class to keep the correct observation type
1438            span_class = self._get_span_class(existing_observation_type)
1439            span = span_class(
1440                otel_span=current_otel_span,
1441                langfuse_client=self,
1442                environment=self._environment,
1443            )
1444
1445            span.set_trace_as_public()
1446
1447    def create_event(
1448        self,
1449        *,
1450        trace_context: Optional[TraceContext] = None,
1451        name: str,
1452        input: Optional[Any] = None,
1453        output: Optional[Any] = None,
1454        metadata: Optional[Any] = None,
1455        version: Optional[str] = None,
1456        level: Optional[SpanLevel] = None,
1457        status_message: Optional[str] = None,
1458    ) -> LangfuseEvent:
1459        """Create a new Langfuse observation of type 'EVENT'.
1460
1461        The created Langfuse Event observation will be the child of the current span in the context.
1462
1463        Args:
1464            trace_context: Optional context for connecting to an existing trace
1465            name: Name of the span (e.g., function or operation name)
1466            input: Input data for the operation (can be any JSON-serializable object)
1467            output: Output data from the operation (can be any JSON-serializable object)
1468            metadata: Additional metadata to associate with the span
1469            version: Version identifier for the code or component
1470            level: Importance level of the span (info, warning, error)
1471            status_message: Optional status message for the span
1472
1473        Returns:
1474            The Langfuse Event object
1475
1476        Example:
1477            ```python
1478            event = langfuse.create_event(name="process-event")
1479            ```
1480        """
1481        timestamp = time_ns()
1482
1483        if trace_context:
1484            trace_id = trace_context.get("trace_id", None)
1485            parent_span_id = trace_context.get("parent_span_id", None)
1486
1487            if trace_id:
1488                remote_parent_span = self._create_remote_parent_span(
1489                    trace_id=trace_id, parent_span_id=parent_span_id
1490                )
1491
1492                with otel_trace_api.use_span(
1493                    cast(otel_trace_api.Span, remote_parent_span)
1494                ):
1495                    otel_span = self._otel_tracer.start_span(
1496                        name=name, start_time=timestamp
1497                    )
1498                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1499
1500                    return cast(
1501                        LangfuseEvent,
1502                        LangfuseEvent(
1503                            otel_span=otel_span,
1504                            langfuse_client=self,
1505                            environment=self._environment,
1506                            input=input,
1507                            output=output,
1508                            metadata=metadata,
1509                            version=version,
1510                            level=level,
1511                            status_message=status_message,
1512                        ).end(end_time=timestamp),
1513                    )
1514
1515        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1516
1517        return cast(
1518            LangfuseEvent,
1519            LangfuseEvent(
1520                otel_span=otel_span,
1521                langfuse_client=self,
1522                environment=self._environment,
1523                input=input,
1524                output=output,
1525                metadata=metadata,
1526                version=version,
1527                level=level,
1528                status_message=status_message,
1529            ).end(end_time=timestamp),
1530        )
1531
1532    def _create_remote_parent_span(
1533        self, *, trace_id: str, parent_span_id: Optional[str]
1534    ) -> Any:
1535        if not self._is_valid_trace_id(trace_id):
1536            langfuse_logger.warning(
1537                f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID."
1538            )
1539
1540        if parent_span_id and not self._is_valid_span_id(parent_span_id):
1541            langfuse_logger.warning(
1542                f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID."
1543            )
1544
1545        int_trace_id = int(trace_id, 16)
1546        int_parent_span_id = (
1547            int(parent_span_id, 16)
1548            if parent_span_id
1549            else RandomIdGenerator().generate_span_id()
1550        )
1551
1552        span_context = otel_trace_api.SpanContext(
1553            trace_id=int_trace_id,
1554            span_id=int_parent_span_id,
1555            trace_flags=otel_trace_api.TraceFlags(0x01),  # mark span as sampled
1556            is_remote=False,
1557        )
1558
1559        return otel_trace_api.NonRecordingSpan(span_context)
1560
1561    def _is_valid_trace_id(self, trace_id: str) -> bool:
1562        pattern = r"^[0-9a-f]{32}$"
1563
1564        return bool(re.match(pattern, trace_id))
1565
1566    def _is_valid_span_id(self, span_id: str) -> bool:
1567        pattern = r"^[0-9a-f]{16}$"
1568
1569        return bool(re.match(pattern, span_id))
1570
1571    def _create_observation_id(self, *, seed: Optional[str] = None) -> str:
1572        """Create a unique observation ID for use with Langfuse.
1573
1574        This method generates a unique observation ID (span ID in OpenTelemetry terms)
1575        for use with various Langfuse APIs. It can either generate a random ID or
1576        create a deterministic ID based on a seed string.
1577
1578        Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes.
1579        This method ensures the generated ID meets this requirement. If you need to
1580        correlate an external ID with a Langfuse observation ID, use the external ID as
1581        the seed to get a valid, deterministic observation ID.
1582
1583        Args:
1584            seed: Optional string to use as a seed for deterministic ID generation.
1585                 If provided, the same seed will always produce the same ID.
1586                 If not provided, a random ID will be generated.
1587
1588        Returns:
1589            A 16-character lowercase hexadecimal string representing the observation ID.
1590
1591        Example:
1592            ```python
1593            # Generate a random observation ID
1594            obs_id = langfuse.create_observation_id()
1595
1596            # Generate a deterministic ID based on a seed
1597            user_obs_id = langfuse.create_observation_id(seed="user-123-feedback")
1598
1599            # Correlate an external item ID with a Langfuse observation ID
1600            item_id = "item-789012"
1601            correlated_obs_id = langfuse.create_observation_id(seed=item_id)
1602
1603            # Use the ID with Langfuse APIs
1604            langfuse.create_score(
1605                name="relevance",
1606                value=0.95,
1607                trace_id=trace_id,
1608                observation_id=obs_id
1609            )
1610            ```
1611        """
1612        if not seed:
1613            span_id_int = RandomIdGenerator().generate_span_id()
1614
1615            return self._format_otel_span_id(span_id_int)
1616
1617        return sha256(seed.encode("utf-8")).digest()[:8].hex()
1618
1619    @staticmethod
1620    def create_trace_id(*, seed: Optional[str] = None) -> str:
1621        """Create a unique trace ID for use with Langfuse.
1622
1623        This method generates a unique trace ID for use with various Langfuse APIs.
1624        It can either generate a random ID or create a deterministic ID based on
1625        a seed string.
1626
1627        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1628        This method ensures the generated ID meets this requirement. If you need to
1629        correlate an external ID with a Langfuse trace ID, use the external ID as the
1630        seed to get a valid, deterministic Langfuse trace ID.
1631
1632        Args:
1633            seed: Optional string to use as a seed for deterministic ID generation.
1634                 If provided, the same seed will always produce the same ID.
1635                 If not provided, a random ID will be generated.
1636
1637        Returns:
1638            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1639
1640        Example:
1641            ```python
1642            # Generate a random trace ID
1643            trace_id = langfuse.create_trace_id()
1644
1645            # Generate a deterministic ID based on a seed
1646            session_trace_id = langfuse.create_trace_id(seed="session-456")
1647
1648            # Correlate an external ID with a Langfuse trace ID
1649            external_id = "external-system-123456"
1650            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1651
1652            # Use the ID with trace context
1653            with langfuse.start_as_current_observation(
1654                name="process-request",
1655                trace_context={"trace_id": trace_id}
1656            ) as span:
1657                # Operation will be part of the specific trace
1658                pass
1659            ```
1660        """
1661        if not seed:
1662            trace_id_int = RandomIdGenerator().generate_trace_id()
1663
1664            return Langfuse._format_otel_trace_id(trace_id_int)
1665
1666        return sha256(seed.encode("utf-8")).digest()[:16].hex()
1667
1668    def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str:
1669        span_context = otel_span.get_span_context()
1670
1671        return self._format_otel_trace_id(span_context.trace_id)
1672
1673    def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str:
1674        span_context = otel_span.get_span_context()
1675
1676        return self._format_otel_span_id(span_context.span_id)
1677
1678    @staticmethod
1679    def _format_otel_span_id(span_id_int: int) -> str:
1680        """Format an integer span ID to a 16-character lowercase hex string.
1681
1682        Internal method to convert an OpenTelemetry integer span ID to the standard
1683        W3C Trace Context format (16-character lowercase hex string).
1684
1685        Args:
1686            span_id_int: 64-bit integer representing a span ID
1687
1688        Returns:
1689            A 16-character lowercase hexadecimal string
1690        """
1691        return format(span_id_int, "016x")
1692
1693    @staticmethod
1694    def _format_otel_trace_id(trace_id_int: int) -> str:
1695        """Format an integer trace ID to a 32-character lowercase hex string.
1696
1697        Internal method to convert an OpenTelemetry integer trace ID to the standard
1698        W3C Trace Context format (32-character lowercase hex string).
1699
1700        Args:
1701            trace_id_int: 128-bit integer representing a trace ID
1702
1703        Returns:
1704            A 32-character lowercase hexadecimal string
1705        """
1706        return format(trace_id_int, "032x")
1707
    # Typing-only overload: NUMERIC/BOOLEAN scores take a float value.
    # The shared implementation follows the overload declarations.
    @overload
    def create_score(
        self,
        *,
        name: str,
        value: float,
        session_id: Optional[str] = None,
        dataset_run_id: Optional[str] = None,
        trace_id: Optional[str] = None,
        observation_id: Optional[str] = None,
        score_id: Optional[str] = None,
        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
        comment: Optional[str] = None,
        config_id: Optional[str] = None,
        metadata: Optional[Any] = None,
        timestamp: Optional[datetime] = None,
    ) -> None: ...
1725
    # Typing-only overload: CATEGORICAL scores take a string value and
    # default data_type to "CATEGORICAL".
    @overload
    def create_score(
        self,
        *,
        name: str,
        value: str,
        session_id: Optional[str] = None,
        dataset_run_id: Optional[str] = None,
        trace_id: Optional[str] = None,
        score_id: Optional[str] = None,
        observation_id: Optional[str] = None,
        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
        comment: Optional[str] = None,
        config_id: Optional[str] = None,
        metadata: Optional[Any] = None,
        timestamp: Optional[datetime] = None,
    ) -> None: ...
1743
1744    def create_score(
1745        self,
1746        *,
1747        name: str,
1748        value: Union[float, str],
1749        session_id: Optional[str] = None,
1750        dataset_run_id: Optional[str] = None,
1751        trace_id: Optional[str] = None,
1752        observation_id: Optional[str] = None,
1753        score_id: Optional[str] = None,
1754        data_type: Optional[ScoreDataType] = None,
1755        comment: Optional[str] = None,
1756        config_id: Optional[str] = None,
1757        metadata: Optional[Any] = None,
1758        timestamp: Optional[datetime] = None,
1759    ) -> None:
1760        """Create a score for a specific trace or observation.
1761
1762        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
1763        used to track quality metrics, user feedback, or automated evaluations.
1764
1765        Args:
1766            name: Name of the score (e.g., "relevance", "accuracy")
1767            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
1768            session_id: ID of the Langfuse session to associate the score with
1769            dataset_run_id: ID of the Langfuse dataset run to associate the score with
1770            trace_id: ID of the Langfuse trace to associate the score with
1771            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
1772            score_id: Optional custom ID for the score (auto-generated if not provided)
1773            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
1774            comment: Optional comment or explanation for the score
1775            config_id: Optional ID of a score config defined in Langfuse
1776            metadata: Optional metadata to be attached to the score
1777            timestamp: Optional timestamp for the score (defaults to current UTC time)
1778
1779        Example:
1780            ```python
1781            # Create a numeric score for accuracy
1782            langfuse.create_score(
1783                name="accuracy",
1784                value=0.92,
1785                trace_id="abcdef1234567890abcdef1234567890",
1786                data_type="NUMERIC",
1787                comment="High accuracy with minor irrelevant details"
1788            )
1789
1790            # Create a categorical score for sentiment
1791            langfuse.create_score(
1792                name="sentiment",
1793                value="positive",
1794                trace_id="abcdef1234567890abcdef1234567890",
1795                observation_id="abcdef1234567890",
1796                data_type="CATEGORICAL"
1797            )
1798            ```
1799        """
1800        if not self._tracing_enabled:
1801            return
1802
1803        score_id = score_id or self._create_observation_id()
1804
1805        try:
1806            new_body = ScoreBody(
1807                id=score_id,
1808                session_id=session_id,
1809                datasetRunId=dataset_run_id,
1810                traceId=trace_id,
1811                observationId=observation_id,
1812                name=name,
1813                value=value,
1814                dataType=data_type,  # type: ignore
1815                comment=comment,
1816                configId=config_id,
1817                environment=self._environment,
1818                metadata=metadata,
1819            )
1820
1821            event = {
1822                "id": self.create_trace_id(),
1823                "type": "score-create",
1824                "timestamp": timestamp or _get_timestamp(),
1825                "body": new_body,
1826            }
1827
1828            if self._resources is not None:
1829                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
1830                force_sample = (
1831                    not self._is_valid_trace_id(trace_id) if trace_id else True
1832                )
1833
1834                self._resources.add_score_task(
1835                    event,
1836                    force_sample=force_sample,
1837                )
1838
1839        except Exception as e:
1840            langfuse_logger.exception(
1841                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
1842            )
1843
1844    def _create_trace_tags_via_ingestion(
1845        self,
1846        *,
1847        trace_id: str,
1848        tags: List[str],
1849    ) -> None:
1850        """Private helper to enqueue trace tag updates via ingestion API events."""
1851        if not self._tracing_enabled:
1852            return
1853
1854        if len(tags) == 0:
1855            return
1856
1857        try:
1858            new_body = TraceBody(
1859                id=trace_id,
1860                tags=tags,
1861            )
1862
1863            event = {
1864                "id": self.create_trace_id(),
1865                "type": "trace-create",
1866                "timestamp": _get_timestamp(),
1867                "body": new_body,
1868            }
1869
1870            if self._resources is not None:
1871                self._resources.add_trace_task(event)
1872        except Exception as e:
1873            langfuse_logger.exception(
1874                f"Error updating trace tags: Failed to process trace update event for trace_id={trace_id}. Error: {e}"
1875            )
1876
    # Typing-only overload: NUMERIC/BOOLEAN scores take a float value.
    @overload
    def score_current_span(
        self,
        *,
        name: str,
        value: float,
        score_id: Optional[str] = None,
        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
        comment: Optional[str] = None,
        config_id: Optional[str] = None,
        metadata: Optional[Any] = None,
    ) -> None: ...
1889
    # Typing-only overload: CATEGORICAL scores take a string value and
    # default data_type to "CATEGORICAL".
    @overload
    def score_current_span(
        self,
        *,
        name: str,
        value: str,
        score_id: Optional[str] = None,
        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
        comment: Optional[str] = None,
        config_id: Optional[str] = None,
        metadata: Optional[Any] = None,
    ) -> None: ...
1902
1903    def score_current_span(
1904        self,
1905        *,
1906        name: str,
1907        value: Union[float, str],
1908        score_id: Optional[str] = None,
1909        data_type: Optional[ScoreDataType] = None,
1910        comment: Optional[str] = None,
1911        config_id: Optional[str] = None,
1912        metadata: Optional[Any] = None,
1913    ) -> None:
1914        """Create a score for the current active span.
1915
1916        This method scores the currently active span in the context. It's a convenient
1917        way to score the current operation without needing to know its trace and span IDs.
1918
1919        Args:
1920            name: Name of the score (e.g., "relevance", "accuracy")
1921            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
1922            score_id: Optional custom ID for the score (auto-generated if not provided)
1923            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
1924            comment: Optional comment or explanation for the score
1925            config_id: Optional ID of a score config defined in Langfuse
1926            metadata: Optional metadata to be attached to the score
1927
1928        Example:
1929            ```python
1930            with langfuse.start_as_current_generation(name="answer-query") as generation:
1931                # Generate answer
1932                response = generate_answer(...)
1933                generation.update(output=response)
1934
1935                # Score the generation
1936                langfuse.score_current_span(
1937                    name="relevance",
1938                    value=0.85,
1939                    data_type="NUMERIC",
1940                    comment="Mostly relevant but contains some tangential information",
1941                    metadata={"model": "gpt-4", "prompt_version": "v2"}
1942                )
1943            ```
1944        """
1945        current_span = self._get_current_otel_span()
1946
1947        if current_span is not None:
1948            trace_id = self._get_otel_trace_id(current_span)
1949            observation_id = self._get_otel_span_id(current_span)
1950
1951            langfuse_logger.info(
1952                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
1953            )
1954
1955            self.create_score(
1956                trace_id=trace_id,
1957                observation_id=observation_id,
1958                name=name,
1959                value=cast(str, value),
1960                score_id=score_id,
1961                data_type=cast(Literal["CATEGORICAL"], data_type),
1962                comment=comment,
1963                config_id=config_id,
1964                metadata=metadata,
1965            )
1966
    # Typing-only overload: NUMERIC/BOOLEAN scores take a float value.
    @overload
    def score_current_trace(
        self,
        *,
        name: str,
        value: float,
        score_id: Optional[str] = None,
        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
        comment: Optional[str] = None,
        config_id: Optional[str] = None,
        metadata: Optional[Any] = None,
    ) -> None: ...
1979
    # Typing-only overload: CATEGORICAL scores take a string value and
    # default data_type to "CATEGORICAL".
    @overload
    def score_current_trace(
        self,
        *,
        name: str,
        value: str,
        score_id: Optional[str] = None,
        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
        comment: Optional[str] = None,
        config_id: Optional[str] = None,
        metadata: Optional[Any] = None,
    ) -> None: ...
1992
1993    def score_current_trace(
1994        self,
1995        *,
1996        name: str,
1997        value: Union[float, str],
1998        score_id: Optional[str] = None,
1999        data_type: Optional[ScoreDataType] = None,
2000        comment: Optional[str] = None,
2001        config_id: Optional[str] = None,
2002        metadata: Optional[Any] = None,
2003    ) -> None:
2004        """Create a score for the current trace.
2005
2006        This method scores the trace of the currently active span. Unlike score_current_span,
2007        this method associates the score with the entire trace rather than a specific span.
2008        It's useful for scoring overall performance or quality of the entire operation.
2009
2010        Args:
2011            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2012            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2013            score_id: Optional custom ID for the score (auto-generated if not provided)
2014            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2015            comment: Optional comment or explanation for the score
2016            config_id: Optional ID of a score config defined in Langfuse
2017            metadata: Optional metadata to be attached to the score
2018
2019        Example:
2020            ```python
2021            with langfuse.start_as_current_observation(name="process-user-request") as span:
2022                # Process request
2023                result = process_complete_request()
2024                span.update(output=result)
2025
2026                # Score the overall trace
2027                langfuse.score_current_trace(
2028                    name="overall_quality",
2029                    value=0.95,
2030                    data_type="NUMERIC",
2031                    comment="High quality end-to-end response",
2032                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2033                )
2034            ```
2035        """
2036        current_span = self._get_current_otel_span()
2037
2038        if current_span is not None:
2039            trace_id = self._get_otel_trace_id(current_span)
2040
2041            langfuse_logger.info(
2042                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2043            )
2044
2045            self.create_score(
2046                trace_id=trace_id,
2047                name=name,
2048                value=cast(str, value),
2049                score_id=score_id,
2050                data_type=cast(Literal["CATEGORICAL"], data_type),
2051                comment=comment,
2052                config_id=config_id,
2053                metadata=metadata,
2054            )
2055
2056    def flush(self) -> None:
2057        """Force flush all pending spans and events to the Langfuse API.
2058
2059        This method manually flushes any pending spans, scores, and other events to the
2060        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2061        before proceeding, without waiting for the automatic flush interval.
2062
2063        Example:
2064            ```python
2065            # Record some spans and scores
2066            with langfuse.start_as_current_observation(name="operation") as span:
2067                # Do work...
2068                pass
2069
2070            # Ensure all data is sent to Langfuse before proceeding
2071            langfuse.flush()
2072
2073            # Continue with other work
2074            ```
2075        """
2076        if self._resources is not None:
2077            self._resources.flush()
2078
2079    def shutdown(self) -> None:
2080        """Shut down the Langfuse client and flush all pending data.
2081
2082        This method cleanly shuts down the Langfuse client, ensuring all pending data
2083        is flushed to the API and all background threads are properly terminated.
2084
2085        It's important to call this method when your application is shutting down to
2086        prevent data loss and resource leaks. For most applications, using the client
2087        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2088
2089        Example:
2090            ```python
2091            # Initialize Langfuse
2092            langfuse = Langfuse(public_key="...", secret_key="...")
2093
2094            # Use Langfuse throughout your application
2095            # ...
2096
2097            # When application is shutting down
2098            langfuse.shutdown()
2099            ```
2100        """
2101        if self._resources is not None:
2102            self._resources.shutdown()
2103
2104    def get_current_trace_id(self) -> Optional[str]:
2105        """Get the trace ID of the current active span.
2106
2107        This method retrieves the trace ID from the currently active span in the context.
2108        It can be used to get the trace ID for referencing in logs, external systems,
2109        or for creating related operations.
2110
2111        Returns:
2112            The current trace ID as a 32-character lowercase hexadecimal string,
2113            or None if there is no active span.
2114
2115        Example:
2116            ```python
2117            with langfuse.start_as_current_observation(name="process-request") as span:
2118                # Get the current trace ID for reference
2119                trace_id = langfuse.get_current_trace_id()
2120
2121                # Use it for external correlation
2122                log.info(f"Processing request with trace_id: {trace_id}")
2123
2124                # Or pass to another system
2125                external_system.process(data, trace_id=trace_id)
2126            ```
2127        """
2128        if not self._tracing_enabled:
2129            langfuse_logger.debug(
2130                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2131            )
2132            return None
2133
2134        current_otel_span = self._get_current_otel_span()
2135
2136        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None
2137
2138    def get_current_observation_id(self) -> Optional[str]:
2139        """Get the observation ID (span ID) of the current active span.
2140
2141        This method retrieves the observation ID from the currently active span in the context.
2142        It can be used to get the observation ID for referencing in logs, external systems,
2143        or for creating scores or other related operations.
2144
2145        Returns:
2146            The current observation ID as a 16-character lowercase hexadecimal string,
2147            or None if there is no active span.
2148
2149        Example:
2150            ```python
2151            with langfuse.start_as_current_observation(name="process-user-query") as span:
2152                # Get the current observation ID
2153                observation_id = langfuse.get_current_observation_id()
2154
2155                # Store it for later reference
2156                cache.set(f"query_{query_id}_observation", observation_id)
2157
2158                # Process the query...
2159            ```
2160        """
2161        if not self._tracing_enabled:
2162            langfuse_logger.debug(
2163                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2164            )
2165            return None
2166
2167        current_otel_span = self._get_current_otel_span()
2168
2169        return self._get_otel_span_id(current_otel_span) if current_otel_span else None
2170
2171    def _get_project_id(self) -> Optional[str]:
2172        """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for api keys."""
2173        if not self._project_id:
2174            proj = self.api.projects.get()
2175            if not proj.data or not proj.data[0].id:
2176                return None
2177
2178            self._project_id = proj.data[0].id
2179
2180        return self._project_id
2181
2182    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2183        """Get the URL to view a trace in the Langfuse UI.
2184
2185        This method generates a URL that links directly to a trace in the Langfuse UI.
2186        It's useful for providing links in logs, notifications, or debugging tools.
2187
2188        Args:
2189            trace_id: Optional trace ID to generate a URL for. If not provided,
2190                     the trace ID of the current active span will be used.
2191
2192        Returns:
2193            A URL string pointing to the trace in the Langfuse UI,
2194            or None if the project ID couldn't be retrieved or no trace ID is available.
2195
2196        Example:
2197            ```python
2198            # Get URL for the current trace
2199            with langfuse.start_as_current_observation(name="process-request") as span:
2200                trace_url = langfuse.get_trace_url()
2201                log.info(f"Processing trace: {trace_url}")
2202
2203            # Get URL for a specific trace
2204            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2205            send_notification(f"Review needed for trace: {specific_trace_url}")
2206            ```
2207        """
2208        final_trace_id = trace_id or self.get_current_trace_id()
2209        if not final_trace_id:
2210            return None
2211
2212        project_id = self._get_project_id()
2213
2214        return (
2215            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2216            if project_id and final_trace_id
2217            else None
2218        )
2219
2220    def get_dataset(
2221        self,
2222        name: str,
2223        *,
2224        fetch_items_page_size: Optional[int] = 50,
2225        version: Optional[datetime] = None,
2226    ) -> "DatasetClient":
2227        """Fetch a dataset by its name.
2228
2229        Args:
2230            name (str): The name of the dataset to fetch.
2231            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2232            version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC).
2233                If provided, returns the state of items at the specified UTC timestamp.
2234                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
2235
2236        Returns:
2237            DatasetClient: The dataset with the given name.
2238        """
2239        try:
2240            langfuse_logger.debug(f"Getting datasets {name}")
2241            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2242
2243            dataset_items = []
2244            page = 1
2245
2246            while True:
2247                new_items = self.api.dataset_items.list(
2248                    dataset_name=self._url_encode(name, is_url_param=True),
2249                    page=page,
2250                    limit=fetch_items_page_size,
2251                    version=version,
2252                )
2253                dataset_items.extend(new_items.data)
2254
2255                if new_items.meta.total_pages <= page:
2256                    break
2257
2258                page += 1
2259
2260            return DatasetClient(
2261                dataset=dataset,
2262                items=dataset_items,
2263                version=version,
2264                langfuse_client=self,
2265            )
2266
2267        except Error as e:
2268            handle_fern_exception(e)
2269            raise e
2270
2271    def get_dataset_run(
2272        self, *, dataset_name: str, run_name: str
2273    ) -> DatasetRunWithItems:
2274        """Fetch a dataset run by dataset name and run name.
2275
2276        Args:
2277            dataset_name (str): The name of the dataset.
2278            run_name (str): The name of the run.
2279
2280        Returns:
2281            DatasetRunWithItems: The dataset run with its items.
2282        """
2283        try:
2284            return cast(
2285                DatasetRunWithItems,
2286                self.api.datasets.get_run(
2287                    dataset_name=self._url_encode(dataset_name),
2288                    run_name=self._url_encode(run_name),
2289                    request_options=None,
2290                ),
2291            )
2292        except Error as e:
2293            handle_fern_exception(e)
2294            raise e
2295
2296    def get_dataset_runs(
2297        self,
2298        *,
2299        dataset_name: str,
2300        page: Optional[int] = None,
2301        limit: Optional[int] = None,
2302    ) -> PaginatedDatasetRuns:
2303        """Fetch all runs for a dataset.
2304
2305        Args:
2306            dataset_name (str): The name of the dataset.
2307            page (Optional[int]): Page number, starts at 1.
2308            limit (Optional[int]): Limit of items per page.
2309
2310        Returns:
2311            PaginatedDatasetRuns: Paginated list of dataset runs.
2312        """
2313        try:
2314            return cast(
2315                PaginatedDatasetRuns,
2316                self.api.datasets.get_runs(
2317                    dataset_name=self._url_encode(dataset_name),
2318                    page=page,
2319                    limit=limit,
2320                    request_options=None,
2321                ),
2322            )
2323        except Error as e:
2324            handle_fern_exception(e)
2325            raise e
2326
2327    def delete_dataset_run(
2328        self, *, dataset_name: str, run_name: str
2329    ) -> DeleteDatasetRunResponse:
2330        """Delete a dataset run and all its run items. This action is irreversible.
2331
2332        Args:
2333            dataset_name (str): The name of the dataset.
2334            run_name (str): The name of the run.
2335
2336        Returns:
2337            DeleteDatasetRunResponse: Confirmation of deletion.
2338        """
2339        try:
2340            return cast(
2341                DeleteDatasetRunResponse,
2342                self.api.datasets.delete_run(
2343                    dataset_name=self._url_encode(dataset_name),
2344                    run_name=self._url_encode(run_name),
2345                    request_options=None,
2346                ),
2347            )
2348        except Error as e:
2349            handle_fern_exception(e)
2350            raise e
2351
2352    def run_experiment(
2353        self,
2354        *,
2355        name: str,
2356        run_name: Optional[str] = None,
2357        description: Optional[str] = None,
2358        data: ExperimentData,
2359        task: TaskFunction,
2360        evaluators: List[EvaluatorFunction] = [],
2361        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2362        run_evaluators: List[RunEvaluatorFunction] = [],
2363        max_concurrency: int = 50,
2364        metadata: Optional[Dict[str, str]] = None,
2365        _dataset_version: Optional[datetime] = None,
2366    ) -> ExperimentResult:
2367        """Run an experiment on a dataset with automatic tracing and evaluation.
2368
2369        This method executes a task function on each item in the provided dataset,
2370        automatically traces all executions with Langfuse for observability, runs
2371        item-level and run-level evaluators on the outputs, and returns comprehensive
2372        results with evaluation metrics.
2373
2374        The experiment system provides:
2375        - Automatic tracing of all task executions
2376        - Concurrent processing with configurable limits
2377        - Comprehensive error handling that isolates failures
2378        - Integration with Langfuse datasets for experiment tracking
2379        - Flexible evaluation framework supporting both sync and async evaluators
2380
2381        Args:
2382            name: Human-readable name for the experiment. Used for identification
2383                in the Langfuse UI.
2384            run_name: Optional exact name for the experiment run. If provided, this will be
2385                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2386                If not provided, this will default to the experiment name appended with an ISO timestamp.
2387            description: Optional description explaining the experiment's purpose,
2388                methodology, or expected outcomes.
2389            data: Array of data items to process. Can be either:
2390                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2391                - List of Langfuse DatasetItem objects from dataset.items
2392            task: Function that processes each data item and returns output.
2393                Must accept 'item' as keyword argument and can return sync or async results.
2394                The task function signature should be: task(*, item, **kwargs) -> Any
2395            evaluators: List of functions to evaluate each item's output individually.
2396                Each evaluator receives input, output, expected_output, and metadata.
2397                Can return single Evaluation dict or list of Evaluation dicts.
2398            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2399                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2400                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2401                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2402            run_evaluators: List of functions to evaluate the entire experiment run.
2403                Each run evaluator receives all item_results and can compute aggregate metrics.
2404                Useful for calculating averages, distributions, or cross-item comparisons.
2405            max_concurrency: Maximum number of concurrent task executions (default: 50).
2406                Controls the number of items processed simultaneously. Adjust based on
2407                API rate limits and system resources.
2408            metadata: Optional metadata dictionary to attach to all experiment traces.
2409                This metadata will be included in every trace created during the experiment.
2410                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2411
2412        Returns:
2413            ExperimentResult containing:
2414            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2415            - item_results: List of results for each processed item with outputs and evaluations
2416            - run_evaluations: List of aggregate evaluation results for the entire run
2417            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2418            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2419
2420        Raises:
2421            ValueError: If required parameters are missing or invalid
2422            Exception: If experiment setup fails (individual item failures are handled gracefully)
2423
2424        Examples:
2425            Basic experiment with local data:
2426            ```python
2427            def summarize_text(*, item, **kwargs):
2428                return f"Summary: {item['input'][:50]}..."
2429
2430            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2431                return {
2432                    "name": "output_length",
2433                    "value": len(output),
2434                    "comment": f"Output contains {len(output)} characters"
2435                }
2436
2437            result = langfuse.run_experiment(
2438                name="Text Summarization Test",
2439                description="Evaluate summarization quality and length",
2440                data=[
2441                    {"input": "Long article text...", "expected_output": "Expected summary"},
2442                    {"input": "Another article...", "expected_output": "Another summary"}
2443                ],
2444                task=summarize_text,
2445                evaluators=[length_evaluator]
2446            )
2447
2448            print(f"Processed {len(result.item_results)} items")
2449            for item_result in result.item_results:
2450                print(f"Input: {item_result.item['input']}")
2451                print(f"Output: {item_result.output}")
2452                print(f"Evaluations: {item_result.evaluations}")
2453            ```
2454
2455            Advanced experiment with async task and multiple evaluators:
2456            ```python
2457            async def llm_task(*, item, **kwargs):
2458                # Simulate async LLM call
2459                response = await openai_client.chat.completions.create(
2460                    model="gpt-4",
2461                    messages=[{"role": "user", "content": item["input"]}]
2462                )
2463                return response.choices[0].message.content
2464
2465            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2466                if expected_output and expected_output.lower() in output.lower():
2467                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2468                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2469
2470            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2471                # Simulate toxicity check
2472                toxicity_score = check_toxicity(output)  # Your toxicity checker
2473                return {
2474                    "name": "toxicity",
2475                    "value": toxicity_score,
2476                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2477                }
2478
2479            def average_accuracy(*, item_results, **kwargs):
2480                accuracies = [
2481                    eval.value for result in item_results
2482                    for eval in result.evaluations
2483                    if eval.name == "accuracy"
2484                ]
2485                return {
2486                    "name": "average_accuracy",
2487                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2488                    "comment": f"Average accuracy across {len(accuracies)} items"
2489                }
2490
2491            result = langfuse.run_experiment(
2492                name="LLM Safety and Accuracy Test",
2493                description="Evaluate model accuracy and safety across diverse prompts",
2494                data=test_dataset,  # Your dataset items
2495                task=llm_task,
2496                evaluators=[accuracy_evaluator, toxicity_evaluator],
2497                run_evaluators=[average_accuracy],
2498                max_concurrency=5,  # Limit concurrent API calls
2499                metadata={"model": "gpt-4", "temperature": 0.7}
2500            )
2501            ```
2502
2503            Using with Langfuse datasets:
2504            ```python
2505            # Get dataset from Langfuse
2506            dataset = langfuse.get_dataset("my-eval-dataset")
2507
2508            result = dataset.run_experiment(
2509                name="Production Model Evaluation",
2510                description="Monthly evaluation of production model performance",
2511                task=my_production_task,
2512                evaluators=[accuracy_evaluator, latency_evaluator]
2513            )
2514
2515            # Results automatically linked to dataset in Langfuse UI
2516            print(f"View results: {result['dataset_run_url']}")
2517            ```
2518
2519        Note:
2520            - Task and evaluator functions can be either synchronous or asynchronous
2521            - Individual item failures are logged but don't stop the experiment
2522            - All executions are automatically traced and visible in Langfuse UI
2523            - When using Langfuse datasets, results are automatically linked for easy comparison
2524            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2525            - Async execution is handled automatically with smart event loop detection
2526        """
2527        return cast(
2528            ExperimentResult,
2529            run_async_safely(
2530                self._run_experiment_async(
2531                    name=name,
2532                    run_name=self._create_experiment_run_name(
2533                        name=name, run_name=run_name
2534                    ),
2535                    description=description,
2536                    data=data,
2537                    task=task,
2538                    evaluators=evaluators or [],
2539                    composite_evaluator=composite_evaluator,
2540                    run_evaluators=run_evaluators or [],
2541                    max_concurrency=max_concurrency,
2542                    metadata=metadata,
2543                    dataset_version=_dataset_version,
2544                ),
2545            ),
2546        )
2547
    async def _run_experiment_async(
        self,
        *,
        name: str,
        run_name: str,
        description: Optional[str],
        data: ExperimentData,
        task: TaskFunction,
        evaluators: List[EvaluatorFunction],
        composite_evaluator: Optional[CompositeEvaluatorFunction],
        run_evaluators: List[RunEvaluatorFunction],
        max_concurrency: int,
        metadata: Optional[Dict[str, Any]] = None,
        dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Async core of `run_experiment`.

        Processes every item in `data` concurrently (bounded by `max_concurrency`),
        collects the per-item results, runs the run-level evaluators over the
        successful results, persists run-level evaluations as scores, flushes the
        client, and returns the aggregate `ExperimentResult`. Individual item or
        evaluator failures are logged and excluded rather than aborting the run.
        """
        langfuse_logger.debug(
            f"Starting experiment '{name}' run '{run_name}' with {len(data)} items"
        )

        # Set up concurrency control
        semaphore = asyncio.Semaphore(max_concurrency)

        # Process all items
        async def process_item(item: ExperimentItem) -> ExperimentItemResult:
            # The semaphore caps how many items are processed simultaneously.
            async with semaphore:
                return await self._process_experiment_item(
                    item,
                    task,
                    evaluators,
                    composite_evaluator,
                    name,
                    run_name,
                    description,
                    metadata,
                    dataset_version,
                )

        # Run all items concurrently; return_exceptions=True keeps one failing
        # item from cancelling the rest of the gather.
        tasks = [process_item(item) for item in data]
        item_results = await asyncio.gather(*tasks, return_exceptions=True)

        # Filter out any exceptions and log errors
        valid_results: List[ExperimentItemResult] = []
        for i, result in enumerate(item_results):
            if isinstance(result, Exception):
                langfuse_logger.error(f"Item {i} failed: {result}")
            elif isinstance(result, ExperimentItemResult):
                valid_results.append(result)  # type: ignore

        # Run experiment-level evaluators; failures are non-fatal so the
        # remaining evaluators still run.
        run_evaluations: List[Evaluation] = []
        for run_evaluator in run_evaluators:
            try:
                evaluations = await _run_evaluator(
                    run_evaluator, item_results=valid_results
                )
                run_evaluations.extend(evaluations)
            except Exception as e:
                langfuse_logger.error(f"Run evaluator failed: {e}")

        # Generate dataset run URL if applicable. The run id is taken from the
        # first successful item (all items of one run share the same run id).
        dataset_run_id = valid_results[0].dataset_run_id if valid_results else None
        dataset_run_url = None
        if dataset_run_id and data:
            try:
                # Check if the first item has dataset_id (for DatasetItem objects)
                first_item = data[0]
                dataset_id = None

                if hasattr(first_item, "dataset_id"):
                    dataset_id = getattr(first_item, "dataset_id", None)

                if dataset_id:
                    project_id = self._get_project_id()

                    if project_id:
                        dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}"

            except Exception:
                pass  # URL generation is optional

        # Store run-level evaluations as scores (only possible when the run is
        # backed by a Langfuse dataset run).
        for evaluation in run_evaluations:
            try:
                if dataset_run_id:
                    self.create_score(
                        dataset_run_id=dataset_run_id,
                        name=evaluation.name or "<unknown>",
                        value=evaluation.value,  # type: ignore
                        comment=evaluation.comment,
                        metadata=evaluation.metadata,
                        data_type=evaluation.data_type,  # type: ignore
                        config_id=evaluation.config_id,
                    )

            except Exception as e:
                langfuse_logger.error(f"Failed to store run evaluation: {e}")

        # Flush scores and traces
        self.flush()

        return ExperimentResult(
            name=name,
            run_name=run_name,
            description=description,
            item_results=valid_results,
            run_evaluations=run_evaluations,
            dataset_run_id=dataset_run_id,
            dataset_run_url=dataset_run_url,
        )
2658
    async def _process_experiment_item(
        self,
        item: ExperimentItem,
        task: Callable,
        evaluators: List[Callable],
        composite_evaluator: Optional[CompositeEvaluatorFunction],
        experiment_name: str,
        experiment_run_name: str,
        experiment_description: Optional[str],
        experiment_metadata: Optional[Dict[str, Any]] = None,
        dataset_version: Optional[datetime] = None,
    ) -> ExperimentItemResult:
        """Execute the task and all evaluators for a single experiment item.

        Runs the task inside a dedicated span, links the trace to a dataset run
        when `item` is a Langfuse dataset item (has `id` and `dataset_id`),
        stores item-level and optional composite evaluations as scores, and
        returns the combined `ExperimentItemResult`. A task failure marks the
        span as errored and re-raises; evaluator failures are only logged.
        """
        span_name = "experiment-item-run"

        with self.start_as_current_observation(name=span_name) as span:
            try:
                # Items may be plain dicts or DatasetItem-like objects; support both.
                input_data = (
                    item.get("input")
                    if isinstance(item, dict)
                    else getattr(item, "input", None)
                )

                if input_data is None:
                    raise ValueError("Experiment Item is missing input. Skipping item.")

                expected_output = (
                    item.get("expected_output")
                    if isinstance(item, dict)
                    else getattr(item, "expected_output", None)
                )

                item_metadata = (
                    item.get("metadata")
                    if isinstance(item, dict)
                    else getattr(item, "metadata", None)
                )

                # Experiment-level metadata is merged first so item metadata
                # (added below) can override individual keys.
                final_observation_metadata = {
                    "experiment_name": experiment_name,
                    "experiment_run_name": experiment_run_name,
                    **(experiment_metadata or {}),
                }

                trace_id = span.trace_id
                dataset_id = None
                dataset_item_id = None
                dataset_run_id = None

                # Link to dataset run if this is a dataset item
                if hasattr(item, "id") and hasattr(item, "dataset_id"):
                    try:
                        # Use sync API to avoid event loop issues when run_async_safely
                        # creates multiple event loops across different threads
                        dataset_run_item = await asyncio.to_thread(
                            self.api.dataset_run_items.create,
                            run_name=experiment_run_name,
                            run_description=experiment_description,
                            metadata=experiment_metadata,
                            dataset_item_id=item.id,  # type: ignore
                            trace_id=trace_id,
                            observation_id=span.id,
                            dataset_version=dataset_version,
                        )

                        dataset_run_id = dataset_run_item.dataset_run_id

                    except Exception as e:
                        # Linking is best-effort; the item is still processed.
                        langfuse_logger.error(f"Failed to create dataset run item: {e}")

                if (
                    not isinstance(item, dict)
                    and hasattr(item, "dataset_id")
                    and hasattr(item, "id")
                ):
                    dataset_id = item.dataset_id
                    dataset_item_id = item.id

                    final_observation_metadata.update(
                        {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id}
                    )

                if isinstance(item_metadata, dict):
                    final_observation_metadata.update(item_metadata)

                # Fall back to generated / content-derived IDs for non-dataset items.
                experiment_id = dataset_run_id or self._create_observation_id()
                experiment_item_id = (
                    dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16]
                )
                # Only set attributes with non-None values on the OTel span.
                span._otel_span.set_attributes(
                    {
                        k: v
                        for k, v in {
                            LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT,
                            LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description,
                            LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize(
                                expected_output
                            ),
                        }.items()
                        if v is not None
                    }
                )

                propagated_experiment_attributes = PropagatedExperimentAttributes(
                    experiment_id=experiment_id,
                    experiment_name=experiment_run_name,
                    experiment_metadata=_serialize(experiment_metadata),
                    experiment_dataset_id=dataset_id,
                    experiment_item_id=experiment_item_id,
                    experiment_item_root_observation_id=span.id,
                )

                # Run the task with experiment attributes propagated to any
                # spans it creates.
                with _propagate_attributes(experiment=propagated_experiment_attributes):
                    output = await _run_task(task, item)

                span.update(
                    input=input_data,
                    output=output,
                    metadata=final_observation_metadata,
                )

            except Exception as e:
                # Record the failure on the span, then propagate to the caller
                # (which logs and drops this item).
                span.update(
                    output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
                )
                raise e

            # Run evaluators
            evaluations = []

            for evaluator in evaluators:
                try:
                    eval_metadata: Optional[Dict[str, Any]] = None

                    if isinstance(item, dict):
                        eval_metadata = item.get("metadata")
                    elif hasattr(item, "metadata"):
                        eval_metadata = item.metadata

                    with _propagate_attributes(
                        experiment=propagated_experiment_attributes
                    ):
                        eval_results = await _run_evaluator(
                            evaluator,
                            input=input_data,
                            output=output,
                            expected_output=expected_output,
                            metadata=eval_metadata,
                        )
                        evaluations.extend(eval_results)

                        # Store evaluations as scores
                        for evaluation in eval_results:
                            self.create_score(
                                trace_id=trace_id,
                                observation_id=span.id,
                                name=evaluation.name,
                                value=evaluation.value,  # type: ignore
                                comment=evaluation.comment,
                                metadata=evaluation.metadata,
                                config_id=evaluation.config_id,
                                data_type=evaluation.data_type,  # type: ignore
                            )

                except Exception as e:
                    # Evaluator failures are non-fatal for the item.
                    langfuse_logger.error(f"Evaluator failed: {e}")

            # Run composite evaluator if provided and we have evaluations
            if composite_evaluator and evaluations:
                try:
                    composite_eval_metadata: Optional[Dict[str, Any]] = None
                    if isinstance(item, dict):
                        composite_eval_metadata = item.get("metadata")
                    elif hasattr(item, "metadata"):
                        composite_eval_metadata = item.metadata

                    with _propagate_attributes(
                        experiment=propagated_experiment_attributes
                    ):
                        result = composite_evaluator(
                            input=input_data,
                            output=output,
                            expected_output=expected_output,
                            metadata=composite_eval_metadata,
                            evaluations=evaluations,
                        )

                        # Handle async composite evaluators
                        if asyncio.iscoroutine(result):
                            result = await result

                        # Normalize to list
                        composite_evals: List[Evaluation] = []
                        if isinstance(result, (dict, Evaluation)):
                            composite_evals = [result]  # type: ignore
                        elif isinstance(result, list):
                            composite_evals = result  # type: ignore

                        # Store composite evaluations as scores and add to evaluations list
                        for composite_evaluation in composite_evals:
                            self.create_score(
                                trace_id=trace_id,
                                observation_id=span.id,
                                name=composite_evaluation.name,
                                value=composite_evaluation.value,  # type: ignore
                                comment=composite_evaluation.comment,
                                metadata=composite_evaluation.metadata,
                                config_id=composite_evaluation.config_id,
                                data_type=composite_evaluation.data_type,  # type: ignore
                            )
                            evaluations.append(composite_evaluation)

                except Exception as e:
                    langfuse_logger.error(f"Composite evaluator failed: {e}")

            return ExperimentItemResult(
                item=item,
                output=output,
                evaluations=evaluations,
                trace_id=trace_id,
                dataset_run_id=dataset_run_id,
            )
2881
2882    def _create_experiment_run_name(
2883        self, *, name: Optional[str] = None, run_name: Optional[str] = None
2884    ) -> str:
2885        if run_name:
2886            return run_name
2887
2888        iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z")
2889
2890        return f"{name} - {iso_timestamp}"
2891
2892    def run_batched_evaluation(
2893        self,
2894        *,
2895        scope: Literal["traces", "observations"],
2896        mapper: MapperFunction,
2897        filter: Optional[str] = None,
2898        fetch_batch_size: int = 50,
2899        fetch_trace_fields: Optional[str] = None,
2900        max_items: Optional[int] = None,
2901        max_retries: int = 3,
2902        evaluators: List[EvaluatorFunction],
2903        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2904        max_concurrency: int = 5,
2905        metadata: Optional[Dict[str, Any]] = None,
2906        _add_observation_scores_to_trace: bool = False,
2907        _additional_trace_tags: Optional[List[str]] = None,
2908        resume_from: Optional[BatchEvaluationResumeToken] = None,
2909        verbose: bool = False,
2910    ) -> BatchEvaluationResult:
2911        """Fetch traces or observations and run evaluations on each item.
2912
2913        This method provides a powerful way to evaluate existing data in Langfuse at scale.
2914        It fetches items based on filters, transforms them using a mapper function, runs
2915        evaluators on each item, and creates scores that are linked back to the original
2916        entities. This is ideal for:
2917
2918        - Running evaluations on production traces after deployment
2919        - Backtesting new evaluation metrics on historical data
2920        - Batch scoring of observations for quality monitoring
2921        - Periodic evaluation runs on recent data
2922
2923        The method uses a streaming/pipeline approach to process items in batches, making
2924        it memory-efficient for large datasets. It includes comprehensive error handling,
2925        retry logic, and resume capability for long-running evaluations.
2926
2927        Args:
2928            scope: The type of items to evaluate. Must be one of:
2929                - "traces": Evaluate complete traces with all their observations
2930                - "observations": Evaluate individual observations (spans, generations, events)
2931            mapper: Function that transforms API response objects into evaluator inputs.
2932                Receives a trace/observation object and returns an EvaluatorInputs
2933                instance with input, output, expected_output, and metadata fields.
2934                Can be sync or async.
2935            evaluators: List of evaluation functions to run on each item. Each evaluator
2936                receives the mapped inputs and returns Evaluation object(s). Evaluator
2937                failures are logged but don't stop the batch evaluation.
2938            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
2939                - '{"tags": ["production"]}'
2940                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
2941                Default: None (fetches all items).
2942            fetch_batch_size: Number of items to fetch per API call and hold in memory.
2943                Larger values may be faster but use more memory. Default: 50.
2944            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
2945            max_items: Maximum total number of items to process. If None, processes all
2946                items matching the filter. Useful for testing or limiting evaluation runs.
2947                Default: None (process all).
2948            max_concurrency: Maximum number of items to evaluate concurrently. Controls
2949                parallelism and resource usage. Default: 5.
2950            composite_evaluator: Optional function that creates a composite score from
2951                item-level evaluations. Receives the original item and its evaluations,
2952                returns a single Evaluation. Useful for weighted averages or combined metrics.
2953                Default: None.
2954            metadata: Optional metadata dict to add to all created scores. Useful for
2955                tracking evaluation runs, versions, or other context. Default: None.
2956            max_retries: Maximum number of retry attempts for failed batch fetches.
2957                Uses exponential backoff (1s, 2s, 4s). Default: 3.
2958            verbose: If True, logs progress information to console. Useful for monitoring
2959                long-running evaluations. Default: False.
2960            resume_from: Optional resume token from a previous incomplete run. Allows
2961                continuing evaluation after interruption or failure. Default: None.
2962
2963
2964        Returns:
2965            BatchEvaluationResult containing:
2966                - total_items_fetched: Number of items fetched from API
2967                - total_items_processed: Number of items successfully evaluated
2968                - total_items_failed: Number of items that failed evaluation
2969                - total_scores_created: Scores created by item-level evaluators
2970                - total_composite_scores_created: Scores created by composite evaluator
2971                - total_evaluations_failed: Individual evaluator failures
2972                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
2973                - resume_token: Token for resuming if incomplete (None if completed)
2974                - completed: True if all items processed
2975                - duration_seconds: Total execution time
2976                - failed_item_ids: IDs of items that failed
2977                - error_summary: Error types and counts
2978                - has_more_items: True if max_items reached but more exist
2979
2980        Raises:
2981            ValueError: If invalid scope is provided.
2982
2983        Examples:
2984            Basic trace evaluation:
2985            ```python
2986            from langfuse import Langfuse, EvaluatorInputs, Evaluation
2987
2988            client = Langfuse()
2989
2990            # Define mapper to extract fields from traces
2991            def trace_mapper(trace):
2992                return EvaluatorInputs(
2993                    input=trace.input,
2994                    output=trace.output,
2995                    expected_output=None,
2996                    metadata={"trace_id": trace.id}
2997                )
2998
2999            # Define evaluator
3000            def length_evaluator(*, input, output, expected_output, metadata):
3001                return Evaluation(
3002                    name="output_length",
3003                    value=len(output) if output else 0
3004                )
3005
3006            # Run batch evaluation
3007            result = client.run_batched_evaluation(
3008                scope="traces",
3009                mapper=trace_mapper,
3010                evaluators=[length_evaluator],
3011                filter='{"tags": ["production"]}',
3012                max_items=1000,
3013                verbose=True
3014            )
3015
3016            print(f"Processed {result.total_items_processed} traces")
3017            print(f"Created {result.total_scores_created} scores")
3018            ```
3019
3020            Evaluation with composite scorer:
3021            ```python
3022            def accuracy_evaluator(*, input, output, expected_output, metadata):
3023                # ... evaluation logic
3024                return Evaluation(name="accuracy", value=0.85)
3025
3026            def relevance_evaluator(*, input, output, expected_output, metadata):
3027                # ... evaluation logic
3028                return Evaluation(name="relevance", value=0.92)
3029
3030            def composite_evaluator(*, item, evaluations):
3031                # Weighted average of evaluations
3032                weights = {"accuracy": 0.6, "relevance": 0.4}
3033                total = sum(
3034                    e.value * weights.get(e.name, 0)
3035                    for e in evaluations
3036                    if isinstance(e.value, (int, float))
3037                )
3038                return Evaluation(
3039                    name="composite_score",
3040                    value=total,
3041                    comment=f"Weighted average of {len(evaluations)} metrics"
3042                )
3043
3044            result = client.run_batched_evaluation(
3045                scope="traces",
3046                mapper=trace_mapper,
3047                evaluators=[accuracy_evaluator, relevance_evaluator],
3048                composite_evaluator=composite_evaluator,
3049                filter='{"user_id": "important_user"}',
3050                verbose=True
3051            )
3052            ```
3053
3054            Handling incomplete runs with resume:
3055            ```python
3056            # Initial run that may fail or timeout
3057            result = client.run_batched_evaluation(
3058                scope="observations",
3059                mapper=obs_mapper,
3060                evaluators=[my_evaluator],
3061                max_items=10000,
3062                verbose=True
3063            )
3064
3065            # Check if incomplete
3066            if not result.completed and result.resume_token:
3067                print(f"Processed {result.resume_token.items_processed} items before interruption")
3068
3069                # Resume from where it left off
3070                result = client.run_batched_evaluation(
3071                    scope="observations",
3072                    mapper=obs_mapper,
3073                    evaluators=[my_evaluator],
3074                    resume_from=result.resume_token,
3075                    verbose=True
3076                )
3077
3078            print(f"Total items processed: {result.total_items_processed}")
3079            ```
3080
3081            Monitoring evaluator performance:
3082            ```python
3083            result = client.run_batched_evaluation(...)
3084
3085            for stats in result.evaluator_stats:
3086                success_rate = stats.successful_runs / stats.total_runs
3087                print(f"{stats.name}:")
3088                print(f"  Success rate: {success_rate:.1%}")
3089                print(f"  Scores created: {stats.total_scores_created}")
3090
3091                if stats.failed_runs > 0:
3092                    print(f"  âš ī¸  Failed {stats.failed_runs} times")
3093            ```
3094
3095        Note:
3096            - Evaluator failures are logged but don't stop the batch evaluation
3097            - Individual item failures are tracked but don't stop processing
3098            - Fetch failures are retried with exponential backoff
3099            - All scores are automatically flushed to Langfuse at the end
3100            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3101        """
3102        runner = BatchEvaluationRunner(self)
3103
3104        return cast(
3105            BatchEvaluationResult,
3106            run_async_safely(
3107                runner.run_async(
3108                    scope=scope,
3109                    mapper=mapper,
3110                    evaluators=evaluators,
3111                    filter=filter,
3112                    fetch_batch_size=fetch_batch_size,
3113                    fetch_trace_fields=fetch_trace_fields,
3114                    max_items=max_items,
3115                    max_concurrency=max_concurrency,
3116                    composite_evaluator=composite_evaluator,
3117                    metadata=metadata,
3118                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
3119                    _additional_trace_tags=_additional_trace_tags,
3120                    max_retries=max_retries,
3121                    verbose=verbose,
3122                    resume_from=resume_from,
3123                )
3124            ),
3125        )
3126
3127    def auth_check(self) -> bool:
3128        """Check if the provided credentials (public and secret key) are valid.
3129
3130        Raises:
3131            Exception: If no projects were found for the provided credentials.
3132
3133        Note:
3134            This method is blocking. It is discouraged to use it in production code.
3135        """
3136        try:
3137            projects = self.api.projects.get()
3138            langfuse_logger.debug(
3139                f"Auth check successful, found {len(projects.data)} projects"
3140            )
3141            if len(projects.data) == 0:
3142                raise Exception(
3143                    "Auth check failed, no project found for the keys provided."
3144                )
3145            return True
3146
3147        except AttributeError as e:
3148            langfuse_logger.warning(
3149                f"Auth check failed: Client not properly initialized. Error: {e}"
3150            )
3151            return False
3152
3153        except Error as e:
3154            handle_fern_exception(e)
3155            raise e
3156
3157    def create_dataset(
3158        self,
3159        *,
3160        name: str,
3161        description: Optional[str] = None,
3162        metadata: Optional[Any] = None,
3163        input_schema: Optional[Any] = None,
3164        expected_output_schema: Optional[Any] = None,
3165    ) -> Dataset:
3166        """Create a dataset with the given name on Langfuse.
3167
3168        Args:
3169            name: Name of the dataset to create.
3170            description: Description of the dataset. Defaults to None.
3171            metadata: Additional metadata. Defaults to None.
3172            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3173            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3174
3175        Returns:
3176            Dataset: The created dataset as returned by the Langfuse API.
3177        """
3178        try:
3179            langfuse_logger.debug(f"Creating datasets {name}")
3180
3181            result = self.api.datasets.create(
3182                name=name,
3183                description=description,
3184                metadata=metadata,
3185                input_schema=input_schema,
3186                expected_output_schema=expected_output_schema,
3187            )
3188
3189            return cast(Dataset, result)
3190
3191        except Error as e:
3192            handle_fern_exception(e)
3193            raise e
3194
3195    def create_dataset_item(
3196        self,
3197        *,
3198        dataset_name: str,
3199        input: Optional[Any] = None,
3200        expected_output: Optional[Any] = None,
3201        metadata: Optional[Any] = None,
3202        source_trace_id: Optional[str] = None,
3203        source_observation_id: Optional[str] = None,
3204        status: Optional[DatasetStatus] = None,
3205        id: Optional[str] = None,
3206    ) -> DatasetItem:
3207        """Create a dataset item.
3208
3209        Upserts if an item with id already exists.
3210
3211        Args:
3212            dataset_name: Name of the dataset in which the dataset item should be created.
3213            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3214            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3215            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3216            source_trace_id: Id of the source trace. Defaults to None.
3217            source_observation_id: Id of the source observation. Defaults to None.
3218            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3219            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3220
3221        Returns:
3222            DatasetItem: The created dataset item as returned by the Langfuse API.
3223
3224        Example:
3225            ```python
3226            from langfuse import Langfuse
3227
3228            langfuse = Langfuse()
3229
3230            # Uploading items to the Langfuse dataset named "capital_cities"
3231            langfuse.create_dataset_item(
3232                dataset_name="capital_cities",
3233                input={"input": {"country": "Italy"}},
3234                expected_output={"expected_output": "Rome"},
3235                metadata={"foo": "bar"}
3236            )
3237            ```
3238        """
3239        try:
3240            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
3241
3242            result = self.api.dataset_items.create(
3243                dataset_name=dataset_name,
3244                input=input,
3245                expected_output=expected_output,
3246                metadata=metadata,
3247                source_trace_id=source_trace_id,
3248                source_observation_id=source_observation_id,
3249                status=status,
3250                id=id,
3251            )
3252
3253            return cast(DatasetItem, result)
3254        except Error as e:
3255            handle_fern_exception(e)
3256            raise e
3257
3258    def resolve_media_references(
3259        self,
3260        *,
3261        obj: Any,
3262        resolve_with: Literal["base64_data_uri"],
3263        max_depth: int = 10,
3264        content_fetch_timeout_seconds: int = 5,
3265    ) -> Any:
3266        """Replace media reference strings in an object with base64 data URIs.
3267
3268        This method recursively traverses an object (up to max_depth) looking for media reference strings
3269        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3270        the provided Langfuse client and replaces the reference string with a base64 data URI.
3271
3272        If fetching media content fails for a reference string, a warning is logged and the reference
3273        string is left unchanged.
3274
3275        Args:
3276            obj: The object to process. Can be a primitive value, array, or nested object.
3277                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3278            resolve_with: The representation of the media content to replace the media reference string with.
3279                Currently only "base64_data_uri" is supported.
3280            max_depth: int: The maximum depth to traverse the object. Default is 10.
3281            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3282
3283        Returns:
3284            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3285            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3286
3287        Example:
3288            obj = {
3289                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3290                "nested": {
3291                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3292                }
3293            }
3294
3295            result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
3296
3297            # Result:
3298            # {
3299            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3300            #     "nested": {
3301            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3302            #     }
3303            # }
3304        """
3305        return LangfuseMedia.resolve_media_references(
3306            langfuse_client=self,
3307            obj=obj,
3308            resolve_with=resolve_with,
3309            max_depth=max_depth,
3310            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3311        )
3312
    # Overload: requesting type="chat" resolves to a ChatPromptClient.
    @overload
    def get_prompt(
        self,
        name: str,
        *,
        version: Optional[int] = None,
        label: Optional[str] = None,
        type: Literal["chat"],
        cache_ttl_seconds: Optional[int] = None,
        fallback: Optional[List[ChatMessageDict]] = None,
        max_retries: Optional[int] = None,
        fetch_timeout_seconds: Optional[int] = None,
    ) -> ChatPromptClient: ...
3326
    # Overload: the default type="text" resolves to a TextPromptClient.
    @overload
    def get_prompt(
        self,
        name: str,
        *,
        version: Optional[int] = None,
        label: Optional[str] = None,
        type: Literal["text"] = "text",
        cache_ttl_seconds: Optional[int] = None,
        fallback: Optional[str] = None,
        max_retries: Optional[int] = None,
        fetch_timeout_seconds: Optional[int] = None,
    ) -> TextPromptClient: ...
3340
    def get_prompt(
        self,
        name: str,
        *,
        version: Optional[int] = None,
        label: Optional[str] = None,
        type: Literal["chat", "text"] = "text",
        cache_ttl_seconds: Optional[int] = None,
        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
        max_retries: Optional[int] = None,
        fetch_timeout_seconds: Optional[int] = None,
    ) -> PromptClient:
        """Get a prompt.

        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
        return the expired prompt as a fallback.

        Args:
            name (str): The name of the prompt to retrieve.

        Keyword Args:
            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
            fetch_timeout_seconds: Optional[int]: The timeout in milliseconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.

        Returns:
            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
            - TextPromptClient, if type argument is 'text'.
            - ChatPromptClient, if type argument is 'chat'.

        Raises:
            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
        """
        if self._resources is None:
            raise Error(
                "SDK is not correctly initialized. Check the init logs for more details."
            )
        if version is not None and label is not None:
            raise ValueError("Cannot specify both version and label at the same time.")

        if not name:
            raise ValueError("Prompt name cannot be empty.")

        # Cache is keyed on (name, version, label); retries are clamped to [0, 4].
        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
        bounded_max_retries = self._get_bounded_max_retries(
            max_retries, default_max_retries=2, max_retries_upper_bound=4
        )

        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
        cached_prompt = self._resources.prompt_cache.get(cache_key)

        # Cache miss, or caching explicitly disabled (cache_ttl_seconds == 0):
        # fetch synchronously from the server.
        if cached_prompt is None or cache_ttl_seconds == 0:
            langfuse_logger.debug(
                f"Prompt '{cache_key}' not found in cache or caching disabled."
            )
            try:
                return self._fetch_prompt_and_update_cache(
                    name,
                    version=version,
                    label=label,
                    ttl_seconds=cache_ttl_seconds,
                    max_retries=bounded_max_retries,
                    fetch_timeout_seconds=fetch_timeout_seconds,
                )
            except Exception as e:
                # Fetch failed and nothing usable is cached: return the
                # caller-provided fallback prompt if one was given,
                # otherwise re-raise.
                if fallback:
                    langfuse_logger.warning(
                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
                    )

                    # Build a prompt client locally; version defaults to 0 when
                    # the caller did not request a specific version.
                    fallback_client_args: Dict[str, Any] = {
                        "name": name,
                        "prompt": fallback,
                        "type": type,
                        "version": version or 0,
                        "config": {},
                        "labels": [label] if label else [],
                        "tags": [],
                    }

                    if type == "text":
                        return TextPromptClient(
                            prompt=Prompt_Text(**fallback_client_args),
                            is_fallback=True,
                        )

                    if type == "chat":
                        return ChatPromptClient(
                            prompt=Prompt_Chat(**fallback_client_args),
                            is_fallback=True,
                        )

                raise e

        # Cached but expired: serve the stale value immediately and refresh
        # asynchronously so callers never block on a stale-cache refresh.
        if cached_prompt.is_expired():
            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
            try:
                # refresh prompt in background thread, refresh_prompt deduplicates tasks
                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")

                def refresh_task() -> None:
                    self._fetch_prompt_and_update_cache(
                        name,
                        version=version,
                        label=label,
                        ttl_seconds=cache_ttl_seconds,
                        max_retries=bounded_max_retries,
                        fetch_timeout_seconds=fetch_timeout_seconds,
                    )

                self._resources.prompt_cache.add_refresh_prompt_task(
                    cache_key,
                    refresh_task,
                )
                langfuse_logger.debug(
                    f"Returning stale prompt '{cache_key}' from cache."
                )
                # return stale prompt
                return cached_prompt.value

            except Exception as e:
                langfuse_logger.warning(
                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
                )
                # creation of refresh prompt task failed, return stale prompt
                return cached_prompt.value

        # Fresh cache hit.
        return cached_prompt.value
3477
3478    def _fetch_prompt_and_update_cache(
3479        self,
3480        name: str,
3481        *,
3482        version: Optional[int] = None,
3483        label: Optional[str] = None,
3484        ttl_seconds: Optional[int] = None,
3485        max_retries: int,
3486        fetch_timeout_seconds: Optional[int],
3487    ) -> PromptClient:
3488        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3489        langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...")
3490
3491        try:
3492
3493            @backoff.on_exception(
3494                backoff.constant, Exception, max_tries=max_retries + 1, logger=None
3495            )
3496            def fetch_prompts() -> Any:
3497                return self.api.prompts.get(
3498                    self._url_encode(name),
3499                    version=version,
3500                    label=label,
3501                    request_options={
3502                        "timeout_in_seconds": fetch_timeout_seconds,
3503                    }
3504                    if fetch_timeout_seconds is not None
3505                    else None,
3506                )
3507
3508            prompt_response = fetch_prompts()
3509
3510            prompt: PromptClient
3511            if prompt_response.type == "chat":
3512                prompt = ChatPromptClient(prompt_response)
3513            else:
3514                prompt = TextPromptClient(prompt_response)
3515
3516            if self._resources is not None:
3517                self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds)
3518
3519            return prompt
3520
3521        except NotFoundError as not_found_error:
3522            langfuse_logger.warning(
3523                f"Prompt '{cache_key}' not found during refresh, evicting from cache."
3524            )
3525            if self._resources is not None:
3526                self._resources.prompt_cache.delete(cache_key)
3527            raise not_found_error
3528
3529        except Exception as e:
3530            langfuse_logger.error(
3531                f"Error while fetching prompt '{cache_key}': {str(e)}"
3532            )
3533            raise e
3534
3535    def _get_bounded_max_retries(
3536        self,
3537        max_retries: Optional[int],
3538        *,
3539        default_max_retries: int = 2,
3540        max_retries_upper_bound: int = 4,
3541    ) -> int:
3542        if max_retries is None:
3543            return default_max_retries
3544
3545        bounded_max_retries = min(
3546            max(max_retries, 0),
3547            max_retries_upper_bound,
3548        )
3549
3550        return bounded_max_retries
3551
    # Overload: creating a type="chat" prompt returns a ChatPromptClient.
    @overload
    def create_prompt(
        self,
        *,
        name: str,
        prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]],
        labels: List[str] = [],
        tags: Optional[List[str]] = None,
        type: Optional[Literal["chat"]],
        config: Optional[Any] = None,
        commit_message: Optional[str] = None,
    ) -> ChatPromptClient: ...
3564
    # Overload: creating the default type="text" prompt returns a TextPromptClient.
    @overload
    def create_prompt(
        self,
        *,
        name: str,
        prompt: str,
        labels: List[str] = [],
        tags: Optional[List[str]] = None,
        type: Optional[Literal["text"]] = "text",
        config: Optional[Any] = None,
        commit_message: Optional[str] = None,
    ) -> TextPromptClient: ...
3577
3578    def create_prompt(
3579        self,
3580        *,
3581        name: str,
3582        prompt: Union[
3583            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3584        ],
3585        labels: List[str] = [],
3586        tags: Optional[List[str]] = None,
3587        type: Optional[Literal["chat", "text"]] = "text",
3588        config: Optional[Any] = None,
3589        commit_message: Optional[str] = None,
3590    ) -> PromptClient:
3591        """Create a new prompt in Langfuse.
3592
3593        Keyword Args:
3594            name : The name of the prompt to be created.
3595            prompt : The content of the prompt to be created.
3596            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3597            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3598            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3599            config: Additional structured data to be saved with the prompt. Defaults to None.
3600            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3601            commit_message: Optional string describing the change.
3602
3603        Returns:
3604            TextPromptClient: The prompt if type argument is 'text'.
3605            ChatPromptClient: The prompt if type argument is 'chat'.
3606        """
3607        try:
3608            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3609
3610            if type == "chat":
3611                if not isinstance(prompt, list):
3612                    raise ValueError(
3613                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3614                    )
3615                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
3616                    CreateChatPromptRequest(
3617                        name=name,
3618                        prompt=cast(Any, prompt),
3619                        labels=labels,
3620                        tags=tags,
3621                        config=config or {},
3622                        commit_message=commit_message,
3623                        type=CreateChatPromptType.CHAT,
3624                    )
3625                )
3626                server_prompt = self.api.prompts.create(request=request)
3627
3628                if self._resources is not None:
3629                    self._resources.prompt_cache.invalidate(name)
3630
3631                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3632
3633            if not isinstance(prompt, str):
3634                raise ValueError("For 'text' type, 'prompt' must be a string.")
3635
3636            request = CreateTextPromptRequest(
3637                name=name,
3638                prompt=prompt,
3639                labels=labels,
3640                tags=tags,
3641                config=config or {},
3642                commit_message=commit_message,
3643            )
3644
3645            server_prompt = self.api.prompts.create(request=request)
3646
3647            if self._resources is not None:
3648                self._resources.prompt_cache.invalidate(name)
3649
3650            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3651
3652        except Error as e:
3653            handle_fern_exception(e)
3654            raise e
3655
3656    def update_prompt(
3657        self,
3658        *,
3659        name: str,
3660        version: int,
3661        new_labels: List[str] = [],
3662    ) -> Any:
3663        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name.
3664
3665        Args:
3666            name (str): The name of the prompt to update.
3667            version (int): The version number of the prompt to update.
3668            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3669
3670        Returns:
3671            Prompt: The updated prompt from the Langfuse API.
3672
3673        """
3674        updated_prompt = self.api.prompt_version.update(
3675            name=self._url_encode(name),
3676            version=version,
3677            new_labels=new_labels,
3678        )
3679
3680        if self._resources is not None:
3681            self._resources.prompt_cache.invalidate(name)
3682
3683        return updated_prompt
3684
3685    def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str:
3686        # httpx â‰Ĩ 0.28 does its own WHATWG-compliant quoting (eg. encodes bare
3687        # “%”, “?”, “#”, “|”, â€Ļ in query/path parts).  Re-quoting here would
3688        # double-encode, so we skip when the value is about to be sent straight
3689        # to httpx (`is_url_param=True`) and the installed version is â‰Ĩ 0.28.
3690        if is_url_param and Version(httpx.__version__) >= Version("0.28.0"):
3691            return url
3692
3693        # urllib.parse.quote does not escape slashes "/" by default; we need to add safe="" to force escaping
3694        # we need add safe="" to force escaping of slashes
3695        # This is necessary for prompts in prompt folders
3696        return urllib.parse.quote(url, safe="")
3697
3698    def clear_prompt_cache(self) -> None:
3699        """Clear the entire prompt cache, removing all cached prompts.
3700
3701        This method is useful when you want to force a complete refresh of all
3702        cached prompts, for example after major updates or when you need to
3703        ensure the latest versions are fetched from the server.
3704        """
3705        if self._resources is not None:
3706            self._resources.prompt_cache.clear()

Main client for Langfuse tracing and platform features.

This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.

The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.

Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.

Attributes:
  • api: Synchronous API client for Langfuse backend communication
  • async_api: Asynchronous API client for Langfuse backend communication
  • _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
  • public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
  • secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
  • base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
  • host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
  • timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
  • httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
  • debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
  • tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
  • flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
  • flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
  • environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
  • release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
  • media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
  • sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
  • mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
  • blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use should_export_span instead. Equivalent behavior:

    from langfuse.span_filter import is_default_export_span
    blocked = {"sqlite", "requests"}
    
    should_export_span = lambda span: (
        is_default_export_span(span)
        and (
            span.instrumentation_scope is None
            or span.instrumentation_scope.name not in blocked
        )
    )
    
  • should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with gen_ai.* attributes, and known LLM instrumentation scopes).

  • additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well.
  • tracer_provider(Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
Example:
from langfuse import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    base_url="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_observation(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
Langfuse( *, public_key: Optional[str] = None, secret_key: Optional[str] = None, base_url: Optional[str] = None, host: Optional[str] = None, timeout: Optional[int] = None, httpx_client: Optional[httpx.Client] = None, debug: bool = False, tracing_enabled: Optional[bool] = True, flush_at: Optional[int] = None, flush_interval: Optional[float] = None, environment: Optional[str] = None, release: Optional[str] = None, media_upload_thread_count: Optional[int] = None, sample_rate: Optional[float] = None, mask: Optional[langfuse.types.MaskFunction] = None, blocked_instrumentation_scopes: Optional[List[str]] = None, should_export_span: Optional[Callable[[opentelemetry.sdk.trace.ReadableSpan], bool]] = None, additional_headers: Optional[Dict[str, str]] = None, tracer_provider: Optional[opentelemetry.sdk.trace.TracerProvider] = None)
    def __init__(
        self,
        *,
        public_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        base_url: Optional[str] = None,
        host: Optional[str] = None,
        timeout: Optional[int] = None,
        httpx_client: Optional[httpx.Client] = None,
        debug: bool = False,
        tracing_enabled: Optional[bool] = True,
        flush_at: Optional[int] = None,
        flush_interval: Optional[float] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        media_upload_thread_count: Optional[int] = None,
        sample_rate: Optional[float] = None,
        mask: Optional[MaskFunction] = None,
        blocked_instrumentation_scopes: Optional[List[str]] = None,
        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
        additional_headers: Optional[Dict[str, str]] = None,
        tracer_provider: Optional[TracerProvider] = None,
    ):
        """Initialize the Langfuse client.

        Resolution order for most settings is: explicit argument, then
        environment variable, then hard-coded default. If either API key is
        missing, the client degrades to a no-op tracer instead of raising.
        """
        # Explicit base_url wins, then LANGFUSE_BASE_URL, then the deprecated
        # `host` argument / LANGFUSE_HOST, then the cloud default.
        self._base_url = (
            base_url
            or os.environ.get(LANGFUSE_BASE_URL)
            or host
            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
        )
        self._environment = environment or cast(
            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
        )
        # Lazily resolved on first use; not fetched during construction.
        self._project_id: Optional[str] = None
        # NOTE: `or` also replaces an explicit sample_rate=0.0 with the env /
        # default value, since 0.0 is falsy.
        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
        if not 0.0 <= sample_rate <= 1.0:
            raise ValueError(
                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
            )

        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))

        # Tracing is on only if the argument allows it AND the env var does
        # not explicitly disable it.
        self._tracing_enabled = (
            tracing_enabled
            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
        )
        if not self._tracing_enabled:
            langfuse_logger.info(
                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
            )

        debug = (
            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
        )
        if debug:
            logging.basicConfig(
                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            )
            langfuse_logger.setLevel(logging.DEBUG)

        # Missing credentials disable the client (no-op tracer) rather than
        # raising, so application code keeps working without Langfuse.
        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
        if public_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
            )
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
        if secret_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
            )
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        # Warn only: a globally disabled OTel SDK silently drops all spans.
        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
            langfuse_logger.warning(
                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
            )

        if blocked_instrumentation_scopes is not None:
            warnings.warn(
                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
                "Use `should_export_span` instead. Example: "
                "from langfuse.span_filter import is_default_export_span; "
                'blocked={"scope"}; should_export_span=lambda span: '
                "is_default_export_span(span) and (span.instrumentation_scope is None or "
                "span.instrumentation_scope.name not in blocked).",
                DeprecationWarning,
                stacklevel=2,
            )

        # Initialize api and tracer if requirements are met
        self._resources = LangfuseResourceManager(
            public_key=public_key,
            secret_key=secret_key,
            base_url=self._base_url,
            timeout=timeout,
            environment=self._environment,
            release=release,
            flush_at=flush_at,
            flush_interval=flush_interval,
            httpx_client=httpx_client,
            media_upload_thread_count=media_upload_thread_count,
            sample_rate=sample_rate,
            mask=mask,
            tracing_enabled=self._tracing_enabled,
            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
            should_export_span=should_export_span,
            additional_headers=additional_headers,
            tracer_provider=tracer_provider,
        )
        self._mask = self._resources.mask

        # Fall back to a no-op tracer when tracing is disabled or the
        # resource manager could not provide a real tracer.
        self._otel_tracer = (
            self._resources.tracer
            if self._tracing_enabled and self._resources.tracer is not None
            else otel_trace_api.NoOpTracer()
        )
        self.api = self._resources.api
        self.async_api = self._resources.async_api
api
async_api
def start_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> Union[LangfuseSpan, LangfuseGeneration, LangfuseAgent, LangfuseTool, LangfuseChain, LangfuseRetriever, LangfuseEvaluator, LangfuseEmbedding, LangfuseGuardrail]:
495    def start_observation(
496        self,
497        *,
498        trace_context: Optional[TraceContext] = None,
499        name: str,
500        as_type: ObservationTypeLiteralNoEvent = "span",
501        input: Optional[Any] = None,
502        output: Optional[Any] = None,
503        metadata: Optional[Any] = None,
504        version: Optional[str] = None,
505        level: Optional[SpanLevel] = None,
506        status_message: Optional[str] = None,
507        completion_start_time: Optional[datetime] = None,
508        model: Optional[str] = None,
509        model_parameters: Optional[Dict[str, MapValue]] = None,
510        usage_details: Optional[Dict[str, int]] = None,
511        cost_details: Optional[Dict[str, float]] = None,
512        prompt: Optional[PromptClient] = None,
513    ) -> Union[
514        LangfuseSpan,
515        LangfuseGeneration,
516        LangfuseAgent,
517        LangfuseTool,
518        LangfuseChain,
519        LangfuseRetriever,
520        LangfuseEvaluator,
521        LangfuseEmbedding,
522        LangfuseGuardrail,
523    ]:
524        """Create a new observation of the specified type.
525
526        This method creates a new observation but does not set it as the current span in the
527        context. To create and use an observation within a context, use start_as_current_observation().
528
529        Args:
530            trace_context: Optional context for connecting to an existing trace
531            name: Name of the observation
532            as_type: Type of observation to create (defaults to "span")
533            input: Input data for the operation
534            output: Output data from the operation
535            metadata: Additional metadata to associate with the observation
536            version: Version identifier for the code or component
537            level: Importance level of the observation
538            status_message: Optional status message for the observation
539            completion_start_time: When the model started generating (for generation types)
540            model: Name/identifier of the AI model used (for generation types)
541            model_parameters: Parameters used for the model (for generation types)
542            usage_details: Token usage information (for generation types)
543            cost_details: Cost information (for generation types)
544            prompt: Associated prompt template (for generation types)
545
546        Returns:
547            An observation object of the appropriate type that must be ended with .end()
548        """
549        if trace_context:
550            trace_id = trace_context.get("trace_id", None)
551            parent_span_id = trace_context.get("parent_span_id", None)
552
553            if trace_id:
554                remote_parent_span = self._create_remote_parent_span(
555                    trace_id=trace_id, parent_span_id=parent_span_id
556                )
557
558                with otel_trace_api.use_span(
559                    cast(otel_trace_api.Span, remote_parent_span)
560                ):
561                    otel_span = self._otel_tracer.start_span(name=name)
562                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
563
564                    return self._create_observation_from_otel_span(
565                        otel_span=otel_span,
566                        as_type=as_type,
567                        input=input,
568                        output=output,
569                        metadata=metadata,
570                        version=version,
571                        level=level,
572                        status_message=status_message,
573                        completion_start_time=completion_start_time,
574                        model=model,
575                        model_parameters=model_parameters,
576                        usage_details=usage_details,
577                        cost_details=cost_details,
578                        prompt=prompt,
579                    )
580
581        otel_span = self._otel_tracer.start_span(name=name)
582
583        return self._create_observation_from_otel_span(
584            otel_span=otel_span,
585            as_type=as_type,
586            input=input,
587            output=output,
588            metadata=metadata,
589            version=version,
590            level=level,
591            status_message=status_message,
592            completion_start_time=completion_start_time,
593            model=model,
594            model_parameters=model_parameters,
595            usage_details=usage_details,
596            cost_details=cost_details,
597            prompt=prompt,
598        )

Create a new observation of the specified type.

This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation
  • status_message: Optional status message for the observation
  • completion_start_time: When the model started generating (for generation types)
  • model: Name/identifier of the AI model used (for generation types)
  • model_parameters: Parameters used for the model (for generation types)
  • usage_details: Token usage information (for generation types)
  • cost_details: Cost information (for generation types)
  • prompt: Associated prompt template (for generation types)
Returns:

An observation object of the appropriate type that must be ended with .end()

def start_as_current_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, end_on_exit: Optional[bool] = None) -> Union[opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration], opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan], opentelemetry.util._decorator._AgnosticContextManager[LangfuseAgent], opentelemetry.util._decorator._AgnosticContextManager[LangfuseTool], opentelemetry.util._decorator._AgnosticContextManager[LangfuseChain], opentelemetry.util._decorator._AgnosticContextManager[LangfuseRetriever], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEvaluator], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEmbedding], opentelemetry.util._decorator._AgnosticContextManager[LangfuseGuardrail]]:
 826    def start_as_current_observation(
 827        self,
 828        *,
 829        trace_context: Optional[TraceContext] = None,
 830        name: str,
 831        as_type: ObservationTypeLiteralNoEvent = "span",
 832        input: Optional[Any] = None,
 833        output: Optional[Any] = None,
 834        metadata: Optional[Any] = None,
 835        version: Optional[str] = None,
 836        level: Optional[SpanLevel] = None,
 837        status_message: Optional[str] = None,
 838        completion_start_time: Optional[datetime] = None,
 839        model: Optional[str] = None,
 840        model_parameters: Optional[Dict[str, MapValue]] = None,
 841        usage_details: Optional[Dict[str, int]] = None,
 842        cost_details: Optional[Dict[str, float]] = None,
 843        prompt: Optional[PromptClient] = None,
 844        end_on_exit: Optional[bool] = None,
 845    ) -> Union[
 846        _AgnosticContextManager[LangfuseGeneration],
 847        _AgnosticContextManager[LangfuseSpan],
 848        _AgnosticContextManager[LangfuseAgent],
 849        _AgnosticContextManager[LangfuseTool],
 850        _AgnosticContextManager[LangfuseChain],
 851        _AgnosticContextManager[LangfuseRetriever],
 852        _AgnosticContextManager[LangfuseEvaluator],
 853        _AgnosticContextManager[LangfuseEmbedding],
 854        _AgnosticContextManager[LangfuseGuardrail],
 855    ]:
 856        """Create a new observation and set it as the current span in a context manager.
 857
 858        This method creates a new observation of the specified type and sets it as the
 859        current span within a context manager. Use this method with a 'with' statement to
 860        automatically handle the observation lifecycle within a code block.
 861
 862        The created observation will be the child of the current span in the context.
 863
 864        Args:
 865            trace_context: Optional context for connecting to an existing trace
 866            name: Name of the observation (e.g., function or operation name)
 867            as_type: Type of observation to create (defaults to "span")
 868            input: Input data for the operation (can be any JSON-serializable object)
 869            output: Output data from the operation (can be any JSON-serializable object)
 870            metadata: Additional metadata to associate with the observation
 871            version: Version identifier for the code or component
 872            level: Importance level of the observation (info, warning, error)
 873            status_message: Optional status message for the observation
 874            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 875
 876            The following parameters are available when as_type is: "generation" or "embedding".
 877            completion_start_time: When the model started generating the response
 878            model: Name/identifier of the AI model used (e.g., "gpt-4")
 879            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 880            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 881            cost_details: Cost information for the model call
 882            prompt: Associated prompt template from Langfuse prompt management
 883
 884        Returns:
 885            A context manager that yields the appropriate observation type based on as_type
 886
 887        Example:
 888            ```python
 889            # Create a span
 890            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
 891                # Do work
 892                result = process_data()
 893                span.update(output=result)
 894
 895                # Create a child span automatically
 896                with span.start_as_current_observation(name="sub-operation") as child_span:
 897                    # Do sub-operation work
 898                    child_span.update(output="sub-result")
 899
 900            # Create a tool observation
 901            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
 902                # Do tool work
 903                results = search_web(query)
 904                tool.update(output=results)
 905
 906            # Create a generation observation
 907            with langfuse.start_as_current_observation(
 908                name="answer-generation",
 909                as_type="generation",
 910                model="gpt-4"
 911            ) as generation:
 912                # Generate answer
 913                response = llm.generate(...)
 914                generation.update(output=response)
 915            ```
 916        """
 917        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 918            if trace_context:
 919                trace_id = trace_context.get("trace_id", None)
 920                parent_span_id = trace_context.get("parent_span_id", None)
 921
 922                if trace_id:
 923                    remote_parent_span = self._create_remote_parent_span(
 924                        trace_id=trace_id, parent_span_id=parent_span_id
 925                    )
 926
 927                    return cast(
 928                        Union[
 929                            _AgnosticContextManager[LangfuseGeneration],
 930                            _AgnosticContextManager[LangfuseEmbedding],
 931                        ],
 932                        self._create_span_with_parent_context(
 933                            as_type=as_type,
 934                            name=name,
 935                            remote_parent_span=remote_parent_span,
 936                            parent=None,
 937                            end_on_exit=end_on_exit,
 938                            input=input,
 939                            output=output,
 940                            metadata=metadata,
 941                            version=version,
 942                            level=level,
 943                            status_message=status_message,
 944                            completion_start_time=completion_start_time,
 945                            model=model,
 946                            model_parameters=model_parameters,
 947                            usage_details=usage_details,
 948                            cost_details=cost_details,
 949                            prompt=prompt,
 950                        ),
 951                    )
 952
 953            return cast(
 954                Union[
 955                    _AgnosticContextManager[LangfuseGeneration],
 956                    _AgnosticContextManager[LangfuseEmbedding],
 957                ],
 958                self._start_as_current_otel_span_with_processed_media(
 959                    as_type=as_type,
 960                    name=name,
 961                    end_on_exit=end_on_exit,
 962                    input=input,
 963                    output=output,
 964                    metadata=metadata,
 965                    version=version,
 966                    level=level,
 967                    status_message=status_message,
 968                    completion_start_time=completion_start_time,
 969                    model=model,
 970                    model_parameters=model_parameters,
 971                    usage_details=usage_details,
 972                    cost_details=cost_details,
 973                    prompt=prompt,
 974                ),
 975            )
 976
 977        if as_type in get_observation_types_list(ObservationTypeSpanLike):
 978            if trace_context:
 979                trace_id = trace_context.get("trace_id", None)
 980                parent_span_id = trace_context.get("parent_span_id", None)
 981
 982                if trace_id:
 983                    remote_parent_span = self._create_remote_parent_span(
 984                        trace_id=trace_id, parent_span_id=parent_span_id
 985                    )
 986
 987                    return cast(
 988                        Union[
 989                            _AgnosticContextManager[LangfuseSpan],
 990                            _AgnosticContextManager[LangfuseAgent],
 991                            _AgnosticContextManager[LangfuseTool],
 992                            _AgnosticContextManager[LangfuseChain],
 993                            _AgnosticContextManager[LangfuseRetriever],
 994                            _AgnosticContextManager[LangfuseEvaluator],
 995                            _AgnosticContextManager[LangfuseGuardrail],
 996                        ],
 997                        self._create_span_with_parent_context(
 998                            as_type=as_type,
 999                            name=name,
1000                            remote_parent_span=remote_parent_span,
1001                            parent=None,
1002                            end_on_exit=end_on_exit,
1003                            input=input,
1004                            output=output,
1005                            metadata=metadata,
1006                            version=version,
1007                            level=level,
1008                            status_message=status_message,
1009                        ),
1010                    )
1011
1012            return cast(
1013                Union[
1014                    _AgnosticContextManager[LangfuseSpan],
1015                    _AgnosticContextManager[LangfuseAgent],
1016                    _AgnosticContextManager[LangfuseTool],
1017                    _AgnosticContextManager[LangfuseChain],
1018                    _AgnosticContextManager[LangfuseRetriever],
1019                    _AgnosticContextManager[LangfuseEvaluator],
1020                    _AgnosticContextManager[LangfuseGuardrail],
1021                ],
1022                self._start_as_current_otel_span_with_processed_media(
1023                    as_type=as_type,
1024                    name=name,
1025                    end_on_exit=end_on_exit,
1026                    input=input,
1027                    output=output,
1028                    metadata=metadata,
1029                    version=version,
1030                    level=level,
1031                    status_message=status_message,
1032                ),
1033            )
1034
1035        # This should never be reached since all valid types are handled above
1036        langfuse_logger.warning(
1037            f"Unknown observation type: {as_type}, falling back to span"
1038        )
1039        return self._start_as_current_otel_span_with_processed_media(
1040            as_type="span",
1041            name=name,
1042            end_on_exit=end_on_exit,
1043            input=input,
1044            output=output,
1045            metadata=metadata,
1046            version=version,
1047            level=level,
1048            status_message=status_message,
1049        )

Create a new observation and set it as the current span in a context manager.

This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.

The created observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation (e.g., function or operation name)
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation (info, warning, error)
  • status_message: Optional status message for the observation
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
  • The following parameters apply only when as_type is "generation" or "embedding":
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A context manager that yields the appropriate observation type based on as_type

Example:
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_observation(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
def update_current_generation( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> None:
1210    def update_current_generation(
1211        self,
1212        *,
1213        name: Optional[str] = None,
1214        input: Optional[Any] = None,
1215        output: Optional[Any] = None,
1216        metadata: Optional[Any] = None,
1217        version: Optional[str] = None,
1218        level: Optional[SpanLevel] = None,
1219        status_message: Optional[str] = None,
1220        completion_start_time: Optional[datetime] = None,
1221        model: Optional[str] = None,
1222        model_parameters: Optional[Dict[str, MapValue]] = None,
1223        usage_details: Optional[Dict[str, int]] = None,
1224        cost_details: Optional[Dict[str, float]] = None,
1225        prompt: Optional[PromptClient] = None,
1226    ) -> None:
1227        """Update the current active generation span with new information.
1228
1229        This method updates the current generation span in the active context with
1230        additional information. It's useful for adding output, usage stats, or other
1231        details that become available during or after model generation.
1232
1233        Args:
1234            name: The generation name
1235            input: Updated input data for the model
1236            output: Output from the model (e.g., completions)
1237            metadata: Additional metadata to associate with the generation
1238            version: Version identifier for the model or component
1239            level: Importance level of the generation (info, warning, error)
1240            status_message: Optional status message for the generation
1241            completion_start_time: When the model started generating the response
1242            model: Name/identifier of the AI model used (e.g., "gpt-4")
1243            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1244            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1245            cost_details: Cost information for the model call
1246            prompt: Associated prompt template from Langfuse prompt management
1247
1248        Example:
1249            ```python
1250            with langfuse.start_as_current_generation(name="answer-query") as generation:
1251                # Initial setup and API call
1252                response = llm.generate(...)
1253
1254                # Update with results that weren't available at creation time
1255                langfuse.update_current_generation(
1256                    output=response.text,
1257                    usage_details={
1258                        "prompt_tokens": response.usage.prompt_tokens,
1259                        "completion_tokens": response.usage.completion_tokens
1260                    }
1261                )
1262            ```
1263        """
1264        if not self._tracing_enabled:
1265            langfuse_logger.debug(
1266                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1267            )
1268            return
1269
1270        current_otel_span = self._get_current_otel_span()
1271
1272        if current_otel_span is not None:
1273            generation = LangfuseGeneration(
1274                otel_span=current_otel_span, langfuse_client=self
1275            )
1276
1277            if name:
1278                current_otel_span.update_name(name)
1279
1280            generation.update(
1281                input=input,
1282                output=output,
1283                metadata=metadata,
1284                version=version,
1285                level=level,
1286                status_message=status_message,
1287                completion_start_time=completion_start_time,
1288                model=model,
1289                model_parameters=model_parameters,
1290                usage_details=usage_details,
1291                cost_details=cost_details,
1292                prompt=prompt,
1293            )

Update the current active generation span with new information.

This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.

Arguments:
  • name: The generation name
  • input: Updated input data for the model
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (info, warning, error)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
def update_current_span( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> None:
1295    def update_current_span(
1296        self,
1297        *,
1298        name: Optional[str] = None,
1299        input: Optional[Any] = None,
1300        output: Optional[Any] = None,
1301        metadata: Optional[Any] = None,
1302        version: Optional[str] = None,
1303        level: Optional[SpanLevel] = None,
1304        status_message: Optional[str] = None,
1305    ) -> None:
1306        """Update the current active span with new information.
1307
1308        This method updates the current span in the active context with
1309        additional information. It's useful for adding outputs or metadata
1310        that become available during execution.
1311
1312        Args:
1313            name: The span name
1314            input: Updated input data for the operation
1315            output: Output data from the operation
1316            metadata: Additional metadata to associate with the span
1317            version: Version identifier for the code or component
1318            level: Importance level of the span (info, warning, error)
1319            status_message: Optional status message for the span
1320
1321        Example:
1322            ```python
1323            with langfuse.start_as_current_observation(name="process-data") as span:
1324                # Initial processing
1325                result = process_first_part()
1326
1327                # Update with intermediate results
1328                langfuse.update_current_span(metadata={"intermediate_result": result})
1329
1330                # Continue processing
1331                final_result = process_second_part(result)
1332
1333                # Final update
1334                langfuse.update_current_span(output=final_result)
1335            ```
1336        """
1337        if not self._tracing_enabled:
1338            langfuse_logger.debug(
1339                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1340            )
1341            return
1342
1343        current_otel_span = self._get_current_otel_span()
1344
1345        if current_otel_span is not None:
1346            span = LangfuseSpan(
1347                otel_span=current_otel_span,
1348                langfuse_client=self,
1349                environment=self._environment,
1350            )
1351
1352            if name:
1353                current_otel_span.update_name(name)
1354
1355            span.update(
1356                input=input,
1357                output=output,
1358                metadata=metadata,
1359                version=version,
1360                level=level,
1361                status_message=status_message,
1362            )

Update the current active span with new information.

This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.

Arguments:
  • name: The span name
  • input: Updated input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
Example:
with langfuse.start_as_current_observation(name="process-data") as span:
    # Initial processing
    result = process_first_part()

    # Update with intermediate results
    langfuse.update_current_span(metadata={"intermediate_result": result})

    # Continue processing
    final_result = process_second_part(result)

    # Final update
    langfuse.update_current_span(output=final_result)
@deprecated('Trace-level input/output is deprecated. For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. This method will be removed in a future major version.')
def set_current_trace_io( self, *, input: Optional[Any] = None, output: Optional[Any] = None) -> None:
1364    @deprecated(
1365        "Trace-level input/output is deprecated. "
1366        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
1367        "This method will be removed in a future major version."
1368    )
1369    def set_current_trace_io(
1370        self,
1371        *,
1372        input: Optional[Any] = None,
1373        output: Optional[Any] = None,
1374    ) -> None:
1375        """Set trace-level input and output for the current span's trace.
1376
1377        .. deprecated::
1378            This is a legacy method for backward compatibility with Langfuse platform
1379            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
1380            evaluators). It will be removed in a future major version.
1381
1382            For setting other trace attributes (user_id, session_id, metadata, tags, version),
1383            use :meth:`propagate_attributes` instead.
1384
1385        Args:
1386            input: Input data to associate with the trace.
1387            output: Output data to associate with the trace.
1388        """
1389        if not self._tracing_enabled:
1390            langfuse_logger.debug(
1391                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
1392            )
1393            return
1394
1395        current_otel_span = self._get_current_otel_span()
1396
1397        if current_otel_span is not None and current_otel_span.is_recording():
1398            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1399                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1400            )
1401            # We need to preserve the class to keep the correct observation type
1402            span_class = self._get_span_class(existing_observation_type)
1403            span = span_class(
1404                otel_span=current_otel_span,
1405                langfuse_client=self,
1406                environment=self._environment,
1407            )
1408
1409            span.set_trace_io(
1410                input=input,
1411                output=output,
1412            )

Set trace-level input and output for the current span's trace.

Deprecated: This is a legacy method for backward compatibility with Langfuse platform features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future major version.

For setting other trace attributes (user_id, session_id, metadata, tags, version), use propagate_attributes() instead.

Arguments:
  • input: Input data to associate with the trace.
  • output: Output data to associate with the trace.
def set_current_trace_as_public(self) -> None:
1414    def set_current_trace_as_public(self) -> None:
1415        """Make the current trace publicly accessible via its URL.
1416
1417        When a trace is published, anyone with the trace link can view the full trace
1418        without needing to be logged in to Langfuse. This action cannot be undone
1419        programmatically - once published, the entire trace becomes public.
1420
1421        This is a convenience method that publishes the trace from the currently
1422        active span context. Use this when you want to make a trace public from
1423        within a traced function without needing direct access to the span object.
1424        """
1425        if not self._tracing_enabled:
1426            langfuse_logger.debug(
1427                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
1428            )
1429            return
1430
1431        current_otel_span = self._get_current_otel_span()
1432
1433        if current_otel_span is not None and current_otel_span.is_recording():
1434            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1435                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1436            )
1437            # We need to preserve the class to keep the correct observation type
1438            span_class = self._get_span_class(existing_observation_type)
1439            span = span_class(
1440                otel_span=current_otel_span,
1441                langfuse_client=self,
1442                environment=self._environment,
1443            )
1444
1445            span.set_trace_as_public()

Make the current trace publicly accessible via its URL.

When a trace is published, anyone with the trace link can view the full trace without needing to be logged in to Langfuse. This action cannot be undone programmatically - once published, the entire trace becomes public.

This is a convenience method that publishes the trace from the currently active span context. Use this when you want to make a trace public from within a traced function without needing direct access to the span object.

def create_event( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseEvent:
1447    def create_event(
1448        self,
1449        *,
1450        trace_context: Optional[TraceContext] = None,
1451        name: str,
1452        input: Optional[Any] = None,
1453        output: Optional[Any] = None,
1454        metadata: Optional[Any] = None,
1455        version: Optional[str] = None,
1456        level: Optional[SpanLevel] = None,
1457        status_message: Optional[str] = None,
1458    ) -> LangfuseEvent:
1459        """Create a new Langfuse observation of type 'EVENT'.
1460
1461        The created Langfuse Event observation will be the child of the current span in the context.
1462
1463        Args:
1464            trace_context: Optional context for connecting to an existing trace
1465            name: Name of the span (e.g., function or operation name)
1466            input: Input data for the operation (can be any JSON-serializable object)
1467            output: Output data from the operation (can be any JSON-serializable object)
1468            metadata: Additional metadata to associate with the span
1469            version: Version identifier for the code or component
1470            level: Importance level of the span (info, warning, error)
1471            status_message: Optional status message for the span
1472
1473        Returns:
1474            The Langfuse Event object
1475
1476        Example:
1477            ```python
1478            event = langfuse.create_event(name="process-event")
1479            ```
1480        """
1481        timestamp = time_ns()
1482
1483        if trace_context:
1484            trace_id = trace_context.get("trace_id", None)
1485            parent_span_id = trace_context.get("parent_span_id", None)
1486
1487            if trace_id:
1488                remote_parent_span = self._create_remote_parent_span(
1489                    trace_id=trace_id, parent_span_id=parent_span_id
1490                )
1491
1492                with otel_trace_api.use_span(
1493                    cast(otel_trace_api.Span, remote_parent_span)
1494                ):
1495                    otel_span = self._otel_tracer.start_span(
1496                        name=name, start_time=timestamp
1497                    )
1498                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1499
1500                    return cast(
1501                        LangfuseEvent,
1502                        LangfuseEvent(
1503                            otel_span=otel_span,
1504                            langfuse_client=self,
1505                            environment=self._environment,
1506                            input=input,
1507                            output=output,
1508                            metadata=metadata,
1509                            version=version,
1510                            level=level,
1511                            status_message=status_message,
1512                        ).end(end_time=timestamp),
1513                    )
1514
1515        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1516
1517        return cast(
1518            LangfuseEvent,
1519            LangfuseEvent(
1520                otel_span=otel_span,
1521                langfuse_client=self,
1522                environment=self._environment,
1523                input=input,
1524                output=output,
1525                metadata=metadata,
1526                version=version,
1527                level=level,
1528                status_message=status_message,
1529            ).end(end_time=timestamp),
1530        )

Create a new Langfuse observation of type 'EVENT'.

The created Langfuse Event observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
Returns:

The Langfuse Event object

Example:
event = langfuse.create_event(name="process-event")
@staticmethod
def create_trace_id(*, seed: Optional[str] = None) -> str:
1619    @staticmethod
1620    def create_trace_id(*, seed: Optional[str] = None) -> str:
1621        """Create a unique trace ID for use with Langfuse.
1622
1623        This method generates a unique trace ID for use with various Langfuse APIs.
1624        It can either generate a random ID or create a deterministic ID based on
1625        a seed string.
1626
1627        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1628        This method ensures the generated ID meets this requirement. If you need to
1629        correlate an external ID with a Langfuse trace ID, use the external ID as the
1630        seed to get a valid, deterministic Langfuse trace ID.
1631
1632        Args:
1633            seed: Optional string to use as a seed for deterministic ID generation.
1634                 If provided, the same seed will always produce the same ID.
1635                 If not provided, a random ID will be generated.
1636
1637        Returns:
1638            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1639
1640        Example:
1641            ```python
1642            # Generate a random trace ID
1643            trace_id = langfuse.create_trace_id()
1644
1645            # Generate a deterministic ID based on a seed
1646            session_trace_id = langfuse.create_trace_id(seed="session-456")
1647
1648            # Correlate an external ID with a Langfuse trace ID
1649            external_id = "external-system-123456"
1650            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1651
1652            # Use the ID with trace context
1653            with langfuse.start_as_current_observation(
1654                name="process-request",
1655                trace_context={"trace_id": trace_id}
1656            ) as span:
1657                # Operation will be part of the specific trace
1658                pass
1659            ```
1660        """
1661        if not seed:
1662            trace_id_int = RandomIdGenerator().generate_trace_id()
1663
1664            return Langfuse._format_otel_trace_id(trace_id_int)
1665
1666        return sha256(seed.encode("utf-8")).digest()[:16].hex()

Create a unique trace ID for use with Langfuse.

This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.

Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.

Arguments:
  • seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:

A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

Example:
# Generate a random trace ID
trace_id = langfuse.create_trace_id()

# Generate a deterministic ID based on a seed
session_trace_id = langfuse.create_trace_id(seed="session-456")

# Correlate an external ID with a Langfuse trace ID
external_id = "external-system-123456"
correlated_trace_id = langfuse.create_trace_id(seed=external_id)

# Use the ID with trace context
with langfuse.start_as_current_observation(
    name="process-request",
    trace_context={"trace_id": trace_id}
) as span:
    # Operation will be part of the specific trace
    pass
def create_score( self, *, name: str, value: Union[float, str], session_id: Optional[str] = None, dataset_run_id: Optional[str] = None, trace_id: Optional[str] = None, observation_id: Optional[str] = None, score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None, timestamp: Optional[datetime.datetime] = None) -> None:
1744    def create_score(
1745        self,
1746        *,
1747        name: str,
1748        value: Union[float, str],
1749        session_id: Optional[str] = None,
1750        dataset_run_id: Optional[str] = None,
1751        trace_id: Optional[str] = None,
1752        observation_id: Optional[str] = None,
1753        score_id: Optional[str] = None,
1754        data_type: Optional[ScoreDataType] = None,
1755        comment: Optional[str] = None,
1756        config_id: Optional[str] = None,
1757        metadata: Optional[Any] = None,
1758        timestamp: Optional[datetime] = None,
1759    ) -> None:
1760        """Create a score for a specific trace or observation.
1761
1762        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
1763        used to track quality metrics, user feedback, or automated evaluations.
1764
1765        Args:
1766            name: Name of the score (e.g., "relevance", "accuracy")
1767            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
1768            session_id: ID of the Langfuse session to associate the score with
1769            dataset_run_id: ID of the Langfuse dataset run to associate the score with
1770            trace_id: ID of the Langfuse trace to associate the score with
1771            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
1772            score_id: Optional custom ID for the score (auto-generated if not provided)
1773            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
1774            comment: Optional comment or explanation for the score
1775            config_id: Optional ID of a score config defined in Langfuse
1776            metadata: Optional metadata to be attached to the score
1777            timestamp: Optional timestamp for the score (defaults to current UTC time)
1778
1779        Example:
1780            ```python
1781            # Create a numeric score for accuracy
1782            langfuse.create_score(
1783                name="accuracy",
1784                value=0.92,
1785                trace_id="abcdef1234567890abcdef1234567890",
1786                data_type="NUMERIC",
1787                comment="High accuracy with minor irrelevant details"
1788            )
1789
1790            # Create a categorical score for sentiment
1791            langfuse.create_score(
1792                name="sentiment",
1793                value="positive",
1794                trace_id="abcdef1234567890abcdef1234567890",
1795                observation_id="abcdef1234567890",
1796                data_type="CATEGORICAL"
1797            )
1798            ```
1799        """
1800        if not self._tracing_enabled:
1801            return
1802
1803        score_id = score_id or self._create_observation_id()
1804
1805        try:
1806            new_body = ScoreBody(
1807                id=score_id,
1808                session_id=session_id,
1809                datasetRunId=dataset_run_id,
1810                traceId=trace_id,
1811                observationId=observation_id,
1812                name=name,
1813                value=value,
1814                dataType=data_type,  # type: ignore
1815                comment=comment,
1816                configId=config_id,
1817                environment=self._environment,
1818                metadata=metadata,
1819            )
1820
1821            event = {
1822                "id": self.create_trace_id(),
1823                "type": "score-create",
1824                "timestamp": timestamp or _get_timestamp(),
1825                "body": new_body,
1826            }
1827
1828            if self._resources is not None:
1829                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
1830                force_sample = (
1831                    not self._is_valid_trace_id(trace_id) if trace_id else True
1832                )
1833
1834                self._resources.add_score_task(
1835                    event,
1836                    force_sample=force_sample,
1837                )
1838
1839        except Exception as e:
1840            langfuse_logger.exception(
1841                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
1842            )

Create a score for a specific trace or observation.

This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • session_id: ID of the Langfuse session to associate the score with
  • dataset_run_id: ID of the Langfuse dataset run to associate the score with
  • trace_id: ID of the Langfuse trace to associate the score with
  • observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
  • timestamp: Optional timestamp for the score (defaults to current UTC time)
Example:
# Create a numeric score for accuracy
langfuse.create_score(
    name="accuracy",
    value=0.92,
    trace_id="abcdef1234567890abcdef1234567890",
    data_type="NUMERIC",
    comment="High accuracy with minor irrelevant details"
)

# Create a categorical score for sentiment
langfuse.create_score(
    name="sentiment",
    value="positive",
    trace_id="abcdef1234567890abcdef1234567890",
    observation_id="abcdef1234567890",
    data_type="CATEGORICAL"
)
def score_current_span( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
1903    def score_current_span(
1904        self,
1905        *,
1906        name: str,
1907        value: Union[float, str],
1908        score_id: Optional[str] = None,
1909        data_type: Optional[ScoreDataType] = None,
1910        comment: Optional[str] = None,
1911        config_id: Optional[str] = None,
1912        metadata: Optional[Any] = None,
1913    ) -> None:
1914        """Create a score for the current active span.
1915
1916        This method scores the currently active span in the context. It's a convenient
1917        way to score the current operation without needing to know its trace and span IDs.
1918
1919        Args:
1920            name: Name of the score (e.g., "relevance", "accuracy")
1921            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
1922            score_id: Optional custom ID for the score (auto-generated if not provided)
1923            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
1924            comment: Optional comment or explanation for the score
1925            config_id: Optional ID of a score config defined in Langfuse
1926            metadata: Optional metadata to be attached to the score
1927
1928        Example:
1929            ```python
1930            with langfuse.start_as_current_generation(name="answer-query") as generation:
1931                # Generate answer
1932                response = generate_answer(...)
1933                generation.update(output=response)
1934
1935                # Score the generation
1936                langfuse.score_current_span(
1937                    name="relevance",
1938                    value=0.85,
1939                    data_type="NUMERIC",
1940                    comment="Mostly relevant but contains some tangential information",
1941                    metadata={"model": "gpt-4", "prompt_version": "v2"}
1942                )
1943            ```
1944        """
1945        current_span = self._get_current_otel_span()
1946
1947        if current_span is not None:
1948            trace_id = self._get_otel_trace_id(current_span)
1949            observation_id = self._get_otel_span_id(current_span)
1950
1951            langfuse_logger.info(
1952                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
1953            )
1954
1955            self.create_score(
1956                trace_id=trace_id,
1957                observation_id=observation_id,
1958                name=name,
1959                value=cast(str, value),
1960                score_id=score_id,
1961                data_type=cast(Literal["CATEGORICAL"], data_type),
1962                comment=comment,
1963                config_id=config_id,
1964                metadata=metadata,
1965            )

Create a score for the current active span.

This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Generate answer
    response = generate_answer(...)
    generation.update(output=response)

    # Score the generation
    langfuse.score_current_span(
        name="relevance",
        value=0.85,
        data_type="NUMERIC",
        comment="Mostly relevant but contains some tangential information",
        metadata={"model": "gpt-4", "prompt_version": "v2"}
    )
def score_current_trace( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
1993    def score_current_trace(
1994        self,
1995        *,
1996        name: str,
1997        value: Union[float, str],
1998        score_id: Optional[str] = None,
1999        data_type: Optional[ScoreDataType] = None,
2000        comment: Optional[str] = None,
2001        config_id: Optional[str] = None,
2002        metadata: Optional[Any] = None,
2003    ) -> None:
2004        """Create a score for the current trace.
2005
2006        This method scores the trace of the currently active span. Unlike score_current_span,
2007        this method associates the score with the entire trace rather than a specific span.
2008        It's useful for scoring overall performance or quality of the entire operation.
2009
2010        Args:
2011            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2012            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2013            score_id: Optional custom ID for the score (auto-generated if not provided)
2014            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2015            comment: Optional comment or explanation for the score
2016            config_id: Optional ID of a score config defined in Langfuse
2017            metadata: Optional metadata to be attached to the score
2018
2019        Example:
2020            ```python
2021            with langfuse.start_as_current_observation(name="process-user-request") as span:
2022                # Process request
2023                result = process_complete_request()
2024                span.update(output=result)
2025
2026                # Score the overall trace
2027                langfuse.score_current_trace(
2028                    name="overall_quality",
2029                    value=0.95,
2030                    data_type="NUMERIC",
2031                    comment="High quality end-to-end response",
2032                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2033                )
2034            ```
2035        """
2036        current_span = self._get_current_otel_span()
2037
2038        if current_span is not None:
2039            trace_id = self._get_otel_trace_id(current_span)
2040
2041            langfuse_logger.info(
2042                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2043            )
2044
2045            self.create_score(
2046                trace_id=trace_id,
2047                name=name,
2048                value=cast(str, value),
2049                score_id=score_id,
2050                data_type=cast(Literal["CATEGORICAL"], data_type),
2051                comment=comment,
2052                config_id=config_id,
2053                metadata=metadata,
2054            )

Create a score for the current trace.

This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation.

Arguments:
  • name: Name of the score (e.g., "user_satisfaction", "overall_quality")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_observation(name="process-user-request") as span:
    # Process request
    result = process_complete_request()
    span.update(output=result)

    # Score the overall trace
    langfuse.score_current_trace(
        name="overall_quality",
        value=0.95,
        data_type="NUMERIC",
        comment="High quality end-to-end response",
        metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
    )
def flush(self) -> None:
2056    def flush(self) -> None:
2057        """Force flush all pending spans and events to the Langfuse API.
2058
2059        This method manually flushes any pending spans, scores, and other events to the
2060        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2061        before proceeding, without waiting for the automatic flush interval.
2062
2063        Example:
2064            ```python
2065            # Record some spans and scores
2066            with langfuse.start_as_current_observation(name="operation") as span:
2067                # Do work...
2068                pass
2069
2070            # Ensure all data is sent to Langfuse before proceeding
2071            langfuse.flush()
2072
2073            # Continue with other work
2074            ```
2075        """
2076        if self._resources is not None:
2077            self._resources.flush()

Force flush all pending spans and events to the Langfuse API.

This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.

Example:
# Record some spans and scores
with langfuse.start_as_current_observation(name="operation") as span:
    # Do work...
    pass

# Ensure all data is sent to Langfuse before proceeding
langfuse.flush()

# Continue with other work
def shutdown(self) -> None:
2079    def shutdown(self) -> None:
2080        """Shut down the Langfuse client and flush all pending data.
2081
2082        This method cleanly shuts down the Langfuse client, ensuring all pending data
2083        is flushed to the API and all background threads are properly terminated.
2084
2085        It's important to call this method when your application is shutting down to
2086        prevent data loss and resource leaks. For most applications, using the client
2087        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2088
2089        Example:
2090            ```python
2091            # Initialize Langfuse
2092            langfuse = Langfuse(public_key="...", secret_key="...")
2093
2094            # Use Langfuse throughout your application
2095            # ...
2096
2097            # When application is shutting down
2098            langfuse.shutdown()
2099            ```
2100        """
2101        if self._resources is not None:
2102            self._resources.shutdown()

Shut down the Langfuse client and flush all pending data.

This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.

It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.

Example:
# Initialize Langfuse
langfuse = Langfuse(public_key="...", secret_key="...")

# Use Langfuse throughout your application
# ...

# When application is shutting down
langfuse.shutdown()
def get_current_trace_id(self) -> Optional[str]:
2104    def get_current_trace_id(self) -> Optional[str]:
2105        """Get the trace ID of the current active span.
2106
2107        This method retrieves the trace ID from the currently active span in the context.
2108        It can be used to get the trace ID for referencing in logs, external systems,
2109        or for creating related operations.
2110
2111        Returns:
2112            The current trace ID as a 32-character lowercase hexadecimal string,
2113            or None if there is no active span.
2114
2115        Example:
2116            ```python
2117            with langfuse.start_as_current_observation(name="process-request") as span:
2118                # Get the current trace ID for reference
2119                trace_id = langfuse.get_current_trace_id()
2120
2121                # Use it for external correlation
2122                log.info(f"Processing request with trace_id: {trace_id}")
2123
2124                # Or pass to another system
2125                external_system.process(data, trace_id=trace_id)
2126            ```
2127        """
2128        if not self._tracing_enabled:
2129            langfuse_logger.debug(
2130                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2131            )
2132            return None
2133
2134        current_otel_span = self._get_current_otel_span()
2135
2136        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None

Get the trace ID of the current active span.

This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.

Returns:

The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_observation(name="process-request") as span:
    # Get the current trace ID for reference
    trace_id = langfuse.get_current_trace_id()

    # Use it for external correlation
    log.info(f"Processing request with trace_id: {trace_id}")

    # Or pass to another system
    external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
2138    def get_current_observation_id(self) -> Optional[str]:
2139        """Get the observation ID (span ID) of the current active span.
2140
2141        This method retrieves the observation ID from the currently active span in the context.
2142        It can be used to get the observation ID for referencing in logs, external systems,
2143        or for creating scores or other related operations.
2144
2145        Returns:
2146            The current observation ID as a 16-character lowercase hexadecimal string,
2147            or None if there is no active span.
2148
2149        Example:
2150            ```python
2151            with langfuse.start_as_current_observation(name="process-user-query") as span:
2152                # Get the current observation ID
2153                observation_id = langfuse.get_current_observation_id()
2154
2155                # Store it for later reference
2156                cache.set(f"query_{query_id}_observation", observation_id)
2157
2158                # Process the query...
2159            ```
2160        """
2161        if not self._tracing_enabled:
2162            langfuse_logger.debug(
2163                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2164            )
2165            return None
2166
2167        current_otel_span = self._get_current_otel_span()
2168
2169        return self._get_otel_span_id(current_otel_span) if current_otel_span else None

Get the observation ID (span ID) of the current active span.

This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.

Returns:

The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_observation(name="process-user-query") as span:
    # Get the current observation ID
    observation_id = langfuse.get_current_observation_id()

    # Store it for later reference
    cache.set(f"query_{query_id}_observation", observation_id)

    # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2182    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2183        """Get the URL to view a trace in the Langfuse UI.
2184
2185        This method generates a URL that links directly to a trace in the Langfuse UI.
2186        It's useful for providing links in logs, notifications, or debugging tools.
2187
2188        Args:
2189            trace_id: Optional trace ID to generate a URL for. If not provided,
2190                     the trace ID of the current active span will be used.
2191
2192        Returns:
2193            A URL string pointing to the trace in the Langfuse UI,
2194            or None if the project ID couldn't be retrieved or no trace ID is available.
2195
2196        Example:
2197            ```python
2198            # Get URL for the current trace
2199            with langfuse.start_as_current_observation(name="process-request") as span:
2200                trace_url = langfuse.get_trace_url()
2201                log.info(f"Processing trace: {trace_url}")
2202
2203            # Get URL for a specific trace
2204            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2205            send_notification(f"Review needed for trace: {specific_trace_url}")
2206            ```
2207        """
2208        final_trace_id = trace_id or self.get_current_trace_id()
2209        if not final_trace_id:
2210            return None
2211
2212        project_id = self._get_project_id()
2213
2214        return (
2215            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2216            if project_id and final_trace_id
2217            else None
2218        )

Get the URL to view a trace in the Langfuse UI.

This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.

Arguments:
  • trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:

A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.

Example:
# Get URL for the current trace
with langfuse.start_as_current_observation(name="process-request") as span:
    trace_url = langfuse.get_trace_url()
    log.info(f"Processing trace: {trace_url}")

# Get URL for a specific trace
specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
send_notification(f"Review needed for trace: {specific_trace_url}")
def get_dataset( self, name: str, *, fetch_items_page_size: Optional[int] = 50, version: Optional[datetime.datetime] = None) -> langfuse._client.datasets.DatasetClient:
2220    def get_dataset(
2221        self,
2222        name: str,
2223        *,
2224        fetch_items_page_size: Optional[int] = 50,
2225        version: Optional[datetime] = None,
2226    ) -> "DatasetClient":
2227        """Fetch a dataset by its name.
2228
2229        Args:
2230            name (str): The name of the dataset to fetch.
2231            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2232            version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC).
2233                If provided, returns the state of items at the specified UTC timestamp.
2234                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
2235
2236        Returns:
2237            DatasetClient: The dataset with the given name.
2238        """
2239        try:
2240            langfuse_logger.debug(f"Getting datasets {name}")
2241            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2242
2243            dataset_items = []
2244            page = 1
2245
2246            while True:
2247                new_items = self.api.dataset_items.list(
2248                    dataset_name=self._url_encode(name, is_url_param=True),
2249                    page=page,
2250                    limit=fetch_items_page_size,
2251                    version=version,
2252                )
2253                dataset_items.extend(new_items.data)
2254
2255                if new_items.meta.total_pages <= page:
2256                    break
2257
2258                page += 1
2259
2260            return DatasetClient(
2261                dataset=dataset,
2262                items=dataset_items,
2263                version=version,
2264                langfuse_client=self,
2265            )
2266
2267        except Error as e:
2268            handle_fern_exception(e)
2269            raise e

Fetch a dataset by its name.

Arguments:
  • name (str): The name of the dataset to fetch.
  • fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
  • version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC). If provided, returns the state of items at the specified UTC timestamp. If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
Returns:

DatasetClient: The dataset with the given name.

def get_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DatasetRunWithItems:
2271    def get_dataset_run(
2272        self, *, dataset_name: str, run_name: str
2273    ) -> DatasetRunWithItems:
2274        """Fetch a dataset run by dataset name and run name.
2275
2276        Args:
2277            dataset_name (str): The name of the dataset.
2278            run_name (str): The name of the run.
2279
2280        Returns:
2281            DatasetRunWithItems: The dataset run with its items.
2282        """
2283        try:
2284            return cast(
2285                DatasetRunWithItems,
2286                self.api.datasets.get_run(
2287                    dataset_name=self._url_encode(dataset_name),
2288                    run_name=self._url_encode(run_name),
2289                    request_options=None,
2290                ),
2291            )
2292        except Error as e:
2293            handle_fern_exception(e)
2294            raise e

Fetch a dataset run by dataset name and run name.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DatasetRunWithItems: The dataset run with its items.

def get_dataset_runs( self, *, dataset_name: str, page: Optional[int] = None, limit: Optional[int] = None) -> langfuse.api.PaginatedDatasetRuns:
2296    def get_dataset_runs(
2297        self,
2298        *,
2299        dataset_name: str,
2300        page: Optional[int] = None,
2301        limit: Optional[int] = None,
2302    ) -> PaginatedDatasetRuns:
2303        """Fetch all runs for a dataset.
2304
2305        Args:
2306            dataset_name (str): The name of the dataset.
2307            page (Optional[int]): Page number, starts at 1.
2308            limit (Optional[int]): Limit of items per page.
2309
2310        Returns:
2311            PaginatedDatasetRuns: Paginated list of dataset runs.
2312        """
2313        try:
2314            return cast(
2315                PaginatedDatasetRuns,
2316                self.api.datasets.get_runs(
2317                    dataset_name=self._url_encode(dataset_name),
2318                    page=page,
2319                    limit=limit,
2320                    request_options=None,
2321                ),
2322            )
2323        except Error as e:
2324            handle_fern_exception(e)
2325            raise e

Fetch all runs for a dataset.

Arguments:
  • dataset_name (str): The name of the dataset.
  • page (Optional[int]): Page number, starts at 1.
  • limit (Optional[int]): Limit of items per page.
Returns:

PaginatedDatasetRuns: Paginated list of dataset runs.

def delete_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DeleteDatasetRunResponse:
2327    def delete_dataset_run(
2328        self, *, dataset_name: str, run_name: str
2329    ) -> DeleteDatasetRunResponse:
2330        """Delete a dataset run and all its run items. This action is irreversible.
2331
2332        Args:
2333            dataset_name (str): The name of the dataset.
2334            run_name (str): The name of the run.
2335
2336        Returns:
2337            DeleteDatasetRunResponse: Confirmation of deletion.
2338        """
2339        try:
2340            return cast(
2341                DeleteDatasetRunResponse,
2342                self.api.datasets.delete_run(
2343                    dataset_name=self._url_encode(dataset_name),
2344                    run_name=self._url_encode(run_name),
2345                    request_options=None,
2346                ),
2347            )
2348        except Error as e:
2349            handle_fern_exception(e)
2350            raise e

Delete a dataset run and all its run items. This action is irreversible.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DeleteDatasetRunResponse: Confirmation of deletion.

def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem]], task: langfuse.experiment.TaskFunction, evaluators: List[langfuse.experiment.EvaluatorFunction] = [], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, run_evaluators: List[langfuse.experiment.RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None, _dataset_version: Optional[datetime.datetime] = None) -> langfuse.experiment.ExperimentResult:
2352    def run_experiment(
2353        self,
2354        *,
2355        name: str,
2356        run_name: Optional[str] = None,
2357        description: Optional[str] = None,
2358        data: ExperimentData,
2359        task: TaskFunction,
2360        evaluators: List[EvaluatorFunction] = [],
2361        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2362        run_evaluators: List[RunEvaluatorFunction] = [],
2363        max_concurrency: int = 50,
2364        metadata: Optional[Dict[str, str]] = None,
2365        _dataset_version: Optional[datetime] = None,
2366    ) -> ExperimentResult:
2367        """Run an experiment on a dataset with automatic tracing and evaluation.
2368
2369        This method executes a task function on each item in the provided dataset,
2370        automatically traces all executions with Langfuse for observability, runs
2371        item-level and run-level evaluators on the outputs, and returns comprehensive
2372        results with evaluation metrics.
2373
2374        The experiment system provides:
2375        - Automatic tracing of all task executions
2376        - Concurrent processing with configurable limits
2377        - Comprehensive error handling that isolates failures
2378        - Integration with Langfuse datasets for experiment tracking
2379        - Flexible evaluation framework supporting both sync and async evaluators
2380
2381        Args:
2382            name: Human-readable name for the experiment. Used for identification
2383                in the Langfuse UI.
2384            run_name: Optional exact name for the experiment run. If provided, this will be
2385                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2386                If not provided, this will default to the experiment name appended with an ISO timestamp.
2387            description: Optional description explaining the experiment's purpose,
2388                methodology, or expected outcomes.
2389            data: Array of data items to process. Can be either:
2390                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2391                - List of Langfuse DatasetItem objects from dataset.items
2392            task: Function that processes each data item and returns output.
2393                Must accept 'item' as keyword argument and can return sync or async results.
2394                The task function signature should be: task(*, item, **kwargs) -> Any
2395            evaluators: List of functions to evaluate each item's output individually.
2396                Each evaluator receives input, output, expected_output, and metadata.
2397                Can return single Evaluation dict or list of Evaluation dicts.
2398            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2399                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2400                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2401                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2402            run_evaluators: List of functions to evaluate the entire experiment run.
2403                Each run evaluator receives all item_results and can compute aggregate metrics.
2404                Useful for calculating averages, distributions, or cross-item comparisons.
2405            max_concurrency: Maximum number of concurrent task executions (default: 50).
2406                Controls the number of items processed simultaneously. Adjust based on
2407                API rate limits and system resources.
2408            metadata: Optional metadata dictionary to attach to all experiment traces.
2409                This metadata will be included in every trace created during the experiment.
2410                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2411
2412        Returns:
2413            ExperimentResult containing:
2414            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2415            - item_results: List of results for each processed item with outputs and evaluations
2416            - run_evaluations: List of aggregate evaluation results for the entire run
2417            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2418            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2419
2420        Raises:
2421            ValueError: If required parameters are missing or invalid
2422            Exception: If experiment setup fails (individual item failures are handled gracefully)
2423
2424        Examples:
2425            Basic experiment with local data:
2426            ```python
2427            def summarize_text(*, item, **kwargs):
2428                return f"Summary: {item['input'][:50]}..."
2429
2430            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2431                return {
2432                    "name": "output_length",
2433                    "value": len(output),
2434                    "comment": f"Output contains {len(output)} characters"
2435                }
2436
2437            result = langfuse.run_experiment(
2438                name="Text Summarization Test",
2439                description="Evaluate summarization quality and length",
2440                data=[
2441                    {"input": "Long article text...", "expected_output": "Expected summary"},
2442                    {"input": "Another article...", "expected_output": "Another summary"}
2443                ],
2444                task=summarize_text,
2445                evaluators=[length_evaluator]
2446            )
2447
2448            print(f"Processed {len(result.item_results)} items")
2449            for item_result in result.item_results:
2450                print(f"Input: {item_result.item['input']}")
2451                print(f"Output: {item_result.output}")
2452                print(f"Evaluations: {item_result.evaluations}")
2453            ```
2454
2455            Advanced experiment with async task and multiple evaluators:
2456            ```python
2457            async def llm_task(*, item, **kwargs):
2458                # Simulate async LLM call
2459                response = await openai_client.chat.completions.create(
2460                    model="gpt-4",
2461                    messages=[{"role": "user", "content": item["input"]}]
2462                )
2463                return response.choices[0].message.content
2464
2465            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2466                if expected_output and expected_output.lower() in output.lower():
2467                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2468                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2469
2470            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2471                # Simulate toxicity check
2472                toxicity_score = check_toxicity(output)  # Your toxicity checker
2473                return {
2474                    "name": "toxicity",
2475                    "value": toxicity_score,
2476                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2477                }
2478
2479            def average_accuracy(*, item_results, **kwargs):
2480                accuracies = [
2481                    eval.value for result in item_results
2482                    for eval in result.evaluations
2483                    if eval.name == "accuracy"
2484                ]
2485                return {
2486                    "name": "average_accuracy",
2487                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2488                    "comment": f"Average accuracy across {len(accuracies)} items"
2489                }
2490
2491            result = langfuse.run_experiment(
2492                name="LLM Safety and Accuracy Test",
2493                description="Evaluate model accuracy and safety across diverse prompts",
2494                data=test_dataset,  # Your dataset items
2495                task=llm_task,
2496                evaluators=[accuracy_evaluator, toxicity_evaluator],
2497                run_evaluators=[average_accuracy],
2498                max_concurrency=5,  # Limit concurrent API calls
2499                metadata={"model": "gpt-4", "temperature": 0.7}
2500            )
2501            ```
2502
2503            Using with Langfuse datasets:
2504            ```python
2505            # Get dataset from Langfuse
2506            dataset = langfuse.get_dataset("my-eval-dataset")
2507
2508            result = dataset.run_experiment(
2509                name="Production Model Evaluation",
2510                description="Monthly evaluation of production model performance",
2511                task=my_production_task,
2512                evaluators=[accuracy_evaluator, latency_evaluator]
2513            )
2514
2515            # Results automatically linked to dataset in Langfuse UI
2516            print(f"View results: {result['dataset_run_url']}")
2517            ```
2518
2519        Note:
2520            - Task and evaluator functions can be either synchronous or asynchronous
2521            - Individual item failures are logged but don't stop the experiment
2522            - All executions are automatically traced and visible in Langfuse UI
2523            - When using Langfuse datasets, results are automatically linked for easy comparison
2524            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2525            - Async execution is handled automatically with smart event loop detection
2526        """
            # NOTE(review): `evaluators` and `run_evaluators` use mutable list
            # defaults ([]). Harmless here because the call below passes
            # `evaluators or []` / `run_evaluators or []` and the defaults are
            # never mutated, but `None` sentinels would be the cleaner idiom.
            # run_async_safely executes the async implementation from this sync
            # method, handling an already-running event loop (e.g. notebooks).
2527        return cast(
2528            ExperimentResult,
2529            run_async_safely(
2530                self._run_experiment_async(
2531                    name=name,
2532                    run_name=self._create_experiment_run_name(
2533                        name=name, run_name=run_name
2534                    ),
2535                    description=description,
2536                    data=data,
2537                    task=task,
2538                    evaluators=evaluators or [],
2539                    composite_evaluator=composite_evaluator,
2540                    run_evaluators=run_evaluators or [],
2541                    max_concurrency=max_concurrency,
2542                    metadata=metadata,
2543                    dataset_version=_dataset_version,
2544                ),
2545            ),
2546        )

Run an experiment on a dataset with automatic tracing and evaluation.

This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.

The experiment system provides:

  • Automatic tracing of all task executions
  • Concurrent processing with configurable limits
  • Comprehensive error handling that isolates failures
  • Integration with Langfuse datasets for experiment tracking
  • Flexible evaluation framework supporting both sync and async evaluators
Arguments:
  • name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
  • run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the data contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
  • description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
  • data: Array of data items to process. Can be either:
    • List of dict-like items with 'input', 'expected_output', 'metadata' keys
    • List of Langfuse DatasetItem objects from dataset.items
  • task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
  • evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
  • composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
  • run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
  • max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
  • metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If data are Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:

ExperimentResult containing:

  • run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
  • item_results: List of results for each processed item with outputs and evaluations
  • run_evaluations: List of aggregate evaluation results for the entire run
  • dataset_run_id: ID of the dataset run (if using Langfuse datasets)
  • dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
  • ValueError: If required parameters are missing or invalid
  • Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:

Basic experiment with local data:

def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")

Advanced experiment with async task and multiple evaluators:

async def llm_task(*, item, **kwargs):
    # Simulate async LLM call
    response = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": item["input"]}]
    )
    return response.choices[0].message.content

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output and expected_output.lower() in output.lower():
        return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
    return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
    # Simulate toxicity check
    toxicity_score = check_toxicity(output)  # Your toxicity checker
    return {
        "name": "toxicity",
        "value": toxicity_score,
        "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
    }

def average_accuracy(*, item_results, **kwargs):
    accuracies = [
        eval.value for result in item_results
        for eval in result.evaluations
        if eval.name == "accuracy"
    ]
    return {
        "name": "average_accuracy",
        "value": sum(accuracies) / len(accuracies) if accuracies else 0,
        "comment": f"Average accuracy across {len(accuracies)} items"
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    description="Evaluate model accuracy and safety across diverse prompts",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    run_evaluators=[average_accuracy],
    max_concurrency=5,  # Limit concurrent API calls
    metadata={"model": "gpt-4", "temperature": 0.7}
)

Using with Langfuse datasets:

# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result['dataset_run_url']}")
Note:
  • Task and evaluator functions can be either synchronous or asynchronous
  • Individual item failures are logged but don't stop the experiment
  • All executions are automatically traced and visible in Langfuse UI
  • When using Langfuse datasets, results are automatically linked for easy comparison
  • This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
  • Async execution is handled automatically with smart event loop detection
def run_batched_evaluation( self, *, scope: Literal['traces', 'observations'], mapper: MapperFunction, filter: Optional[str] = None, fetch_batch_size: int = 50, fetch_trace_fields: Optional[str] = None, max_items: Optional[int] = None, max_retries: int = 3, evaluators: List[langfuse.experiment.EvaluatorFunction], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, max_concurrency: int = 5, metadata: Optional[Dict[str, Any]] = None, _add_observation_scores_to_trace: bool = False, _additional_trace_tags: Optional[List[str]] = None, resume_from: Optional[BatchEvaluationResumeToken] = None, verbose: bool = False) -> BatchEvaluationResult:
2892    def run_batched_evaluation(
2893        self,
2894        *,
2895        scope: Literal["traces", "observations"],
2896        mapper: MapperFunction,
2897        filter: Optional[str] = None,
2898        fetch_batch_size: int = 50,
2899        fetch_trace_fields: Optional[str] = None,
2900        max_items: Optional[int] = None,
2901        max_retries: int = 3,
2902        evaluators: List[EvaluatorFunction],
2903        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2904        max_concurrency: int = 5,
2905        metadata: Optional[Dict[str, Any]] = None,
2906        _add_observation_scores_to_trace: bool = False,
2907        _additional_trace_tags: Optional[List[str]] = None,
2908        resume_from: Optional[BatchEvaluationResumeToken] = None,
2909        verbose: bool = False,
2910    ) -> BatchEvaluationResult:
2911        """Fetch traces or observations and run evaluations on each item.
2912
2913        This method provides a powerful way to evaluate existing data in Langfuse at scale.
2914        It fetches items based on filters, transforms them using a mapper function, runs
2915        evaluators on each item, and creates scores that are linked back to the original
2916        entities. This is ideal for:
2917
2918        - Running evaluations on production traces after deployment
2919        - Backtesting new evaluation metrics on historical data
2920        - Batch scoring of observations for quality monitoring
2921        - Periodic evaluation runs on recent data
2922
2923        The method uses a streaming/pipeline approach to process items in batches, making
2924        it memory-efficient for large datasets. It includes comprehensive error handling,
2925        retry logic, and resume capability for long-running evaluations.
2926
2927        Args:
2928            scope: The type of items to evaluate. Must be one of:
2929                - "traces": Evaluate complete traces with all their observations
2930                - "observations": Evaluate individual observations (spans, generations, events)
2931            mapper: Function that transforms API response objects into evaluator inputs.
2932                Receives a trace/observation object and returns an EvaluatorInputs
2933                instance with input, output, expected_output, and metadata fields.
2934                Can be sync or async.
2935            evaluators: List of evaluation functions to run on each item. Each evaluator
2936                receives the mapped inputs and returns Evaluation object(s). Evaluator
2937                failures are logged but don't stop the batch evaluation.
2938            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
2939                - '{"tags": ["production"]}'
2940                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
2941                Default: None (fetches all items).
2942            fetch_batch_size: Number of items to fetch per API call and hold in memory.
2943                Larger values may be faster but use more memory. Default: 50.
2944            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
2945            max_items: Maximum total number of items to process. If None, processes all
2946                items matching the filter. Useful for testing or limiting evaluation runs.
2947                Default: None (process all).
2948            max_concurrency: Maximum number of items to evaluate concurrently. Controls
2949                parallelism and resource usage. Default: 5.
2950            composite_evaluator: Optional function that creates a composite score from
2951                item-level evaluations. Receives the original item and its evaluations,
2952                returns a single Evaluation. Useful for weighted averages or combined metrics.
2953                Default: None.
2954            metadata: Optional metadata dict to add to all created scores. Useful for
2955                tracking evaluation runs, versions, or other context. Default: None.
2956            max_retries: Maximum number of retry attempts for failed batch fetches.
2957                Uses exponential backoff (1s, 2s, 4s). Default: 3.
2958            verbose: If True, logs progress information to console. Useful for monitoring
2959                long-running evaluations. Default: False.
2960            resume_from: Optional resume token from a previous incomplete run. Allows
2961                continuing evaluation after interruption or failure. Default: None.
2962
2963
2964        Returns:
2965            BatchEvaluationResult containing:
2966                - total_items_fetched: Number of items fetched from API
2967                - total_items_processed: Number of items successfully evaluated
2968                - total_items_failed: Number of items that failed evaluation
2969                - total_scores_created: Scores created by item-level evaluators
2970                - total_composite_scores_created: Scores created by composite evaluator
2971                - total_evaluations_failed: Individual evaluator failures
2972                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
2973                - resume_token: Token for resuming if incomplete (None if completed)
2974                - completed: True if all items processed
2975                - duration_seconds: Total execution time
2976                - failed_item_ids: IDs of items that failed
2977                - error_summary: Error types and counts
2978                - has_more_items: True if max_items reached but more exist
2979
2980        Raises:
2981            ValueError: If invalid scope is provided.
2982
2983        Examples:
2984            Basic trace evaluation:
2985            ```python
2986            from langfuse import Langfuse, EvaluatorInputs, Evaluation
2987
2988            client = Langfuse()
2989
2990            # Define mapper to extract fields from traces
2991            def trace_mapper(trace):
2992                return EvaluatorInputs(
2993                    input=trace.input,
2994                    output=trace.output,
2995                    expected_output=None,
2996                    metadata={"trace_id": trace.id}
2997                )
2998
2999            # Define evaluator
3000            def length_evaluator(*, input, output, expected_output, metadata):
3001                return Evaluation(
3002                    name="output_length",
3003                    value=len(output) if output else 0
3004                )
3005
3006            # Run batch evaluation
3007            result = client.run_batched_evaluation(
3008                scope="traces",
3009                mapper=trace_mapper,
3010                evaluators=[length_evaluator],
3011                filter='{"tags": ["production"]}',
3012                max_items=1000,
3013                verbose=True
3014            )
3015
3016            print(f"Processed {result.total_items_processed} traces")
3017            print(f"Created {result.total_scores_created} scores")
3018            ```
3019
3020            Evaluation with composite scorer:
3021            ```python
3022            def accuracy_evaluator(*, input, output, expected_output, metadata):
3023                # ... evaluation logic
3024                return Evaluation(name="accuracy", value=0.85)
3025
3026            def relevance_evaluator(*, input, output, expected_output, metadata):
3027                # ... evaluation logic
3028                return Evaluation(name="relevance", value=0.92)
3029
3030            def composite_evaluator(*, item, evaluations):
3031                # Weighted average of evaluations
3032                weights = {"accuracy": 0.6, "relevance": 0.4}
3033                total = sum(
3034                    e.value * weights.get(e.name, 0)
3035                    for e in evaluations
3036                    if isinstance(e.value, (int, float))
3037                )
3038                return Evaluation(
3039                    name="composite_score",
3040                    value=total,
3041                    comment=f"Weighted average of {len(evaluations)} metrics"
3042                )
3043
3044            result = client.run_batched_evaluation(
3045                scope="traces",
3046                mapper=trace_mapper,
3047                evaluators=[accuracy_evaluator, relevance_evaluator],
3048                composite_evaluator=composite_evaluator,
3049                filter='{"user_id": "important_user"}',
3050                verbose=True
3051            )
3052            ```
3053
3054            Handling incomplete runs with resume:
3055            ```python
3056            # Initial run that may fail or timeout
3057            result = client.run_batched_evaluation(
3058                scope="observations",
3059                mapper=obs_mapper,
3060                evaluators=[my_evaluator],
3061                max_items=10000,
3062                verbose=True
3063            )
3064
3065            # Check if incomplete
3066            if not result.completed and result.resume_token:
3067                print(f"Processed {result.resume_token.items_processed} items before interruption")
3068
3069                # Resume from where it left off
3070                result = client.run_batched_evaluation(
3071                    scope="observations",
3072                    mapper=obs_mapper,
3073                    evaluators=[my_evaluator],
3074                    resume_from=result.resume_token,
3075                    verbose=True
3076                )
3077
3078            print(f"Total items processed: {result.total_items_processed}")
3079            ```
3080
3081            Monitoring evaluator performance:
3082            ```python
3083            result = client.run_batched_evaluation(...)
3084
3085            for stats in result.evaluator_stats:
3086                success_rate = stats.successful_runs / stats.total_runs
3087                print(f"{stats.name}:")
3088                print(f"  Success rate: {success_rate:.1%}")
3089                print(f"  Scores created: {stats.total_scores_created}")
3090
3091                if stats.failed_runs > 0:
3092                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3093            ```
3094
3095        Note:
3096            - Evaluator failures are logged but don't stop the batch evaluation
3097            - Individual item failures are tracked but don't stop processing
3098            - Fetch failures are retried with exponential backoff
3099            - All scores are automatically flushed to Langfuse at the end
3100            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3101        """
            # Delegate the streaming fetch/evaluate pipeline to the runner.
3102        runner = BatchEvaluationRunner(self)
3103
            # run_async_safely executes the async pipeline from this sync
            # method, even when called inside an active event loop.
3104        return cast(
3105            BatchEvaluationResult,
3106            run_async_safely(
3107                runner.run_async(
3108                    scope=scope,
3109                    mapper=mapper,
3110                    evaluators=evaluators,
3111                    filter=filter,
3112                    fetch_batch_size=fetch_batch_size,
3113                    fetch_trace_fields=fetch_trace_fields,
3114                    max_items=max_items,
3115                    max_concurrency=max_concurrency,
3116                    composite_evaluator=composite_evaluator,
3117                    metadata=metadata,
3118                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
3119                    _additional_trace_tags=_additional_trace_tags,
3120                    max_retries=max_retries,
3121                    verbose=verbose,
3122                    resume_from=resume_from,
3123                )
3124            ),
3125        )

Fetch traces or observations and run evaluations on each item.

This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:

  • Running evaluations on production traces after deployment
  • Backtesting new evaluation metrics on historical data
  • Batch scoring of observations for quality monitoring
  • Periodic evaluation runs on recent data

The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.

Arguments:
  • scope: The type of items to evaluate. Must be one of:
    • "traces": Evaluate complete traces with all their observations
    • "observations": Evaluate individual observations (spans, generations, events)
  • mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
  • evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
  • filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
    • '{"tags": ["production"]}'
    • '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
  Default: None (fetches all items).
  • fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
  • fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
  • max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
  • max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 5.
  • composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
  • metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
  • max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
  • verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
  • resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:

BatchEvaluationResult containing:

  • total_items_fetched: Number of items fetched from API
  • total_items_processed: Number of items successfully evaluated
  • total_items_failed: Number of items that failed evaluation
  • total_scores_created: Scores created by item-level evaluators
  • total_composite_scores_created: Scores created by composite evaluator
  • total_evaluations_failed: Individual evaluator failures
  • evaluator_stats: Per-evaluator statistics (success rate, scores created)
  • resume_token: Token for resuming if incomplete (None if completed)
  • completed: True if all items processed
  • duration_seconds: Total execution time
  • failed_item_ids: IDs of items that failed
  • error_summary: Error types and counts
  • has_more_items: True if max_items reached but more exist

Raises:
  • ValueError: If invalid scope is provided.
Examples:

Basic trace evaluation:

from langfuse import Langfuse, EvaluatorInputs, Evaluation

client = Langfuse()

# Define mapper to extract fields from traces
def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,
        metadata={"trace_id": trace.id}
    )

# Define evaluator
def length_evaluator(*, input, output, expected_output, metadata):
    return Evaluation(
        name="output_length",
        value=len(output) if output else 0
    )

# Run batch evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    filter='{"tags": ["production"]}',
    max_items=1000,
    verbose=True
)

print(f"Processed {result.total_items_processed} traces")
print(f"Created {result.total_scores_created} scores")

Evaluation with composite scorer:

def accuracy_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="accuracy", value=0.85)

def relevance_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="relevance", value=0.92)

def composite_evaluator(*, item, evaluations):
    # Weighted average of evaluations
    weights = {"accuracy": 0.6, "relevance": 0.4}
    total = sum(
        e.value * weights.get(e.name, 0)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(
        name="composite_score",
        value=total,
        comment=f"Weighted average of {len(evaluations)} metrics"
    )

result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[accuracy_evaluator, relevance_evaluator],
    composite_evaluator=composite_evaluator,
    filter='{"user_id": "important_user"}',
    verbose=True
)

Handling incomplete runs with resume:

# Initial run that may fail or timeout
result = client.run_batched_evaluation(
    scope="observations",
    mapper=obs_mapper,
    evaluators=[my_evaluator],
    max_items=10000,
    verbose=True
)

# Check if incomplete
if not result.completed and result.resume_token:
    print(f"Processed {result.resume_token.items_processed} items before interruption")

    # Resume from where it left off
    result = client.run_batched_evaluation(
        scope="observations",
        mapper=obs_mapper,
        evaluators=[my_evaluator],
        resume_from=result.resume_token,
        verbose=True
    )

print(f"Total items processed: {result.total_items_processed}")

Monitoring evaluator performance:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs
    print(f"{stats.name}:")
    print(f"  Success rate: {success_rate:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠ī¸  Failed {stats.failed_runs} times")
Note:
  • Evaluator failures are logged but don't stop the batch evaluation
  • Individual item failures are tracked but don't stop processing
  • Fetch failures are retried with exponential backoff
  • All scores are automatically flushed to Langfuse at the end
  • The resume mechanism uses timestamp-based filtering to avoid duplicates
def auth_check(self) -> bool:
3127    def auth_check(self) -> bool:
3128        """Check if the provided credentials (public and secret key) are valid.
3129
3130        Raises:
3131            Exception: If no projects were found for the provided credentials.
3132
3133        Note:
3134            This method is blocking. It is discouraged to use it in production code.
3135        """
3136        try:
3137            projects = self.api.projects.get()
3138            langfuse_logger.debug(
3139                f"Auth check successful, found {len(projects.data)} projects"
3140            )
3141            if len(projects.data) == 0:
3142                raise Exception(
3143                    "Auth check failed, no project found for the keys provided."
3144                )
3145            return True
3146
3147        except AttributeError as e:
3148            langfuse_logger.warning(
3149                f"Auth check failed: Client not properly initialized. Error: {e}"
3150            )
3151            return False
3152
3153        except Error as e:
3154            handle_fern_exception(e)
3155            raise e

Check if the provided credentials (public and secret key) are valid.

Raises:
  • Exception: If no projects were found for the provided credentials.
Note:

This method is blocking. It is discouraged to use it in production code.

def create_dataset( self, *, name: str, description: Optional[str] = None, metadata: Optional[Any] = None, input_schema: Optional[Any] = None, expected_output_schema: Optional[Any] = None) -> langfuse.api.Dataset:
3157    def create_dataset(
3158        self,
3159        *,
3160        name: str,
3161        description: Optional[str] = None,
3162        metadata: Optional[Any] = None,
3163        input_schema: Optional[Any] = None,
3164        expected_output_schema: Optional[Any] = None,
3165    ) -> Dataset:
3166        """Create a dataset with the given name on Langfuse.
3167
3168        Args:
3169            name: Name of the dataset to create.
3170            description: Description of the dataset. Defaults to None.
3171            metadata: Additional metadata. Defaults to None.
3172            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3173            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3174
3175        Returns:
3176            Dataset: The created dataset as returned by the Langfuse API.
3177        """
3178        try:
3179            langfuse_logger.debug(f"Creating datasets {name}")
3180
3181            result = self.api.datasets.create(
3182                name=name,
3183                description=description,
3184                metadata=metadata,
3185                input_schema=input_schema,
3186                expected_output_schema=expected_output_schema,
3187            )
3188
3189            return cast(Dataset, result)
3190
3191        except Error as e:
3192            handle_fern_exception(e)
3193            raise e

Create a dataset with the given name on Langfuse.

Arguments:
  • name: Name of the dataset to create.
  • description: Description of the dataset. Defaults to None.
  • metadata: Additional metadata. Defaults to None.
  • input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
  • expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:

Dataset: The created dataset as returned by the Langfuse API.

def create_dataset_item( self, *, dataset_name: str, input: Optional[Any] = None, expected_output: Optional[Any] = None, metadata: Optional[Any] = None, source_trace_id: Optional[str] = None, source_observation_id: Optional[str] = None, status: Optional[langfuse.api.DatasetStatus] = None, id: Optional[str] = None) -> langfuse.api.DatasetItem:
3195    def create_dataset_item(
3196        self,
3197        *,
3198        dataset_name: str,
3199        input: Optional[Any] = None,
3200        expected_output: Optional[Any] = None,
3201        metadata: Optional[Any] = None,
3202        source_trace_id: Optional[str] = None,
3203        source_observation_id: Optional[str] = None,
3204        status: Optional[DatasetStatus] = None,
3205        id: Optional[str] = None,
3206    ) -> DatasetItem:
3207        """Create a dataset item.
3208
3209        Upserts if an item with id already exists.
3210
3211        Args:
3212            dataset_name: Name of the dataset in which the dataset item should be created.
3213            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3214            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3215            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3216            source_trace_id: Id of the source trace. Defaults to None.
3217            source_observation_id: Id of the source observation. Defaults to None.
3218            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3219            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3220
3221        Returns:
3222            DatasetItem: The created dataset item as returned by the Langfuse API.
3223
3224        Example:
3225            ```python
3226            from langfuse import Langfuse
3227
3228            langfuse = Langfuse()
3229
3230            # Uploading items to the Langfuse dataset named "capital_cities"
3231            langfuse.create_dataset_item(
3232                dataset_name="capital_cities",
3233                input={"input": {"country": "Italy"}},
3234                expected_output={"expected_output": "Rome"},
3235                metadata={"foo": "bar"}
3236            )
3237            ```
3238        """
3239        try:
3240            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
3241
3242            result = self.api.dataset_items.create(
3243                dataset_name=dataset_name,
3244                input=input,
3245                expected_output=expected_output,
3246                metadata=metadata,
3247                source_trace_id=source_trace_id,
3248                source_observation_id=source_observation_id,
3249                status=status,
3250                id=id,
3251            )
3252
3253            return cast(DatasetItem, result)
3254        except Error as e:
3255            handle_fern_exception(e)
3256            raise e

Create a dataset item.

Upserts if an item with id already exists.

Arguments:
  • dataset_name: Name of the dataset in which the dataset item should be created.
  • input: Input data. Defaults to None. Can contain any dict, list or scalar.
  • expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
  • metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
  • source_trace_id: Id of the source trace. Defaults to None.
  • source_observation_id: Id of the source observation. Defaults to None.
  • status: Status of the dataset item. Defaults to ACTIVE for newly created items.
  • id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:

DatasetItem: The created dataset item as returned by the Langfuse API.

Example:
from langfuse import Langfuse

langfuse = Langfuse()

# Uploading items to the Langfuse dataset named "capital_cities"
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"input": {"country": "Italy"}},
    expected_output={"expected_output": "Rome"},
    metadata={"foo": "bar"}
)
def resolve_media_references( self, *, obj: Any, resolve_with: Literal['base64_data_uri'], max_depth: int = 10, content_fetch_timeout_seconds: int = 5) -> Any:
3258    def resolve_media_references(
3259        self,
3260        *,
3261        obj: Any,
3262        resolve_with: Literal["base64_data_uri"],
3263        max_depth: int = 10,
3264        content_fetch_timeout_seconds: int = 5,
3265    ) -> Any:
3266        """Replace media reference strings in an object with base64 data URIs.
3267
3268        This method recursively traverses an object (up to max_depth) looking for media reference strings
3269        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3270        the provided Langfuse client and replaces the reference string with a base64 data URI.
3271
3272        If fetching media content fails for a reference string, a warning is logged and the reference
3273        string is left unchanged.
3274
3275        Args:
3276            obj: The object to process. Can be a primitive value, array, or nested object.
3277                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3278            resolve_with: The representation of the media content to replace the media reference string with.
3279                Currently only "base64_data_uri" is supported.
3280            max_depth: int: The maximum depth to traverse the object. Default is 10.
3281            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3282
3283        Returns:
3284            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3285            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3286
3287        Example:
3288            obj = {
3289                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3290                "nested": {
3291                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3292                }
3293            }
3294
3295            result = LangfuseMedia.resolve_media_references(obj, langfuse_client)
3296
3297            # Result:
3298            # {
3299            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3300            #     "nested": {
3301            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3302            #     }
3303            # }
3304        """
3305        return LangfuseMedia.resolve_media_references(
3306            langfuse_client=self,
3307            obj=obj,
3308            resolve_with=resolve_with,
3309            max_depth=max_depth,
3310            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3311        )

Replace media reference strings in an object with base64 data URIs.

This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.

If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.

Arguments:
  • obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
  • resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
  • max_depth: int: The maximum depth to traverse the object. Default is 10.
  • content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:

A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

Example:

obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }

result = LangfuseMedia.resolve_media_references(obj, langfuse_client)

Result:

{

"image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",

"nested": {

"pdf": "data:application/pdf;base64,JVBERi0xLjcK..."

}

}

def get_prompt( self, name: str, *, version: Optional[int] = None, label: Optional[str] = None, type: Literal['chat', 'text'] = 'text', cache_ttl_seconds: Optional[int] = None, fallback: Union[List[langfuse.model.ChatMessageDict], NoneType, str] = None, max_retries: Optional[int] = None, fetch_timeout_seconds: Optional[int] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3341    def get_prompt(
3342        self,
3343        name: str,
3344        *,
3345        version: Optional[int] = None,
3346        label: Optional[str] = None,
3347        type: Literal["chat", "text"] = "text",
3348        cache_ttl_seconds: Optional[int] = None,
3349        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3350        max_retries: Optional[int] = None,
3351        fetch_timeout_seconds: Optional[int] = None,
3352    ) -> PromptClient:
3353        """Get a prompt.
3354
3355        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3356        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3357        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3358        return the expired prompt as a fallback.
3359
3360        Args:
3361            name (str): The name of the prompt to retrieve.
3362
3363        Keyword Args:
3364            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3365            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3366            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3367            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3368            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3369            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3370            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3371            fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
3372
3373        Returns:
3374            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
3375            - TextPromptClient, if type argument is 'text'.
3376            - ChatPromptClient, if type argument is 'chat'.
3377
3378        Raises:
3379            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3380            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3381        """
3382        if self._resources is None:
3383            raise Error(
3384                "SDK is not correctly initialized. Check the init logs for more details."
3385            )
3386        if version is not None and label is not None:
3387            raise ValueError("Cannot specify both version and label at the same time.")
3388
3389        if not name:
3390            raise ValueError("Prompt name cannot be empty.")
3391
3392        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3393        bounded_max_retries = self._get_bounded_max_retries(
3394            max_retries, default_max_retries=2, max_retries_upper_bound=4
3395        )
3396
3397        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3398        cached_prompt = self._resources.prompt_cache.get(cache_key)
3399
3400        if cached_prompt is None or cache_ttl_seconds == 0:
3401            langfuse_logger.debug(
3402                f"Prompt '{cache_key}' not found in cache or caching disabled."
3403            )
3404            try:
3405                return self._fetch_prompt_and_update_cache(
3406                    name,
3407                    version=version,
3408                    label=label,
3409                    ttl_seconds=cache_ttl_seconds,
3410                    max_retries=bounded_max_retries,
3411                    fetch_timeout_seconds=fetch_timeout_seconds,
3412                )
3413            except Exception as e:
3414                if fallback:
3415                    langfuse_logger.warning(
3416                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3417                    )
3418
3419                    fallback_client_args: Dict[str, Any] = {
3420                        "name": name,
3421                        "prompt": fallback,
3422                        "type": type,
3423                        "version": version or 0,
3424                        "config": {},
3425                        "labels": [label] if label else [],
3426                        "tags": [],
3427                    }
3428
3429                    if type == "text":
3430                        return TextPromptClient(
3431                            prompt=Prompt_Text(**fallback_client_args),
3432                            is_fallback=True,
3433                        )
3434
3435                    if type == "chat":
3436                        return ChatPromptClient(
3437                            prompt=Prompt_Chat(**fallback_client_args),
3438                            is_fallback=True,
3439                        )
3440
3441                raise e
3442
3443        if cached_prompt.is_expired():
3444            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3445            try:
3446                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3447                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3448
3449                def refresh_task() -> None:
3450                    self._fetch_prompt_and_update_cache(
3451                        name,
3452                        version=version,
3453                        label=label,
3454                        ttl_seconds=cache_ttl_seconds,
3455                        max_retries=bounded_max_retries,
3456                        fetch_timeout_seconds=fetch_timeout_seconds,
3457                    )
3458
3459                self._resources.prompt_cache.add_refresh_prompt_task(
3460                    cache_key,
3461                    refresh_task,
3462                )
3463                langfuse_logger.debug(
3464                    f"Returning stale prompt '{cache_key}' from cache."
3465                )
3466                # return stale prompt
3467                return cached_prompt.value
3468
3469            except Exception as e:
3470                langfuse_logger.warning(
3471                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3472                )
3473                # creation of refresh prompt task failed, return stale prompt
3474                return cached_prompt.value
3475
3476        return cached_prompt.value

Get a prompt.

This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.

Arguments:
  • name (str): The name of the prompt to retrieve.
Keyword Args:

version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both. label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both. cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0. type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text". fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None. max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds. fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.

Returns:

The prompt object retrieved from the cache or directly fetched if not cached or expired of type

  • TextPromptClient, if type argument is 'text'.
  • ChatPromptClient, if type argument is 'chat'.
Raises:
  • Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
def create_prompt( self, *, name: str, prompt: Union[str, List[Union[langfuse.model.ChatMessageDict, langfuse.model.ChatMessageWithPlaceholdersDict_Message, langfuse.model.ChatMessageWithPlaceholdersDict_Placeholder]]], labels: List[str] = [], tags: Optional[List[str]] = None, type: Optional[Literal['chat', 'text']] = 'text', config: Optional[Any] = None, commit_message: Optional[str] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3578    def create_prompt(
3579        self,
3580        *,
3581        name: str,
3582        prompt: Union[
3583            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3584        ],
3585        labels: List[str] = [],
3586        tags: Optional[List[str]] = None,
3587        type: Optional[Literal["chat", "text"]] = "text",
3588        config: Optional[Any] = None,
3589        commit_message: Optional[str] = None,
3590    ) -> PromptClient:
3591        """Create a new prompt in Langfuse.
3592
3593        Keyword Args:
3594            name : The name of the prompt to be created.
3595            prompt : The content of the prompt to be created.
3596            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3597            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3598            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3599            config: Additional structured data to be saved with the prompt. Defaults to None.
3600            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3601            commit_message: Optional string describing the change.
3602
3603        Returns:
3604            TextPromptClient: The prompt if type argument is 'text'.
3605            ChatPromptClient: The prompt if type argument is 'chat'.
3606        """
3607        try:
3608            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3609
3610            if type == "chat":
3611                if not isinstance(prompt, list):
3612                    raise ValueError(
3613                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3614                    )
3615                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
3616                    CreateChatPromptRequest(
3617                        name=name,
3618                        prompt=cast(Any, prompt),
3619                        labels=labels,
3620                        tags=tags,
3621                        config=config or {},
3622                        commit_message=commit_message,
3623                        type=CreateChatPromptType.CHAT,
3624                    )
3625                )
3626                server_prompt = self.api.prompts.create(request=request)
3627
3628                if self._resources is not None:
3629                    self._resources.prompt_cache.invalidate(name)
3630
3631                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3632
3633            if not isinstance(prompt, str):
3634                raise ValueError("For 'text' type, 'prompt' must be a string.")
3635
3636            request = CreateTextPromptRequest(
3637                name=name,
3638                prompt=prompt,
3639                labels=labels,
3640                tags=tags,
3641                config=config or {},
3642                commit_message=commit_message,
3643            )
3644
3645            server_prompt = self.api.prompts.create(request=request)
3646
3647            if self._resources is not None:
3648                self._resources.prompt_cache.invalidate(name)
3649
3650            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3651
3652        except Error as e:
3653            handle_fern_exception(e)
3654            raise e

Create a new prompt in Langfuse.

Keyword Args:

name : The name of the prompt to be created. prompt : The content of the prompt to be created. is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead. labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label. tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt. config: Additional structured data to be saved with the prompt. Defaults to None. type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text". commit_message: Optional string describing the change.

Returns:

TextPromptClient: The prompt if type argument is 'text'. ChatPromptClient: The prompt if type argument is 'chat'.

def update_prompt(self, *, name: str, version: int, new_labels: List[str] = []) -> Any:
3656    def update_prompt(
3657        self,
3658        *,
3659        name: str,
3660        version: int,
3661        new_labels: List[str] = [],
3662    ) -> Any:
3663        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
3664
3665        Args:
3666            name (str): The name of the prompt to update.
3667            version (int): The version number of the prompt to update.
3668            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3669
3670        Returns:
3671            Prompt: The updated prompt from the Langfuse API.
3672
3673        """
3674        updated_prompt = self.api.prompt_version.update(
3675            name=self._url_encode(name),
3676            version=version,
3677            new_labels=new_labels,
3678        )
3679
3680        if self._resources is not None:
3681            self._resources.prompt_cache.invalidate(name)
3682
3683        return updated_prompt

Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.

Arguments:
  • name (str): The name of the prompt to update.
  • version (int): The version number of the prompt to update.
  • new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:

Prompt: The updated prompt from the Langfuse API.

def clear_prompt_cache(self) -> None:
3698    def clear_prompt_cache(self) -> None:
3699        """Clear the entire prompt cache, removing all cached prompts.
3700
3701        This method is useful when you want to force a complete refresh of all
3702        cached prompts, for example after major updates or when you need to
3703        ensure the latest versions are fetched from the server.
3704        """
3705        if self._resources is not None:
3706            self._resources.prompt_cache.clear()

Clear the entire prompt cache, removing all cached prompts.

This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.

def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 62def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 63    """Get or create a Langfuse client instance.
 64
 65    Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups,
 66    providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
 67
 68    Behavior:
 69    - Single project: Returns existing client or creates new one
 70    - Multi-project: Requires public_key to return specific client
 71    - No public_key in multi-project: Returns disabled client to prevent data leakage
 72
 73    The function uses a singleton pattern per public_key to conserve resources and maintain state.
 74
 75    Args:
 76        public_key (Optional[str]): Project identifier
 77            - With key: Returns client for that project
 78            - Without key: Returns single client or disabled client if multiple exist
 79
 80    Returns:
 81        Langfuse: Client instance in one of three states:
 82            1. Client for specified public_key
 83            2. Default client for single-project setup
 84            3. Disabled client when multiple projects exist without key
 85
 86    Security:
 87        Disables tracing when multiple projects exist without explicit key to prevent
 88        cross-project data leakage. Multi-project setups are experimental.
 89
 90    Example:
 91        ```python
 92        # Single project
 93        client = get_client()  # Default client
 94
 95        # In multi-project usage:
 96        client_a = get_client(public_key="project_a_key")  # Returns project A's client
 97        client_b = get_client(public_key="project_b_key")  # Returns project B's client
 98
 99        # Without specific key in multi-project setup:
100        client = get_client()  # Returns disabled client for safety
101        ```
102    """
103    with LangfuseResourceManager._lock:
104        active_instances = LangfuseResourceManager._instances
105
106        # If no explicit public_key provided, check execution context
107        if not public_key:
108            public_key = _current_public_key.get(None)
109
110        if not public_key:
111            if len(active_instances) == 0:
112                # No clients initialized yet, create default instance
113                return Langfuse()
114
115            if len(active_instances) == 1:
116                # Only one client exists, safe to use without specifying key
117                instance = list(active_instances.values())[0]
118
119                # Initialize with the credentials bound to the instance
120                # This is important if the original instance was instantiated
121                # via constructor arguments
122                return _create_client_from_instance(instance)
123
124            else:
125                # Multiple clients exist but no key specified - disable tracing
126                # to prevent cross-project data leakage
127                langfuse_logger.warning(
128                    "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
129                )
130                return Langfuse(
131                    tracing_enabled=False, public_key="fake", secret_key="fake"
132                )
133
134        else:
135            # Specific key provided, look up existing instance
136            target_instance: Optional[LangfuseResourceManager] = active_instances.get(
137                public_key, None
138            )
139
140            if target_instance is None:
141                # No instance found with this key - client not initialized properly
142                langfuse_logger.warning(
143                    f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function."
144                )
145                return Langfuse(
146                    tracing_enabled=False, public_key="fake", secret_key="fake"
147                )
148
149            # target_instance is guaranteed to be not None at this point
150            return _create_client_from_instance(target_instance, public_key)

Get or create a Langfuse client instance.

Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.

Behavior:

  • Single project: Returns existing client or creates new one
  • Multi-project: Requires public_key to return specific client
  • No public_key in multi-project: Returns disabled client to prevent data leakage

The function uses a singleton pattern per public_key to conserve resources and maintain state.

Arguments:
  • public_key (Optional[str]): Project identifier
    • With key: Returns client for that project
    • Without key: Returns single client or disabled client if multiple exist
Returns:

Langfuse: Client instance in one of three states: 1. Client for specified public_key 2. Default client for single-project setup 3. Disabled client when multiple projects exist without key

Security:

Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.

Example:
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
def observe( func: Optional[~F] = None, *, name: Optional[str] = None, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], NoneType] = None, capture_input: Optional[bool] = None, capture_output: Optional[bool] = None, transform_to_string: Optional[Callable[[Iterable], str]] = None) -> Union[~F, Callable[[~F], ~F]]:
 90    def observe(
 91        self,
 92        func: Optional[F] = None,
 93        *,
 94        name: Optional[str] = None,
 95        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
 96        capture_input: Optional[bool] = None,
 97        capture_output: Optional[bool] = None,
 98        transform_to_string: Optional[Callable[[Iterable], str]] = None,
 99    ) -> Union[F, Callable[[F], F]]:
100        """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
101
102        This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates
103        spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator
104        intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
105
106        Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application,
107        enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
108
109        Args:
110            func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
111            name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
112            as_type (Optional[Literal]): Set the observation type. Supported values:
113                    "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail".
114                    Observation types are highlighted in the Langfuse UI for filtering and visualization.
115                    The types "generation" and "embedding" create a span on which additional attributes such as model metrics
116                    can be set.
117
118        Returns:
119            Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
120
121        Example:
122            For general function tracing with automatic naming:
123            ```python
124            @observe()
125            def process_user_request(user_id, query):
126                # Function is automatically traced with name "process_user_request"
127                return get_response(query)
128            ```
129
130            For language model generation tracking:
131            ```python
132            @observe(name="answer-generation", as_type="generation")
133            async def generate_answer(query):
134                # Creates a generation-type span with extended LLM metrics
135                response = await openai.chat.completions.create(
136                    model="gpt-4",
137                    messages=[{"role": "user", "content": query}]
138                )
139                return response.choices[0].message.content
140            ```
141
142            For trace context propagation between functions:
143            ```python
144            @observe()
145            def main_process():
146                # Parent span is created
147                return sub_process()  # Child span automatically connected to parent
148
149            @observe()
150            def sub_process():
151                # Automatically becomes a child span of main_process
152                return "result"
153            ```
154
155        Raises:
156            Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
157
158        Notes:
159            - The decorator preserves the original function's signature, docstring, and return type.
160            - Proper parent-child relationships between spans are automatically maintained.
161            - Special keyword arguments can be passed to control tracing:
162              - langfuse_trace_id: Explicitly set the trace ID for this function call
163              - langfuse_parent_observation_id: Explicitly set the parent span ID
164              - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
165            - For async functions, the decorator returns an async function wrapper.
166            - For sync functions, the decorator returns a synchronous wrapper.
167        """
168        valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
169        if as_type is not None and as_type not in valid_types:
170            self._log.warning(
171                f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'."
172            )
173            as_type = "span"
174
175        function_io_capture_enabled = os.environ.get(
176            LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True"
177        ).lower() not in ("false", "0")
178
179        should_capture_input = (
180            capture_input if capture_input is not None else function_io_capture_enabled
181        )
182
183        should_capture_output = (
184            capture_output
185            if capture_output is not None
186            else function_io_capture_enabled
187        )
188
189        def decorator(func: F) -> F:
190            return (
191                self._async_observe(
192                    func,
193                    name=name,
194                    as_type=as_type,
195                    capture_input=should_capture_input,
196                    capture_output=should_capture_output,
197                    transform_to_string=transform_to_string,
198                )
199                if asyncio.iscoroutinefunction(func)
200                else self._sync_observe(
201                    func,
202                    name=name,
203                    as_type=as_type,
204                    capture_input=should_capture_input,
205                    capture_output=should_capture_output,
206                    transform_to_string=transform_to_string,
207                )
208            )
209
210        """Handle decorator with or without parentheses.
211
212        This logic enables the decorator to work both with and without parentheses:
213        - @observe - Python passes the function directly to the decorator
214        - @observe() - Python calls the decorator first, which must return a function decorator
215
216        When called without arguments (@observe), the func parameter contains the function to decorate,
217        so we directly apply the decorator to it. When called with parentheses (@observe()),
218        func is None, so we return the decorator function itself for Python to apply in the next step.
219        """
220        if func is None:
221            return decorator
222        else:
223            return decorator(func)

Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.

This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.

Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.

Arguments:
  • func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
  • name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
  • as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:

Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.

Example:

For general function tracing with automatic naming:

@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)

For language model generation tracking:

@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content

For trace context propagation between functions:

@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
Raises:
  • Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
  • The decorator preserves the original function's signature, docstring, and return type.
  • Proper parent-child relationships between spans are automatically maintained.
  • Special keyword arguments can be passed to control tracing:
    • langfuse_trace_id: Explicitly set the trace ID for this function call
    • langfuse_parent_observation_id: Explicitly set the parent span ID
    • langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
  • For async functions, the decorator returns an async function wrapper.
  • For sync functions, the decorator returns a synchronous wrapper.
def propagate_attributes( *, user_id: Optional[str] = None, session_id: Optional[str] = None, metadata: Optional[Dict[str, str]] = None, version: Optional[str] = None, tags: Optional[List[str]] = None, trace_name: Optional[str] = None, as_baggage: bool = False) -> opentelemetry.util._decorator._AgnosticContextManager[typing.Any]:
 76def propagate_attributes(
 77    *,
 78    user_id: Optional[str] = None,
 79    session_id: Optional[str] = None,
 80    metadata: Optional[Dict[str, str]] = None,
 81    version: Optional[str] = None,
 82    tags: Optional[List[str]] = None,
 83    trace_name: Optional[str] = None,
 84    as_baggage: bool = False,
 85) -> _AgnosticContextManager[Any]:
 86    """Propagate trace-level attributes to all spans created within this context.
 87
 88    This context manager sets attributes on the currently active span AND automatically
 89    propagates them to all new child spans created within the context. This is the
 90    recommended way to set trace-level attributes like user_id, session_id, and metadata
 91    dimensions that should be consistently applied across all observations in a trace.
 92
 93    **IMPORTANT**: Call this as early as possible within your trace/workflow. Only the
 94    currently active span and spans created after entering this context will have these
 95    attributes. Pre-existing spans will NOT be retroactively updated.
 96
 97    **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id,
 98    filtering by session_id) only include observations that have the attribute set.
 99    If you call `propagate_attributes` late in your workflow, earlier spans won't be
100    included in aggregations for that attribute.
101
102    Args:
103        user_id: User identifier to associate with all spans in this context.
104            Must be US-ASCII string, ≤200 characters. Use this to track which user
105            generated each trace and enable e.g. per-user cost/performance analysis.
106        session_id: Session identifier to associate with all spans in this context.
107            Must be US-ASCII string, ≤200 characters. Use this to group related traces
108            within a user session (e.g., a conversation thread, multi-turn interaction).
109        metadata: Additional key-value metadata to propagate to all spans.
110            - Keys and values must be US-ASCII strings
111            - All values must be ≤200 characters
112            - Use for dimensions like internal correlating identifiers
113            - AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
114        version: Version identifier for parts of your application that are independently versioned, e.g. agents
115        tags: List of tags to categorize the group of observations
116        trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters.
117            Use this to set a consistent trace name for all spans created within this context.
118        as_baggage: If True, propagates attributes using OpenTelemetry baggage for
119            cross-process/service propagation. **Security warning**: When enabled,
120            attribute values are added to HTTP headers on ALL outbound requests.
121            Only enable if values are safe to transmit via HTTP headers and you need
122            cross-service tracing. Default: False.
123
124    Returns:
125        Context manager that propagates attributes to all child spans.
126
127    Example:
128        Basic usage with user and session tracking:
129
130        ```python
131        from langfuse import Langfuse
132
133        langfuse = Langfuse()
134
135        # Set attributes early in the trace
136        with langfuse.start_as_current_observation(name="user_workflow") as span:
137            with langfuse.propagate_attributes(
138                user_id="user_123",
139                session_id="session_abc",
140                metadata={"experiment": "variant_a", "environment": "production"}
141            ):
142                # All spans created here will have user_id, session_id, and metadata
143                with langfuse.start_observation(name="llm_call") as llm_span:
144                    # This span inherits: user_id, session_id, experiment, environment
145                    ...
146
147                with langfuse.start_generation(name="completion") as gen:
148                    # This span also inherits all attributes
149                    ...
150        ```
151
152        Late propagation (anti-pattern):
153
154        ```python
155        with langfuse.start_as_current_observation(name="workflow") as span:
156            # These spans WON'T have user_id
157            early_span = langfuse.start_observation(name="early_work")
158            early_span.end()
159
160            # Set attributes in the middle
161            with langfuse.propagate_attributes(user_id="user_123"):
162                # Only spans created AFTER this point will have user_id
163                late_span = langfuse.start_observation(name="late_work")
164                late_span.end()
165
166            # Result: Aggregations by user_id will miss "early_work" span
167        ```
168
169        Cross-service propagation with baggage (advanced):
170
171        ```python
172        # Service A - originating service
173        with langfuse.start_as_current_observation(name="api_request"):
174            with langfuse.propagate_attributes(
175                user_id="user_123",
176                session_id="session_abc",
177                as_baggage=True  # Propagate via HTTP headers
178            ):
179                # Make HTTP request to Service B
180                response = requests.get("https://service-b.example.com/api")
181                # user_id and session_id are now in HTTP headers
182
183        # Service B - downstream service
184        # OpenTelemetry will automatically extract baggage from HTTP headers
185        # and propagate to spans in Service B
186        ```
187
188    Note:
189        - **Validation**: All attribute values (user_id, session_id, metadata values)
190          must be strings ≤200 characters. Invalid values will be dropped with a
191          warning logged. Ensure values meet constraints before calling.
192        - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood,
193          making it compatible with other OTel-instrumented libraries.
194
195    Raises:
196        No exceptions are raised. Invalid values are logged as warnings and dropped.
197    """
198    return _propagate_attributes(
199        user_id=user_id,
200        session_id=session_id,
201        metadata=metadata,
202        version=version,
203        tags=tags,
204        trace_name=trace_name,
205        as_baggage=as_baggage,
206    )

Propagate trace-level attributes to all spans created within this context.

This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, and metadata dimensions that should be consistently applied across all observations in a trace.

IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.

Why this matters: Langfuse aggregation queries (e.g., total cost by user_id, filtering by session_id) only include observations that have the attribute set. If you call propagate_attributes late in your workflow, earlier spans won't be included in aggregations for that attribute.

Arguments:
  • user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
  • session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
  • metadata: Additional key-value metadata to propagate to all spans.
    • Keys and values must be US-ASCII strings
    • All values must be ≤200 characters
    • Use for dimensions like internal correlating identifiers
    • AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
  • version: Version identifier for parts of your application that are independently versioned, e.g. agents
  • tags: List of tags to categorize the group of observations
  • trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
  • as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:

Context manager that propagates attributes to all child spans.

Example:

Basic usage with user and session tracking:

from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_observation(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        metadata={"experiment": "variant_a", "environment": "production"}
    ):
        # All spans created here will have user_id, session_id, and metadata
        with langfuse.start_observation(name="llm_call") as llm_span:
            # This span inherits: user_id, session_id, experiment, environment
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...

Late propagation (anti-pattern):

with langfuse.start_as_current_observation(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_observation(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_observation(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span

Cross-service propagation with baggage (advanced):

# Service A - originating service
with langfuse.start_as_current_observation(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id and session_id are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate to spans in Service B
Note:
  • Validation: All attribute values (user_id, session_id, metadata values) must be strings ≤200 characters. Invalid values will be dropped with a warning logged. Ensure values meet constraints before calling.
  • OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
  • No exceptions are raised. Invalid values are logged as warnings and dropped.
ObservationTypeLiteral = typing.Union[typing.Literal['generation', 'embedding'], typing.Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], typing.Literal['event']]
class LangfuseSpan(langfuse._client.span.LangfuseObservationWrapper):
1236class LangfuseSpan(LangfuseObservationWrapper):
1237    """Standard span implementation for general operations in Langfuse.
1238
1239    This class represents a general-purpose span that can be used to trace
1240    any operation in your application. It extends the base LangfuseObservationWrapper
1241    with specific methods for creating child spans, generations, and updating
1242    span-specific attributes. If possible, use a more specific type for
1243    better observability and insights.
1244    """
1245
1246    def __init__(
1247        self,
1248        *,
1249        otel_span: otel_trace_api.Span,
1250        langfuse_client: "Langfuse",
1251        input: Optional[Any] = None,
1252        output: Optional[Any] = None,
1253        metadata: Optional[Any] = None,
1254        environment: Optional[str] = None,
1255        version: Optional[str] = None,
1256        level: Optional[SpanLevel] = None,
1257        status_message: Optional[str] = None,
1258    ):
1259        """Initialize a new LangfuseSpan.
1260
1261        Args:
1262            otel_span: The OpenTelemetry span to wrap
1263            langfuse_client: Reference to the parent Langfuse client
1264            input: Input data for the span (any JSON-serializable object)
1265            output: Output data from the span (any JSON-serializable object)
1266            metadata: Additional metadata to associate with the span
1267            environment: The tracing environment
1268            version: Version identifier for the code or component
1269            level: Importance level of the span (info, warning, error)
1270            status_message: Optional status message for the span
1271        """
1272        super().__init__(
1273            otel_span=otel_span,
1274            as_type="span",
1275            langfuse_client=langfuse_client,
1276            input=input,
1277            output=output,
1278            metadata=metadata,
1279            environment=environment,
1280            version=version,
1281            level=level,
1282            status_message=status_message,
1283        )

Standard span implementation for general operations in Langfuse.

This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.

LangfuseSpan( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1246    def __init__(
1247        self,
1248        *,
1249        otel_span: otel_trace_api.Span,
1250        langfuse_client: "Langfuse",
1251        input: Optional[Any] = None,
1252        output: Optional[Any] = None,
1253        metadata: Optional[Any] = None,
1254        environment: Optional[str] = None,
1255        version: Optional[str] = None,
1256        level: Optional[SpanLevel] = None,
1257        status_message: Optional[str] = None,
1258    ):
1259        """Initialize a new LangfuseSpan.
1260
1261        Args:
1262            otel_span: The OpenTelemetry span to wrap
1263            langfuse_client: Reference to the parent Langfuse client
1264            input: Input data for the span (any JSON-serializable object)
1265            output: Output data from the span (any JSON-serializable object)
1266            metadata: Additional metadata to associate with the span
1267            environment: The tracing environment
1268            version: Version identifier for the code or component
1269            level: Importance level of the span (info, warning, error)
1270            status_message: Optional status message for the span
1271        """
1272        super().__init__(
1273            otel_span=otel_span,
1274            as_type="span",
1275            langfuse_client=langfuse_client,
1276            input=input,
1277            output=output,
1278            metadata=metadata,
1279            environment=environment,
1280            version=version,
1281            level=level,
1282            status_message=status_message,
1283        )

Initialize a new LangfuseSpan.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the span (any JSON-serializable object)
  • output: Output data from the span (any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • environment: The tracing environment
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
class LangfuseGeneration(langfuse._client.span.LangfuseObservationWrapper):
1286class LangfuseGeneration(LangfuseObservationWrapper):
1287    """Specialized span implementation for AI model generations in Langfuse.
1288
1289    This class represents a generation span specifically designed for tracking
1290    AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized
1291    attributes for model details, token usage, and costs.
1292    """
1293
1294    def __init__(
1295        self,
1296        *,
1297        otel_span: otel_trace_api.Span,
1298        langfuse_client: "Langfuse",
1299        input: Optional[Any] = None,
1300        output: Optional[Any] = None,
1301        metadata: Optional[Any] = None,
1302        environment: Optional[str] = None,
1303        version: Optional[str] = None,
1304        level: Optional[SpanLevel] = None,
1305        status_message: Optional[str] = None,
1306        completion_start_time: Optional[datetime] = None,
1307        model: Optional[str] = None,
1308        model_parameters: Optional[Dict[str, MapValue]] = None,
1309        usage_details: Optional[Dict[str, int]] = None,
1310        cost_details: Optional[Dict[str, float]] = None,
1311        prompt: Optional[PromptClient] = None,
1312    ):
1313        """Initialize a new LangfuseGeneration span.
1314
1315        Args:
1316            otel_span: The OpenTelemetry span to wrap
1317            langfuse_client: Reference to the parent Langfuse client
1318            input: Input data for the generation (e.g., prompts)
1319            output: Output from the generation (e.g., completions)
1320            metadata: Additional metadata to associate with the generation
1321            environment: The tracing environment
1322            version: Version identifier for the model or component
1323            level: Importance level of the generation (info, warning, error)
1324            status_message: Optional status message for the generation
1325            completion_start_time: When the model started generating the response
1326            model: Name/identifier of the AI model used (e.g., "gpt-4")
1327            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1328            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1329            cost_details: Cost information for the model call
1330            prompt: Associated prompt template from Langfuse prompt management
1331        """
1332        super().__init__(
1333            as_type="generation",
1334            otel_span=otel_span,
1335            langfuse_client=langfuse_client,
1336            input=input,
1337            output=output,
1338            metadata=metadata,
1339            environment=environment,
1340            version=version,
1341            level=level,
1342            status_message=status_message,
1343            completion_start_time=completion_start_time,
1344            model=model,
1345            model_parameters=model_parameters,
1346            usage_details=usage_details,
1347            cost_details=cost_details,
1348            prompt=prompt,
1349        )

Specialized span implementation for AI model generations in Langfuse.

This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.

LangfuseGeneration( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None)
1294    def __init__(
1295        self,
1296        *,
1297        otel_span: otel_trace_api.Span,
1298        langfuse_client: "Langfuse",
1299        input: Optional[Any] = None,
1300        output: Optional[Any] = None,
1301        metadata: Optional[Any] = None,
1302        environment: Optional[str] = None,
1303        version: Optional[str] = None,
1304        level: Optional[SpanLevel] = None,
1305        status_message: Optional[str] = None,
1306        completion_start_time: Optional[datetime] = None,
1307        model: Optional[str] = None,
1308        model_parameters: Optional[Dict[str, MapValue]] = None,
1309        usage_details: Optional[Dict[str, int]] = None,
1310        cost_details: Optional[Dict[str, float]] = None,
1311        prompt: Optional[PromptClient] = None,
1312    ):
1313        """Initialize a new LangfuseGeneration span.
1314
1315        Args:
1316            otel_span: The OpenTelemetry span to wrap
1317            langfuse_client: Reference to the parent Langfuse client
1318            input: Input data for the generation (e.g., prompts)
1319            output: Output from the generation (e.g., completions)
1320            metadata: Additional metadata to associate with the generation
1321            environment: The tracing environment
1322            version: Version identifier for the model or component
1323            level: Importance level of the generation (info, warning, error)
1324            status_message: Optional status message for the generation
1325            completion_start_time: When the model started generating the response
1326            model: Name/identifier of the AI model used (e.g., "gpt-4")
1327            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1328            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1329            cost_details: Cost information for the model call
1330            prompt: Associated prompt template from Langfuse prompt management
1331        """
1332        super().__init__(
1333            as_type="generation",
1334            otel_span=otel_span,
1335            langfuse_client=langfuse_client,
1336            input=input,
1337            output=output,
1338            metadata=metadata,
1339            environment=environment,
1340            version=version,
1341            level=level,
1342            status_message=status_message,
1343            completion_start_time=completion_start_time,
1344            model=model,
1345            model_parameters=model_parameters,
1346            usage_details=usage_details,
1347            cost_details=cost_details,
1348            prompt=prompt,
1349        )

Initialize a new LangfuseGeneration span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the generation (e.g., prompts)
  • output: Output from the generation (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • environment: The tracing environment
  • version: Version identifier for the model or component
  • level: Importance level of the generation (info, warning, error)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
class LangfuseEvent(langfuse._client.span.LangfuseObservationWrapper):
1352class LangfuseEvent(LangfuseObservationWrapper):
1353    """Specialized span implementation for Langfuse Events."""
1354
1355    def __init__(
1356        self,
1357        *,
1358        otel_span: otel_trace_api.Span,
1359        langfuse_client: "Langfuse",
1360        input: Optional[Any] = None,
1361        output: Optional[Any] = None,
1362        metadata: Optional[Any] = None,
1363        environment: Optional[str] = None,
1364        version: Optional[str] = None,
1365        level: Optional[SpanLevel] = None,
1366        status_message: Optional[str] = None,
1367    ):
1368        """Initialize a new LangfuseEvent span.
1369
1370        Args:
1371            otel_span: The OpenTelemetry span to wrap
1372            langfuse_client: Reference to the parent Langfuse client
1373            input: Input data for the event
1374            output: Output from the event
1375            metadata: Additional metadata to associate with the event
1376            environment: The tracing environment
1377            version: Version identifier for the code or component
1378            level: Importance level of the event (info, warning, error)
1379            status_message: Optional status message for the event
1380        """
1381        super().__init__(
1382            otel_span=otel_span,
1383            as_type="event",
1384            langfuse_client=langfuse_client,
1385            input=input,
1386            output=output,
1387            metadata=metadata,
1388            environment=environment,
1389            version=version,
1390            level=level,
1391            status_message=status_message,
1392        )
1393
1394    def update(
1395        self,
1396        *,
1397        name: Optional[str] = None,
1398        input: Optional[Any] = None,
1399        output: Optional[Any] = None,
1400        metadata: Optional[Any] = None,
1401        version: Optional[str] = None,
1402        level: Optional[SpanLevel] = None,
1403        status_message: Optional[str] = None,
1404        completion_start_time: Optional[datetime] = None,
1405        model: Optional[str] = None,
1406        model_parameters: Optional[Dict[str, MapValue]] = None,
1407        usage_details: Optional[Dict[str, int]] = None,
1408        cost_details: Optional[Dict[str, float]] = None,
1409        prompt: Optional[PromptClient] = None,
1410        **kwargs: Any,
1411    ) -> "LangfuseEvent":
1412        """Update is not allowed for LangfuseEvent because events cannot be updated.
1413
1414        This method logs a warning and returns self without making changes.
1415
1416        Returns:
1417            self: Returns the unchanged LangfuseEvent instance
1418        """
1419        langfuse_logger.warning(
1420            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1421        )
1422        return self

Specialized span implementation for Langfuse Events.

LangfuseEvent( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1355    def __init__(
1356        self,
1357        *,
1358        otel_span: otel_trace_api.Span,
1359        langfuse_client: "Langfuse",
1360        input: Optional[Any] = None,
1361        output: Optional[Any] = None,
1362        metadata: Optional[Any] = None,
1363        environment: Optional[str] = None,
1364        version: Optional[str] = None,
1365        level: Optional[SpanLevel] = None,
1366        status_message: Optional[str] = None,
1367    ):
1368        """Initialize a new LangfuseEvent span.
1369
1370        Args:
1371            otel_span: The OpenTelemetry span to wrap
1372            langfuse_client: Reference to the parent Langfuse client
1373            input: Input data for the event
1374            output: Output from the event
1375            metadata: Additional metadata to associate with the event
1376            environment: The tracing environment
1377            version: Version identifier for the code or component
1378            level: Importance level of the event (info, warning, error)
1379            status_message: Optional status message for the event
1380        """
1381        super().__init__(
1382            otel_span=otel_span,
1383            as_type="event",
1384            langfuse_client=langfuse_client,
1385            input=input,
1386            output=output,
1387            metadata=metadata,
1388            environment=environment,
1389            version=version,
1390            level=level,
1391            status_message=status_message,
1392        )

Initialize a new LangfuseEvent span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the event
  • output: Output from the event
  • metadata: Additional metadata to associate with the event
  • environment: The tracing environment
  • version: Version identifier for the code or component
  • level: Importance level of the event (info, warning, error)
  • status_message: Optional status message for the event
def update( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, **kwargs: Any) -> LangfuseEvent:
1394    def update(
1395        self,
1396        *,
1397        name: Optional[str] = None,
1398        input: Optional[Any] = None,
1399        output: Optional[Any] = None,
1400        metadata: Optional[Any] = None,
1401        version: Optional[str] = None,
1402        level: Optional[SpanLevel] = None,
1403        status_message: Optional[str] = None,
1404        completion_start_time: Optional[datetime] = None,
1405        model: Optional[str] = None,
1406        model_parameters: Optional[Dict[str, MapValue]] = None,
1407        usage_details: Optional[Dict[str, int]] = None,
1408        cost_details: Optional[Dict[str, float]] = None,
1409        prompt: Optional[PromptClient] = None,
1410        **kwargs: Any,
1411    ) -> "LangfuseEvent":
1412        """Update is not allowed for LangfuseEvent because events cannot be updated.
1413
1414        This method logs a warning and returns self without making changes.
1415
1416        Returns:
1417            self: Returns the unchanged LangfuseEvent instance
1418        """
1419        langfuse_logger.warning(
1420            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1421        )
1422        return self

Update is not allowed for LangfuseEvent because events cannot be updated.

This method logs a warning and returns self without making changes.

Returns:

self: Returns the unchanged LangfuseEvent instance

class LangfuseOtelSpanAttributes:
28class LangfuseOtelSpanAttributes:
29    # Langfuse-Trace attributes
30    TRACE_NAME = "langfuse.trace.name"
31    TRACE_USER_ID = "user.id"
32    TRACE_SESSION_ID = "session.id"
33    TRACE_TAGS = "langfuse.trace.tags"
34    TRACE_PUBLIC = "langfuse.trace.public"
35    TRACE_METADATA = "langfuse.trace.metadata"
36    TRACE_INPUT = "langfuse.trace.input"
37    TRACE_OUTPUT = "langfuse.trace.output"
38
39    # Langfuse-observation attributes
40    OBSERVATION_TYPE = "langfuse.observation.type"
41    OBSERVATION_METADATA = "langfuse.observation.metadata"
42    OBSERVATION_LEVEL = "langfuse.observation.level"
43    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
44    OBSERVATION_INPUT = "langfuse.observation.input"
45    OBSERVATION_OUTPUT = "langfuse.observation.output"
46
47    # Langfuse-observation of type Generation attributes
48    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
49    OBSERVATION_MODEL = "langfuse.observation.model.name"
50    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
51    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
52    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
53    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
54    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"
55
56    # General
57    ENVIRONMENT = "langfuse.environment"
58    RELEASE = "langfuse.release"
59    VERSION = "langfuse.version"
60
61    # Internal
62    AS_ROOT = "langfuse.internal.as_root"
63
64    # Experiments
65    EXPERIMENT_ID = "langfuse.experiment.id"
66    EXPERIMENT_NAME = "langfuse.experiment.name"
67    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
68    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
69    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
70    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
71    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
72    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
73    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
TRACE_NAME = 'langfuse.trace.name'
TRACE_USER_ID = 'user.id'
TRACE_SESSION_ID = 'session.id'
TRACE_TAGS = 'langfuse.trace.tags'
TRACE_PUBLIC = 'langfuse.trace.public'
TRACE_METADATA = 'langfuse.trace.metadata'
TRACE_INPUT = 'langfuse.trace.input'
TRACE_OUTPUT = 'langfuse.trace.output'
OBSERVATION_TYPE = 'langfuse.observation.type'
OBSERVATION_METADATA = 'langfuse.observation.metadata'
OBSERVATION_LEVEL = 'langfuse.observation.level'
OBSERVATION_STATUS_MESSAGE = 'langfuse.observation.status_message'
OBSERVATION_INPUT = 'langfuse.observation.input'
OBSERVATION_OUTPUT = 'langfuse.observation.output'
OBSERVATION_COMPLETION_START_TIME = 'langfuse.observation.completion_start_time'
OBSERVATION_MODEL = 'langfuse.observation.model.name'
OBSERVATION_MODEL_PARAMETERS = 'langfuse.observation.model.parameters'
OBSERVATION_USAGE_DETAILS = 'langfuse.observation.usage_details'
OBSERVATION_COST_DETAILS = 'langfuse.observation.cost_details'
OBSERVATION_PROMPT_NAME = 'langfuse.observation.prompt.name'
OBSERVATION_PROMPT_VERSION = 'langfuse.observation.prompt.version'
ENVIRONMENT = 'langfuse.environment'
RELEASE = 'langfuse.release'
VERSION = 'langfuse.version'
AS_ROOT = 'langfuse.internal.as_root'
EXPERIMENT_ID = 'langfuse.experiment.id'
EXPERIMENT_NAME = 'langfuse.experiment.name'
EXPERIMENT_DESCRIPTION = 'langfuse.experiment.description'
EXPERIMENT_METADATA = 'langfuse.experiment.metadata'
EXPERIMENT_DATASET_ID = 'langfuse.experiment.dataset.id'
EXPERIMENT_ITEM_ID = 'langfuse.experiment.item.id'
EXPERIMENT_ITEM_EXPECTED_OUTPUT = 'langfuse.experiment.item.expected_output'
EXPERIMENT_ITEM_METADATA = 'langfuse.experiment.item.metadata'
EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = 'langfuse.experiment.item.root_observation_id'
class LangfuseAgent(langfuse._client.span.LangfuseObservationWrapper):
1425class LangfuseAgent(LangfuseObservationWrapper):
1426    """Agent observation for reasoning blocks that act on tools using LLM guidance."""
1427
1428    def __init__(self, **kwargs: Any) -> None:
1429        """Initialize a new LangfuseAgent span."""
1430        kwargs["as_type"] = "agent"
1431        super().__init__(**kwargs)

Agent observation for reasoning blocks that act on tools using LLM guidance.

LangfuseAgent(**kwargs: Any)
1428    def __init__(self, **kwargs: Any) -> None:
1429        """Initialize a new LangfuseAgent span."""
1430        kwargs["as_type"] = "agent"
1431        super().__init__(**kwargs)

Initialize a new LangfuseAgent span.

class LangfuseTool(langfuse._client.span.LangfuseObservationWrapper):
1434class LangfuseTool(LangfuseObservationWrapper):
1435    """Tool observation representing external tool calls, e.g., calling a weather API."""
1436
1437    def __init__(self, **kwargs: Any) -> None:
1438        """Initialize a new LangfuseTool span."""
1439        kwargs["as_type"] = "tool"
1440        super().__init__(**kwargs)

Tool observation representing external tool calls, e.g., calling a weather API.

LangfuseTool(**kwargs: Any)
1437    def __init__(self, **kwargs: Any) -> None:
1438        """Initialize a new LangfuseTool span."""
1439        kwargs["as_type"] = "tool"
1440        super().__init__(**kwargs)

Initialize a new LangfuseTool span.

class LangfuseChain(langfuse._client.span.LangfuseObservationWrapper):
1443class LangfuseChain(LangfuseObservationWrapper):
1444    """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM."""
1445
1446    def __init__(self, **kwargs: Any) -> None:
1447        """Initialize a new LangfuseChain span."""
1448        kwargs["as_type"] = "chain"
1449        super().__init__(**kwargs)

Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.

LangfuseChain(**kwargs: Any)
1446    def __init__(self, **kwargs: Any) -> None:
1447        """Initialize a new LangfuseChain span."""
1448        kwargs["as_type"] = "chain"
1449        super().__init__(**kwargs)

Initialize a new LangfuseChain span.

class LangfuseEmbedding(langfuse._client.span.LangfuseObservationWrapper):
1461class LangfuseEmbedding(LangfuseObservationWrapper):
1462    """Embedding observation for LLM embedding calls, typically used before retrieval."""
1463
1464    def __init__(self, **kwargs: Any) -> None:
1465        """Initialize a new LangfuseEmbedding span."""
1466        kwargs["as_type"] = "embedding"
1467        super().__init__(**kwargs)

Embedding observation for LLM embedding calls, typically used before retrieval.

LangfuseEmbedding(**kwargs: Any)
1464    def __init__(self, **kwargs: Any) -> None:
1465        """Initialize a new LangfuseEmbedding span."""
1466        kwargs["as_type"] = "embedding"
1467        super().__init__(**kwargs)

Initialize a new LangfuseEmbedding span.

class LangfuseEvaluator(langfuse._client.span.LangfuseObservationWrapper):
1470class LangfuseEvaluator(LangfuseObservationWrapper):
1471    """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs."""
1472
1473    def __init__(self, **kwargs: Any) -> None:
1474        """Initialize a new LangfuseEvaluator span."""
1475        kwargs["as_type"] = "evaluator"
1476        super().__init__(**kwargs)

Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.

LangfuseEvaluator(**kwargs: Any)
1473    def __init__(self, **kwargs: Any) -> None:
1474        """Initialize a new LangfuseEvaluator span."""
1475        kwargs["as_type"] = "evaluator"
1476        super().__init__(**kwargs)

Initialize a new LangfuseEvaluator span.

class LangfuseRetriever(langfuse._client.span.LangfuseObservationWrapper):
1452class LangfuseRetriever(LangfuseObservationWrapper):
1453    """Retriever observation for data retrieval steps, e.g. vector store or database queries."""
1454
1455    def __init__(self, **kwargs: Any) -> None:
1456        """Initialize a new LangfuseRetriever span."""
1457        kwargs["as_type"] = "retriever"
1458        super().__init__(**kwargs)

Retriever observation for data retrieval steps, e.g. vector store or database queries.

LangfuseRetriever(**kwargs: Any)
1455    def __init__(self, **kwargs: Any) -> None:
1456        """Initialize a new LangfuseRetriever span."""
1457        kwargs["as_type"] = "retriever"
1458        super().__init__(**kwargs)

Initialize a new LangfuseRetriever span.

class LangfuseGuardrail(langfuse._client.span.LangfuseObservationWrapper):
1479class LangfuseGuardrail(LangfuseObservationWrapper):
1480    """Guardrail observation for protection e.g. against jailbreaks or offensive content."""
1481
1482    def __init__(self, **kwargs: Any) -> None:
1483        """Initialize a new LangfuseGuardrail span."""
1484        kwargs["as_type"] = "guardrail"
1485        super().__init__(**kwargs)

Guardrail observation for protection e.g. against jailbreaks or offensive content.

LangfuseGuardrail(**kwargs: Any)
1482    def __init__(self, **kwargs: Any) -> None:
1483        """Initialize a new LangfuseGuardrail span."""
1484        kwargs["as_type"] = "guardrail"
1485        super().__init__(**kwargs)

Initialize a new LangfuseGuardrail span.

class Evaluation:
 93class Evaluation:
 94    """Represents an evaluation result for an experiment item or an entire experiment run.
 95
 96    This class provides a strongly-typed way to create evaluation results in evaluator functions.
 97    Users must use keyword arguments when instantiating this class.
 98
 99    Attributes:
100        name: Unique identifier for the evaluation metric. Should be descriptive
101            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
102            Used for aggregation and comparison across experiment runs.
103        value: The evaluation score or result. Can be:
104            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
105            - String: For categorical results like "positive", "negative", "neutral"
106            - Boolean: For binary assessments like "passes_safety_check"
107        comment: Optional human-readable explanation of the evaluation result.
108            Useful for providing context, explaining scoring rationale, or noting
109            special conditions. Displayed in Langfuse UI for interpretability.
110        metadata: Optional structured metadata about the evaluation process.
111            Can include confidence scores, intermediate calculations, model versions,
112            or any other relevant technical details.
113        data_type: Optional score data type. Required if value is not NUMERIC.
114            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
115        config_id: Optional Langfuse score config ID.
116
117    Examples:
118        Basic accuracy evaluation:
119        ```python
120        from langfuse import Evaluation
121
122        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
123            if not expected_output:
124                return Evaluation(name="accuracy", value=0, comment="No expected output")
125
126            is_correct = output.strip().lower() == expected_output.strip().lower()
127            return Evaluation(
128                name="accuracy",
129                value=1.0 if is_correct else 0.0,
130                comment="Correct answer" if is_correct else "Incorrect answer"
131            )
132        ```
133
134        Multi-metric evaluator:
135        ```python
136        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
137            return [
138                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
139                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
140                Evaluation(
141                    name="quality",
142                    value=0.85,
143                    comment="High quality response",
144                    metadata={"confidence": 0.92, "model": "gpt-4"}
145                )
146            ]
147        ```
148
149        Categorical evaluation:
150        ```python
151        def sentiment_evaluator(*, input, output, **kwargs):
152            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
153            return Evaluation(
154                name="sentiment",
155                value=sentiment,
156                comment=f"Response expresses {sentiment} sentiment",
157                data_type="CATEGORICAL"
158            )
159        ```
160
161        Failed evaluation with error handling:
162        ```python
163        def external_api_evaluator(*, input, output, **kwargs):
164            try:
165                score = external_api.evaluate(output)
166                return Evaluation(name="external_score", value=score)
167            except Exception as e:
168                return Evaluation(
169                    name="external_score",
170                    value=0,
171                    comment=f"API unavailable: {e}",
172                    metadata={"error": str(e), "retry_count": 3}
173                )
174        ```
175
176    Note:
177        All arguments must be passed as keywords. Positional arguments are not allowed
178        to ensure code clarity and prevent errors from argument reordering.
179    """
180
181    def __init__(
182        self,
183        *,
184        name: str,
185        value: Union[int, float, str, bool],
186        comment: Optional[str] = None,
187        metadata: Optional[Dict[str, Any]] = None,
188        data_type: Optional[ScoreDataType] = None,
189        config_id: Optional[str] = None,
190    ):
191        """Initialize an Evaluation with the provided data.
192
193        Args:
194            name: Unique identifier for the evaluation metric.
195            value: The evaluation score or result.
196            comment: Optional human-readable explanation of the result.
197            metadata: Optional structured metadata about the evaluation process.
198            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
199            config_id: Optional Langfuse score config ID.
200
201        Note:
202            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
203        """
204        self.name = name
205        self.value = value
206        self.comment = comment
207        self.metadata = metadata
208        self.data_type = data_type
209        self.config_id = config_id

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=0, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=0,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[langfuse.api.ScoreDataType] = None, config_id: Optional[str] = None)
181    def __init__(
182        self,
183        *,
184        name: str,
185        value: Union[int, float, str, bool],
186        comment: Optional[str] = None,
187        metadata: Optional[Dict[str, Any]] = None,
188        data_type: Optional[ScoreDataType] = None,
189        config_id: Optional[str] = None,
190    ):
191        """Initialize an Evaluation with the provided data.
192
193        Args:
194            name: Unique identifier for the evaluation metric.
195            value: The evaluation score or result.
196            comment: Optional human-readable explanation of the result.
197            metadata: Optional structured metadata about the evaluation process.
198            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
199            config_id: Optional Langfuse score config ID.
200
201        Note:
202            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
203        """
204        self.name = name
205        self.value = value
206        self.comment = comment
207        self.metadata = metadata
208        self.data_type = data_type
209        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
class EvaluatorInputs:
 40class EvaluatorInputs:
 41    """Input data structure for evaluators, returned by mapper functions.
 42
 43    This class provides a strongly-typed container for transforming API response
 44    objects (traces, observations) into the standardized format expected
 45    by evaluator functions. It ensures consistent access to input, output, expected
 46    output, and metadata regardless of the source entity type.
 47
 48    Attributes:
 49        input: The input data that was provided to generate the output being evaluated.
 50            For traces, this might be the initial prompt or request. For observations,
 51            this could be the span's input. The exact meaning depends on your use case.
 52        output: The actual output that was produced and needs to be evaluated.
 53            For traces, this is typically the final response. For observations,
 54            this might be the generation output or span result.
 55        expected_output: Optional ground truth or expected result for comparison.
 56            Used by evaluators to assess correctness. May be None if no ground truth
 57            is available for the entity being evaluated.
 58        metadata: Optional structured metadata providing additional context for evaluation.
 59            Can include information about the entity, execution context, user attributes,
 60            or any other relevant data that evaluators might use.
 61
 62    Examples:
 63        Simple mapper for traces:
 64        ```python
 65        from langfuse import EvaluatorInputs
 66
 67        def trace_mapper(trace):
 68            return EvaluatorInputs(
 69                input=trace.input,
 70                output=trace.output,
 71                expected_output=None,  # No ground truth available
 72                metadata={"user_id": trace.user_id, "tags": trace.tags}
 73            )
 74        ```
 75
 76        Mapper for observations extracting specific fields:
 77        ```python
 78        def observation_mapper(observation):
 79            # Extract input/output from observation's data
 80            input_data = observation.input if hasattr(observation, 'input') else None
 81            output_data = observation.output if hasattr(observation, 'output') else None
 82
 83            return EvaluatorInputs(
 84                input=input_data,
 85                output=output_data,
 86                expected_output=None,
 87                metadata={
 88                    "observation_type": observation.type,
 89                    "model": observation.model,
 90                    "latency_ms": observation.end_time - observation.start_time
 91                }
 92            )
 93        ```
 94
 95
 96    Note:
 97        All arguments must be passed as keywords when instantiating this class.
 98    """
 99
100    def __init__(
101        self,
102        *,
103        input: Any,
104        output: Any,
105        expected_output: Any = None,
106        metadata: Optional[Dict[str, Any]] = None,
107    ):
108        """Initialize EvaluatorInputs with the provided data.
109
110        Args:
111            input: The input data for evaluation.
112            output: The output data to be evaluated.
113            expected_output: Optional ground truth for comparison.
114            metadata: Optional additional context for evaluation.
115
116        Note:
117            All arguments must be provided as keywords.
118        """
119        self.input = input
120        self.output = output
121        self.expected_output = expected_output
122        self.metadata = metadata

Input data structure for evaluators, returned by mapper functions.

This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.

Attributes:
  • input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
  • output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
  • expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
  • metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:

Simple mapper for traces:

from langfuse import EvaluatorInputs

def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,  # No ground truth available
        metadata={"user_id": trace.user_id, "tags": trace.tags}
    )

Mapper for observations extracting specific fields:

def observation_mapper(observation):
    # Extract input/output from observation's data
    input_data = observation.input if hasattr(observation, 'input') else None
    output_data = observation.output if hasattr(observation, 'output') else None

    return EvaluatorInputs(
        input=input_data,
        output=output_data,
        expected_output=None,
        metadata={
            "observation_type": observation.type,
            "model": observation.model,
            "latency_ms": observation.end_time - observation.start_time
        }
    )


Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorInputs( *, input: Any, output: Any, expected_output: Any = None, metadata: Optional[Dict[str, Any]] = None)
100    def __init__(
101        self,
102        *,
103        input: Any,
104        output: Any,
105        expected_output: Any = None,
106        metadata: Optional[Dict[str, Any]] = None,
107    ):
108        """Initialize EvaluatorInputs with the provided data.
109
110        Args:
111            input: The input data for evaluation.
112            output: The output data to be evaluated.
113            expected_output: Optional ground truth for comparison.
114            metadata: Optional additional context for evaluation.
115
116        Note:
117            All arguments must be provided as keywords.
118        """
119        self.input = input
120        self.output = output
121        self.expected_output = expected_output
122        self.metadata = metadata

Initialize EvaluatorInputs with the provided data.

Arguments:
  • input: The input data for evaluation.
  • output: The output data to be evaluated.
  • expected_output: Optional ground truth for comparison.
  • metadata: Optional additional context for evaluation.
Note:

All arguments must be provided as keywords.

input
output
expected_output
metadata
class MapperFunction(typing.Protocol):
125class MapperFunction(Protocol):
126    """Protocol defining the interface for mapper functions in batch evaluation.
127
128    Mapper functions transform API response objects (traces or observations)
129    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
130    allows you to define how to extract and structure evaluation data from different
131    entity types.
132
133    Mapper functions must:
134    - Accept a single item parameter (trace, observation)
135    - Return an EvaluatorInputs instance with input, output, expected_output, metadata
136    - Can be either synchronous or asynchronous
137    - Should handle missing or malformed data gracefully
138    """
139
140    def __call__(
141        self,
142        *,
143        item: Union["TraceWithFullDetails", "ObservationsView"],
144        **kwargs: Dict[str, Any],
145    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
146        """Transform an API response object into evaluator inputs.
147
148        This method defines how to extract evaluation-relevant data from the raw
149        API response object. The implementation should map entity-specific fields
150        to the standardized input/output/expected_output/metadata structure.
151
152        Args:
153            item: The API response object to transform. The type depends on the scope:
154                - TraceWithFullDetails: When evaluating traces
155                - ObservationsView: When evaluating observations
156
157        Returns:
158            EvaluatorInputs: A structured container with:
159                - input: The input data that generated the output
160                - output: The output to be evaluated
161                - expected_output: Optional ground truth for comparison
162                - metadata: Optional additional context
163
164            Can return either a direct EvaluatorInputs instance or an awaitable
165            (for async mappers that need to fetch additional data).
166
167        Examples:
168            Basic trace mapper:
169            ```python
170            def map_trace(trace):
171                return EvaluatorInputs(
172                    input=trace.input,
173                    output=trace.output,
174                    expected_output=None,
175                    metadata={"trace_id": trace.id, "user": trace.user_id}
176                )
177            ```
178
179            Observation mapper with conditional logic:
180            ```python
181            def map_observation(observation):
182                # Extract fields based on observation type
183                if observation.type == "GENERATION":
184                    input_data = observation.input
185                    output_data = observation.output
186                else:
187                    # For other types, use different fields
188                    input_data = observation.metadata.get("input")
189                    output_data = observation.metadata.get("output")
190
191                return EvaluatorInputs(
192                    input=input_data,
193                    output=output_data,
194                    expected_output=None,
195                    metadata={"obs_id": observation.id, "type": observation.type}
196                )
197            ```
198
199            Async mapper (if additional processing needed):
200            ```python
201            async def map_trace_async(trace):
202                # Could do async processing here if needed
203                processed_output = await some_async_transformation(trace.output)
204
205                return EvaluatorInputs(
206                    input=trace.input,
207                    output=processed_output,
208                    expected_output=None,
209                    metadata={"trace_id": trace.id}
210                )
211            ```
212        """
213        ...

Protocol defining the interface for mapper functions in batch evaluation.

Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.

Mapper functions must:

  • Accept a single item parameter (trace, observation)
  • Return an EvaluatorInputs instance with input, output, expected_output, metadata
  • Can be either synchronous or asynchronous
  • Should handle missing or malformed data gracefully
MapperFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class CompositeEvaluatorFunction(typing.Protocol):
216class CompositeEvaluatorFunction(Protocol):
217    """Protocol defining the interface for composite evaluator functions.
218
219    Composite evaluators create aggregate scores from multiple item-level evaluations.
220    This is commonly used to compute weighted averages, combined metrics, or other
221    composite assessments based on individual evaluation results.
222
223    Composite evaluators:
224    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
225      plus the list of evaluations
226    - Return either a single Evaluation, a list of Evaluations, or a dict
227    - Can be either synchronous or asynchronous
228    - Have access to both raw item data and evaluation results
229    """
230
231    def __call__(
232        self,
233        *,
234        input: Optional[Any] = None,
235        output: Optional[Any] = None,
236        expected_output: Optional[Any] = None,
237        metadata: Optional[Dict[str, Any]] = None,
238        evaluations: List[Evaluation],
239        **kwargs: Dict[str, Any],
240    ) -> Union[
241        Evaluation,
242        List[Evaluation],
243        Dict[str, Any],
244        Awaitable[Evaluation],
245        Awaitable[List[Evaluation]],
246        Awaitable[Dict[str, Any]],
247    ]:
248        r"""Create a composite evaluation from item-level evaluation results.
249
250        This method combines multiple evaluation scores into a single composite metric.
251        Common use cases include weighted averages, pass/fail decisions based on multiple
252        criteria, or custom scoring logic that considers multiple dimensions.
253
254        Args:
255            input: The input data that was provided to the system being evaluated.
256            output: The output generated by the system being evaluated.
257            expected_output: The expected/reference output for comparison (if available).
258            metadata: Additional metadata about the evaluation context.
259            evaluations: List of evaluation results from item-level evaluators.
260                Each evaluation contains name, value, comment, and metadata.
261
262        Returns:
263            Can return any of:
264            - Evaluation: A single composite evaluation result
265            - List[Evaluation]: Multiple composite evaluations
266            - Dict: A dict that will be converted to an Evaluation
267                - name: Identifier for the composite metric (e.g., "composite_score")
268                - value: The computed composite value
269                - comment: Optional explanation of how the score was computed
270                - metadata: Optional details about the composition logic
271
272            Can return either a direct Evaluation instance or an awaitable
273            (for async composite evaluators).
274
275        Examples:
276            Simple weighted average:
277            ```python
278            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
279                weights = {
280                    "accuracy": 0.5,
281                    "relevance": 0.3,
282                    "safety": 0.2
283                }
284
285                total_score = 0.0
286                total_weight = 0.0
287
288                for eval in evaluations:
289                    if eval.name in weights and isinstance(eval.value, (int, float)):
290                        total_score += eval.value * weights[eval.name]
291                        total_weight += weights[eval.name]
292
293                final_score = total_score / total_weight if total_weight > 0 else 0.0
294
295                return Evaluation(
296                    name="composite_score",
297                    value=final_score,
298                    comment=f"Weighted average of {len(evaluations)} metrics"
299                )
300            ```
301
302            Pass/fail composite based on thresholds:
303            ```python
304            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
305                # Must pass all criteria
306                thresholds = {
307                    "accuracy": 0.7,
308                    "safety": 0.9,
309                    "relevance": 0.6
310                }
311
312                passes = True
313                failing_metrics = []
314
315                for metric, threshold in thresholds.items():
316                    eval_result = next((e for e in evaluations if e.name == metric), None)
317                    if eval_result and isinstance(eval_result.value, (int, float)):
318                        if eval_result.value < threshold:
319                            passes = False
320                            failing_metrics.append(metric)
321
322                return Evaluation(
323                    name="passes_all_checks",
324                    value=passes,
325                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
326                    data_type="BOOLEAN"
327                )
328            ```
329
330            Async composite with external scoring:
331            ```python
332            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
333                # Use LLM to synthesize multiple evaluation results
334                eval_summary = "\n".join(
335                    f"- {e.name}: {e.value}" for e in evaluations
336                )
337
338                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
339                prompt += f"For the output: {output}\n"
340                prompt += "Provide an overall quality score from 0-1."
341
342                response = await openai.chat.completions.create(
343                    model="gpt-4",
344                    messages=[{"role": "user", "content": prompt}]
345                )
346
347                score = float(response.choices[0].message.content.strip())
348
349                return Evaluation(
350                    name="llm_composite_score",
351                    value=score,
352                    comment="LLM-synthesized composite score"
353                )
354            ```
355
356            Context-aware composite:
357            ```python
358            def context_composite(*, input, output, expected_output, metadata, evaluations):
359                # Adjust weighting based on metadata
360                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}
361
362                # If metadata indicates high importance, prioritize accuracy
363                if metadata and metadata.get('importance') == 'high':
364                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
365                else:
366                    weights = base_weights
367
368                total = sum(
369                    e.value * weights.get(e.name, 0)
370                    for e in evaluations
371                    if isinstance(e.value, (int, float))
372                )
373
374                return Evaluation(
375                    name="weighted_composite",
376                    value=total,
377                    comment="Context-aware weighted composite"
378                )
379            ```
380        """
381        ...

Protocol defining the interface for composite evaluator functions.

Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.

Composite evaluators:

  • Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
  • Return either a single Evaluation, a list of Evaluations, or a dict
  • Can be either synchronous or asynchronous
  • Have access to both raw item data and evaluation results
CompositeEvaluatorFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class EvaluatorStats:
384class EvaluatorStats:
385    """Statistics for a single evaluator's performance during batch evaluation.
386
387    This class tracks detailed metrics about how a specific evaluator performed
388    across all items in a batch evaluation run. It helps identify evaluator issues,
389    understand reliability, and optimize evaluation pipelines.
390
391    Attributes:
392        name: The name of the evaluator function (extracted from __name__).
393        total_runs: Total number of times the evaluator was invoked.
394        successful_runs: Number of times the evaluator completed successfully.
395        failed_runs: Number of times the evaluator raised an exception or failed.
396        total_scores_created: Total number of evaluation scores created by this evaluator.
397            Can be higher than successful_runs if the evaluator returns multiple scores.
398
399    Examples:
400        Accessing evaluator stats from batch evaluation result:
401        ```python
402        result = client.run_batched_evaluation(...)
403
404        for stats in result.evaluator_stats:
405            print(f"Evaluator: {stats.name}")
406            print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
407            print(f"  Scores created: {stats.total_scores_created}")
408
409            if stats.failed_runs > 0:
 410                print(f"  ⚠️  Failed {stats.failed_runs} times")
411        ```
412
413        Identifying problematic evaluators:
414        ```python
415        result = client.run_batched_evaluation(...)
416
417        # Find evaluators with high failure rates
418        for stats in result.evaluator_stats:
419            failure_rate = stats.failed_runs / stats.total_runs
420            if failure_rate > 0.1:  # More than 10% failures
 421                print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
422                print(f"    Consider debugging or removing this evaluator")
423        ```
424
425    Note:
426        All arguments must be passed as keywords when instantiating this class.
427    """
428
429    def __init__(
430        self,
431        *,
432        name: str,
433        total_runs: int = 0,
434        successful_runs: int = 0,
435        failed_runs: int = 0,
436        total_scores_created: int = 0,
437    ):
438        """Initialize EvaluatorStats with the provided metrics.
439
440        Args:
441            name: The evaluator function name.
442            total_runs: Total number of evaluator invocations.
443            successful_runs: Number of successful completions.
444            failed_runs: Number of failures.
445            total_scores_created: Total scores created by this evaluator.
446
447        Note:
448            All arguments must be provided as keywords.
449        """
450        self.name = name
451        self.total_runs = total_runs
452        self.successful_runs = successful_runs
453        self.failed_runs = failed_runs
454        self.total_scores_created = total_scores_created

Statistics for a single evaluator's performance during batch evaluation.

This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.

Attributes:
  • name: The name of the evaluator function (extracted from __name__).
  • total_runs: Total number of times the evaluator was invoked.
  • successful_runs: Number of times the evaluator completed successfully.
  • failed_runs: Number of times the evaluator raised an exception or failed.
  • total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:

Accessing evaluator stats from batch evaluation result:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    print(f"Evaluator: {stats.name}")
    print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️  Failed {stats.failed_runs} times")

Identifying problematic evaluators:

result = client.run_batched_evaluation(...)

# Find evaluators with high failure rates
for stats in result.evaluator_stats:
    failure_rate = stats.failed_runs / stats.total_runs
    if failure_rate > 0.1:  # More than 10% failures
        print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
        print(f"    Consider debugging or removing this evaluator")
Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorStats( *, name: str, total_runs: int = 0, successful_runs: int = 0, failed_runs: int = 0, total_scores_created: int = 0)
429    def __init__(
430        self,
431        *,
432        name: str,
433        total_runs: int = 0,
434        successful_runs: int = 0,
435        failed_runs: int = 0,
436        total_scores_created: int = 0,
437    ):
438        """Initialize EvaluatorStats with the provided metrics.
439
440        Args:
441            name: The evaluator function name.
442            total_runs: Total number of evaluator invocations.
443            successful_runs: Number of successful completions.
444            failed_runs: Number of failures.
445            total_scores_created: Total scores created by this evaluator.
446
447        Note:
448            All arguments must be provided as keywords.
449        """
450        self.name = name
451        self.total_runs = total_runs
452        self.successful_runs = successful_runs
453        self.failed_runs = failed_runs
454        self.total_scores_created = total_scores_created

Initialize EvaluatorStats with the provided metrics.

Arguments:
  • name: The evaluator function name.
  • total_runs: Total number of evaluator invocations.
  • successful_runs: Number of successful completions.
  • failed_runs: Number of failures.
  • total_scores_created: Total scores created by this evaluator.
Note:

All arguments must be provided as keywords.

name
total_runs
successful_runs
failed_runs
total_scores_created
class BatchEvaluationResumeToken:
457class BatchEvaluationResumeToken:
458    """Token for resuming a failed batch evaluation run.
459
460    This class encapsulates all the information needed to resume a batch evaluation
461    that was interrupted or failed partway through. It uses timestamp-based filtering
462    to avoid re-processing items that were already evaluated, even if the underlying
463    dataset changed between runs.
464
465    Attributes:
466        scope: The type of items being evaluated ("traces", "observations").
467        filter: The original JSON filter string used to query items.
468        last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item.
469            Used to construct a filter that only fetches items after this timestamp.
470        last_processed_id: The ID of the last successfully processed item, for reference.
471        items_processed: Count of items successfully processed before interruption.
472
473    Examples:
474        Resuming a failed batch evaluation:
475        ```python
476        # Initial run that fails partway through
477        try:
478            result = client.run_batched_evaluation(
479                scope="traces",
480                mapper=my_mapper,
481                evaluators=[evaluator1, evaluator2],
482                filter='{"tags": ["production"]}',
483                max_items=10000
484            )
485        except Exception as e:
486            print(f"Evaluation failed: {e}")
487
488            # Save the resume token
489            if result.resume_token:
490                # Store resume token for later (e.g., in a file or database)
491                import json
492                with open("resume_token.json", "w") as f:
493                    json.dump({
494                        "scope": result.resume_token.scope,
495                        "filter": result.resume_token.filter,
496                        "last_timestamp": result.resume_token.last_processed_timestamp,
497                        "last_id": result.resume_token.last_processed_id,
498                        "items_done": result.resume_token.items_processed
499                    }, f)
500
501        # Later, resume from where it left off
502        with open("resume_token.json") as f:
503            token_data = json.load(f)
504
505        resume_token = BatchEvaluationResumeToken(
506            scope=token_data["scope"],
507            filter=token_data["filter"],
508            last_processed_timestamp=token_data["last_timestamp"],
509            last_processed_id=token_data["last_id"],
510            items_processed=token_data["items_done"]
511        )
512
513        # Resume the evaluation
514        result = client.run_batched_evaluation(
515            scope="traces",
516            mapper=my_mapper,
517            evaluators=[evaluator1, evaluator2],
518            resume_from=resume_token
519        )
520
521        print(f"Processed {result.total_items_processed} additional items")
522        ```
523
524        Handling partial completion:
525        ```python
526        result = client.run_batched_evaluation(...)
527
528        if not result.completed:
529            print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
530            print(f"Last item: {result.resume_token.last_processed_id}")
531            print(f"Resume from: {result.resume_token.last_processed_timestamp}")
532
533            # Optionally retry automatically
534            if result.resume_token:
535                print("Retrying...")
536                result = client.run_batched_evaluation(
537                    scope=result.resume_token.scope,
538                    mapper=my_mapper,
539                    evaluators=my_evaluators,
540                    resume_from=result.resume_token
541                )
542        ```
543
544    Note:
545        All arguments must be passed as keywords when instantiating this class.
546        The timestamp-based approach means that items created after the initial run
547        but before the timestamp will be skipped. This is intentional to avoid
548        duplicates and ensure consistent evaluation.
549    """
550
551    def __init__(
552        self,
553        *,
554        scope: str,
555        filter: Optional[str],
556        last_processed_timestamp: str,
557        last_processed_id: str,
558        items_processed: int,
559    ):
560        """Initialize BatchEvaluationResumeToken with the provided state.
561
562        Args:
563            scope: The scope type ("traces", "observations").
564            filter: The original JSON filter string.
565            last_processed_timestamp: ISO 8601 timestamp of last processed item.
566            last_processed_id: ID of last processed item.
567            items_processed: Count of items processed before interruption.
568
569        Note:
570            All arguments must be provided as keywords.
571        """
572        self.scope = scope
573        self.filter = filter
574        self.last_processed_timestamp = last_processed_timestamp
575        self.last_processed_id = last_processed_id
576        self.items_processed = items_processed

Token for resuming a failed batch evaluation run.

This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.

Attributes:
  • scope: The type of items being evaluated ("traces", "observations").
  • filter: The original JSON filter string used to query items.
  • last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
  • last_processed_id: The ID of the last successfully processed item, for reference.
  • items_processed: Count of items successfully processed before interruption.
Examples:

Resuming a failed batch evaluation:

# Initial run that fails partway through
try:
    result = client.run_batched_evaluation(
        scope="traces",
        mapper=my_mapper,
        evaluators=[evaluator1, evaluator2],
        filter='{"tags": ["production"]}',
        max_items=10000
    )
except Exception as e:
    print(f"Evaluation failed: {e}")

    # Save the resume token
    # NOTE(review): `result` is unbound here if run_batched_evaluation itself
    # raised — bind `result = None` before the try and guard accordingly.
    if result.resume_token:
        # Store resume token for later (e.g., in a file or database)
        import json
        with open("resume_token.json", "w") as f:
            json.dump({
                "scope": result.resume_token.scope,
                "filter": result.resume_token.filter,
                "last_timestamp": result.resume_token.last_processed_timestamp,
                "last_id": result.resume_token.last_processed_id,
                "items_done": result.resume_token.items_processed
            }, f)

# Later, resume from where it left off
with open("resume_token.json") as f:
    token_data = json.load(f)

resume_token = BatchEvaluationResumeToken(
    scope=token_data["scope"],
    filter=token_data["filter"],
    last_processed_timestamp=token_data["last_timestamp"],
    last_processed_id=token_data["last_id"],
    items_processed=token_data["items_done"]
)

# Resume the evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    resume_from=resume_token
)

print(f"Processed {result.total_items_processed} additional items")

Handling partial completion:

result = client.run_batched_evaluation(...)

if not result.completed:
    print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
    print(f"Last item: {result.resume_token.last_processed_id}")
    print(f"Resume from: {result.resume_token.last_processed_timestamp}")

    # Optionally retry automatically
    if result.resume_token:
        print("Retrying...")
        result = client.run_batched_evaluation(
            scope=result.resume_token.scope,
            mapper=my_mapper,
            evaluators=my_evaluators,
            resume_from=result.resume_token
        )
Note:

All arguments must be passed as keywords when instantiating this class. The timestamp-based approach means that items created after the initial run but before the timestamp will be skipped. This is intentional to avoid duplicates and ensure consistent evaluation.

BatchEvaluationResumeToken( *, scope: str, filter: Optional[str], last_processed_timestamp: str, last_processed_id: str, items_processed: int)
551    def __init__(
552        self,
553        *,
554        scope: str,
555        filter: Optional[str],
556        last_processed_timestamp: str,
557        last_processed_id: str,
558        items_processed: int,
559    ):
560        """Initialize BatchEvaluationResumeToken with the provided state.
561
562        Args:
563            scope: The scope type ("traces", "observations").
564            filter: The original JSON filter string.
565            last_processed_timestamp: ISO 8601 timestamp of last processed item.
566            last_processed_id: ID of last processed item.
567            items_processed: Count of items processed before interruption.
568
569        Note:
570            All arguments must be provided as keywords.
571        """
572        self.scope = scope
573        self.filter = filter
574        self.last_processed_timestamp = last_processed_timestamp
575        self.last_processed_id = last_processed_id
576        self.items_processed = items_processed

Initialize BatchEvaluationResumeToken with the provided state.

Arguments:
  • scope: The scope type ("traces", "observations").
  • filter: The original JSON filter string.
  • last_processed_timestamp: ISO 8601 timestamp of last processed item.
  • last_processed_id: ID of last processed item.
  • items_processed: Count of items processed before interruption.
Note:

All arguments must be provided as keywords.

scope
filter
last_processed_timestamp
last_processed_id
items_processed
class BatchEvaluationResult:
579class BatchEvaluationResult:
580    r"""Complete result structure for batch evaluation execution.
581
582    This class encapsulates comprehensive statistics and metadata about a batch
583    evaluation run, including counts, evaluator-specific metrics, timing information,
584    error details, and resume capability.
585
586    Attributes:
587        total_items_fetched: Total number of items fetched from the API.
588        total_items_processed: Number of items successfully evaluated.
589        total_items_failed: Number of items that failed during evaluation.
590        total_scores_created: Total scores created by all item-level evaluators.
591        total_composite_scores_created: Scores created by the composite evaluator.
592        total_evaluations_failed: Number of individual evaluator failures across all items.
593        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
594        resume_token: Token for resuming if evaluation was interrupted (None if completed).
595        completed: True if all items were processed, False if stopped early or failed.
596        duration_seconds: Total time taken to execute the batch evaluation.
597        failed_item_ids: List of IDs for items that failed evaluation.
598        error_summary: Dictionary mapping error types to occurrence counts.
599        has_more_items: True if max_items limit was reached but more items exist.
600        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
601
602    Examples:
603        Basic result inspection:
604        ```python
605        result = client.run_batched_evaluation(...)
606
607        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
608        print(f"Scores created: {result.total_scores_created}")
609        print(f"Duration: {result.duration_seconds:.2f}s")
610        print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
611        ```
612
613        Detailed analysis with evaluator stats:
614        ```python
615        result = client.run_batched_evaluation(...)
616
617        print(f"\n📊 Batch Evaluation Results")
618        print(f"{'='*50}")
619        print(f"Items processed: {result.total_items_processed}")
620        print(f"Items failed: {result.total_items_failed}")
621        print(f"Scores created: {result.total_scores_created}")
622
623        if result.total_composite_scores_created > 0:
624            print(f"Composite scores: {result.total_composite_scores_created}")
625
626        print(f"\n📈 Evaluator Performance:")
627        for stats in result.evaluator_stats:
628            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
629            print(f"\n  {stats.name}:")
630            print(f"    Success rate: {success_rate:.1%}")
631            print(f"    Scores created: {stats.total_scores_created}")
632            if stats.failed_runs > 0:
633                print(f"    âš ī¸  Failures: {stats.failed_runs}")
634
635        if result.error_summary:
636            print(f"\nâš ī¸  Errors encountered:")
637            for error_type, count in result.error_summary.items():
638                print(f"    {error_type}: {count}")
639        ```
640
641        Handling incomplete runs:
642        ```python
643        result = client.run_batched_evaluation(...)
644
645        if not result.completed:
646            print("âš ī¸  Evaluation incomplete!")
647
648            if result.resume_token:
649                print(f"Processed {result.resume_token.items_processed} items before failure")
650                print(f"Use resume_from parameter to continue from:")
651                print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
652                print(f"  Last ID: {result.resume_token.last_processed_id}")
653
654        if result.has_more_items:
655            print(f"â„šī¸  More items available beyond max_items limit")
656        ```
657
658        Performance monitoring:
659        ```python
660        result = client.run_batched_evaluation(...)
661
662        items_per_second = result.total_items_processed / result.duration_seconds
663        avg_scores_per_item = result.total_scores_created / result.total_items_processed
664
665        print(f"Performance metrics:")
666        print(f"  Throughput: {items_per_second:.2f} items/second")
667        print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
668        print(f"  Total duration: {result.duration_seconds:.2f}s")
669
670        if result.total_evaluations_failed > 0:
671            failure_rate = result.total_evaluations_failed / (
672                result.total_items_processed * len(result.evaluator_stats)
673            )
674            print(f"  Evaluation failure rate: {failure_rate:.1%}")
675        ```
676
677    Note:
678        All arguments must be passed as keywords when instantiating this class.
679    """
680
681    def __init__(
682        self,
683        *,
684        total_items_fetched: int,
685        total_items_processed: int,
686        total_items_failed: int,
687        total_scores_created: int,
688        total_composite_scores_created: int,
689        total_evaluations_failed: int,
690        evaluator_stats: List[EvaluatorStats],
691        resume_token: Optional[BatchEvaluationResumeToken],
692        completed: bool,
693        duration_seconds: float,
694        failed_item_ids: List[str],
695        error_summary: Dict[str, int],
696        has_more_items: bool,
697        item_evaluations: Dict[str, List["Evaluation"]],
698    ):
699        """Initialize BatchEvaluationResult with comprehensive statistics.
700
701        Args:
702            total_items_fetched: Total items fetched from API.
703            total_items_processed: Items successfully evaluated.
704            total_items_failed: Items that failed evaluation.
705            total_scores_created: Scores from item-level evaluators.
706            total_composite_scores_created: Scores from composite evaluator.
707            total_evaluations_failed: Individual evaluator failures.
708            evaluator_stats: Per-evaluator statistics.
709            resume_token: Token for resuming (None if completed).
710            completed: Whether all items were processed.
711            duration_seconds: Total execution time.
712            failed_item_ids: IDs of failed items.
713            error_summary: Error types and counts.
714            has_more_items: Whether more items exist beyond max_items.
715            item_evaluations: Dictionary mapping item IDs to their evaluation results.
716
717        Note:
718            All arguments must be provided as keywords.
719        """
720        self.total_items_fetched = total_items_fetched
721        self.total_items_processed = total_items_processed
722        self.total_items_failed = total_items_failed
723        self.total_scores_created = total_scores_created
724        self.total_composite_scores_created = total_composite_scores_created
725        self.total_evaluations_failed = total_evaluations_failed
726        self.evaluator_stats = evaluator_stats
727        self.resume_token = resume_token
728        self.completed = completed
729        self.duration_seconds = duration_seconds
730        self.failed_item_ids = failed_item_ids
731        self.error_summary = error_summary
732        self.has_more_items = has_more_items
733        self.item_evaluations = item_evaluations
734
735    def __str__(self) -> str:
736        """Return a formatted string representation of the batch evaluation results.
737
738        Returns:
739            A multi-line string with a summary of the evaluation results.
740        """
741        lines = []
742        lines.append("=" * 60)
743        lines.append("Batch Evaluation Results")
744        lines.append("=" * 60)
745
746        # Summary statistics
747        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
748        lines.append(f"Duration: {self.duration_seconds:.2f}s")
749        lines.append(f"\nItems fetched: {self.total_items_fetched}")
750        lines.append(f"Items processed: {self.total_items_processed}")
751
752        if self.total_items_failed > 0:
753            lines.append(f"Items failed: {self.total_items_failed}")
754
755        # Success rate
756        if self.total_items_fetched > 0:
757            success_rate = self.total_items_processed / self.total_items_fetched * 100
758            lines.append(f"Success rate: {success_rate:.1f}%")
759
760        # Scores created
761        lines.append(f"\nScores created: {self.total_scores_created}")
762        if self.total_composite_scores_created > 0:
763            lines.append(f"Composite scores: {self.total_composite_scores_created}")
764
765        total_scores = self.total_scores_created + self.total_composite_scores_created
766        lines.append(f"Total scores: {total_scores}")
767
768        # Evaluator statistics
769        if self.evaluator_stats:
770            lines.append("\nEvaluator Performance:")
771            for stats in self.evaluator_stats:
772                lines.append(f"  {stats.name}:")
773                if stats.total_runs > 0:
774                    success_rate = (
775                        stats.successful_runs / stats.total_runs * 100
776                        if stats.total_runs > 0
777                        else 0
778                    )
779                    lines.append(
780                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
781                        f"({success_rate:.1f}% success)"
782                    )
783                    lines.append(f"    Scores created: {stats.total_scores_created}")
784                    if stats.failed_runs > 0:
785                        lines.append(f"    Failed runs: {stats.failed_runs}")
786
787        # Performance metrics
788        if self.total_items_processed > 0 and self.duration_seconds > 0:
789            items_per_sec = self.total_items_processed / self.duration_seconds
790            lines.append("\nPerformance:")
791            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
792            if self.total_scores_created > 0:
793                avg_scores = self.total_scores_created / self.total_items_processed
794                lines.append(f"  Avg scores per item: {avg_scores:.2f}")
795
796        # Errors and warnings
797        if self.error_summary:
798            lines.append("\nErrors encountered:")
799            for error_type, count in self.error_summary.items():
800                lines.append(f"  {error_type}: {count}")
801
802        # Incomplete run information
803        if not self.completed:
804            lines.append("\nWarning: Evaluation incomplete")
805            if self.resume_token:
806                lines.append(
807                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
808                )
809                lines.append(f"  Items processed: {self.resume_token.items_processed}")
810                lines.append("  Use resume_from parameter to continue")
811
812        if self.has_more_items:
813            lines.append("\nNote: More items available beyond max_items limit")
814
815        lines.append("=" * 60)
816        return "\n".join(lines)

Complete result structure for batch evaluation execution.

This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.

Attributes:
  • total_items_fetched: Total number of items fetched from the API.
  • total_items_processed: Number of items successfully evaluated.
  • total_items_failed: Number of items that failed during evaluation.
  • total_scores_created: Total scores created by all item-level evaluators.
  • total_composite_scores_created: Scores created by the composite evaluator.
  • total_evaluations_failed: Number of individual evaluator failures across all items.
  • evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
  • resume_token: Token for resuming if evaluation was interrupted (None if completed).
  • completed: True if all items were processed, False if stopped early or failed.
  • duration_seconds: Total time taken to execute the batch evaluation.
  • failed_item_ids: List of IDs for items that failed evaluation.
  • error_summary: Dictionary mapping error types to occurrence counts.
  • has_more_items: True if max_items limit was reached but more items exist.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:

Basic result inspection:

result = client.run_batched_evaluation(...)

print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
print(f"Scores created: {result.total_scores_created}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")

Detailed analysis with evaluator stats:

result = client.run_batched_evaluation(...)

print(f"\n📊 Batch Evaluation Results")
print(f"{'='*50}")
print(f"Items processed: {result.total_items_processed}")
print(f"Items failed: {result.total_items_failed}")
print(f"Scores created: {result.total_scores_created}")

if result.total_composite_scores_created > 0:
    print(f"Composite scores: {result.total_composite_scores_created}")

print(f"\n📈 Evaluator Performance:")
for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
    print(f"\n  {stats.name}:")
    print(f"    Success rate: {success_rate:.1%}")
    print(f"    Scores created: {stats.total_scores_created}")
    if stats.failed_runs > 0:
        print(f"    ⚠ī¸  Failures: {stats.failed_runs}")

if result.error_summary:
    print(f"\n⚠ī¸  Errors encountered:")
    for error_type, count in result.error_summary.items():
        print(f"    {error_type}: {count}")

Handling incomplete runs:

result = client.run_batched_evaluation(...)

if not result.completed:
    print("⚠ī¸  Evaluation incomplete!")

    if result.resume_token:
        print(f"Processed {result.resume_token.items_processed} items before failure")
        print(f"Use resume_from parameter to continue from:")
        print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
        print(f"  Last ID: {result.resume_token.last_processed_id}")

if result.has_more_items:
    print(f"ℹī¸  More items available beyond max_items limit")

Performance monitoring:

result = client.run_batched_evaluation(...)

items_per_second = result.total_items_processed / result.duration_seconds
avg_scores_per_item = result.total_scores_created / result.total_items_processed

print(f"Performance metrics:")
print(f"  Throughput: {items_per_second:.2f} items/second")
print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
print(f"  Total duration: {result.duration_seconds:.2f}s")

if result.total_evaluations_failed > 0:
    failure_rate = result.total_evaluations_failed / (
        result.total_items_processed * len(result.evaluator_stats)
    )
    print(f"  Evaluation failure rate: {failure_rate:.1%}")
Note:

All arguments must be passed as keywords when instantiating this class.

BatchEvaluationResult( *, total_items_fetched: int, total_items_processed: int, total_items_failed: int, total_scores_created: int, total_composite_scores_created: int, total_evaluations_failed: int, evaluator_stats: List[EvaluatorStats], resume_token: Optional[BatchEvaluationResumeToken], completed: bool, duration_seconds: float, failed_item_ids: List[str], error_summary: Dict[str, int], has_more_items: bool, item_evaluations: Dict[str, List[Evaluation]])
681    def __init__(
682        self,
683        *,
684        total_items_fetched: int,
685        total_items_processed: int,
686        total_items_failed: int,
687        total_scores_created: int,
688        total_composite_scores_created: int,
689        total_evaluations_failed: int,
690        evaluator_stats: List[EvaluatorStats],
691        resume_token: Optional[BatchEvaluationResumeToken],
692        completed: bool,
693        duration_seconds: float,
694        failed_item_ids: List[str],
695        error_summary: Dict[str, int],
696        has_more_items: bool,
697        item_evaluations: Dict[str, List["Evaluation"]],
698    ):
699        """Initialize BatchEvaluationResult with comprehensive statistics.
700
701        Args:
702            total_items_fetched: Total items fetched from API.
703            total_items_processed: Items successfully evaluated.
704            total_items_failed: Items that failed evaluation.
705            total_scores_created: Scores from item-level evaluators.
706            total_composite_scores_created: Scores from composite evaluator.
707            total_evaluations_failed: Individual evaluator failures.
708            evaluator_stats: Per-evaluator statistics.
709            resume_token: Token for resuming (None if completed).
710            completed: Whether all items were processed.
711            duration_seconds: Total execution time.
712            failed_item_ids: IDs of failed items.
713            error_summary: Error types and counts.
714            has_more_items: Whether more items exist beyond max_items.
715            item_evaluations: Dictionary mapping item IDs to their evaluation results.
716
717        Note:
718            All arguments must be provided as keywords.
719        """
720        self.total_items_fetched = total_items_fetched
721        self.total_items_processed = total_items_processed
722        self.total_items_failed = total_items_failed
723        self.total_scores_created = total_scores_created
724        self.total_composite_scores_created = total_composite_scores_created
725        self.total_evaluations_failed = total_evaluations_failed
726        self.evaluator_stats = evaluator_stats
727        self.resume_token = resume_token
728        self.completed = completed
729        self.duration_seconds = duration_seconds
730        self.failed_item_ids = failed_item_ids
731        self.error_summary = error_summary
732        self.has_more_items = has_more_items
733        self.item_evaluations = item_evaluations

Initialize BatchEvaluationResult with comprehensive statistics.

Arguments:
  • total_items_fetched: Total items fetched from API.
  • total_items_processed: Items successfully evaluated.
  • total_items_failed: Items that failed evaluation.
  • total_scores_created: Scores from item-level evaluators.
  • total_composite_scores_created: Scores from composite evaluator.
  • total_evaluations_failed: Individual evaluator failures.
  • evaluator_stats: Per-evaluator statistics.
  • resume_token: Token for resuming (None if completed).
  • completed: Whether all items were processed.
  • duration_seconds: Total execution time.
  • failed_item_ids: IDs of failed items.
  • error_summary: Error types and counts.
  • has_more_items: Whether more items exist beyond max_items.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:

All arguments must be provided as keywords.

total_items_fetched
total_items_processed
total_items_failed
total_scores_created
total_composite_scores_created
total_evaluations_failed
evaluator_stats
resume_token
completed
duration_seconds
failed_item_ids
error_summary
has_more_items
item_evaluations
def is_default_export_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
73def is_default_export_span(span: ReadableSpan) -> bool:
74    """Return whether a span should be exported by default."""
75    return (
76        is_langfuse_span(span) or is_genai_span(span) or is_known_llm_instrumentor(span)
77    )

Return whether a span should be exported by default.

def is_langfuse_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
36def is_langfuse_span(span: ReadableSpan) -> bool:
37    """Return whether the span was created by the Langfuse SDK tracer."""
38    return (
39        span.instrumentation_scope is not None
40        and span.instrumentation_scope.name == LANGFUSE_TRACER_NAME
41    )

Return whether the span was created by the Langfuse SDK tracer.

def is_genai_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
44def is_genai_span(span: ReadableSpan) -> bool:
45    """Return whether the span has any ``gen_ai.*`` semantic convention attribute."""
46    if span.attributes is None:
47        return False
48
49    return any(
50        isinstance(key, str) and key.startswith("gen_ai")
51        for key in span.attributes.keys()
52    )

Return whether the span has any gen_ai.* semantic convention attribute.

def is_known_llm_instrumentor(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
60def is_known_llm_instrumentor(span: ReadableSpan) -> bool:
61    """Return whether the span comes from a known LLM instrumentation scope."""
62    if span.instrumentation_scope is None:
63        return False
64
65    scope_name = span.instrumentation_scope.name
66
67    return any(
68        _matches_scope_prefix(scope_name, prefix)
69        for prefix in KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES
70    )

Return whether the span comes from a known LLM instrumentation scope.

KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES = frozenset({'haystack', 'ai', 'openinference', 'langsmith', 'langfuse-sdk', 'strands-agents', 'opentelemetry.instrumentation.anthropic', 'agent_framework', 'vllm', 'litellm'})