langfuse

 1""".. include:: ../README.md"""
 2
 3from langfuse.batch_evaluation import (
 4    BatchEvaluationResult,
 5    BatchEvaluationResumeToken,
 6    CompositeEvaluatorFunction,
 7    EvaluatorInputs,
 8    EvaluatorStats,
 9    MapperFunction,
10)
11from langfuse.experiment import Evaluation, RegressionError, RunnerContext
12
13from ._client import client as _client_module
14from ._client.attributes import LangfuseOtelSpanAttributes
15from ._client.constants import ObservationTypeLiteral
16from ._client.get_client import get_client
17from ._client.observe import observe
18from ._client.propagation import propagate_attributes
19from ._client.span import (
20    LangfuseAgent,
21    LangfuseChain,
22    LangfuseEmbedding,
23    LangfuseEvaluator,
24    LangfuseEvent,
25    LangfuseGeneration,
26    LangfuseGuardrail,
27    LangfuseRetriever,
28    LangfuseSpan,
29    LangfuseTool,
30)
31from ._version import __version__
32from .span_filter import (
33    KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES,
34    is_default_export_span,
35    is_genai_span,
36    is_known_llm_instrumentor,
37    is_langfuse_span,
38)
39
# Re-export the client class at package level so that
# `from langfuse import Langfuse` resolves to `_client.client.Langfuse`.
Langfuse = _client_module.Langfuse

# Names exported by `from langfuse import *` (and treated as the public API
# by documentation tools that honor `__all__`).
__all__ = [
    # Client class and tracing entry points.
    "Langfuse",
    "get_client",
    "observe",
    "propagate_attributes",
    "ObservationTypeLiteral",
    # Observation classes (from ._client.span) and LangfuseOtelSpanAttributes
    # (from ._client.attributes).
    "LangfuseSpan",
    "LangfuseGeneration",
    "LangfuseEvent",
    "LangfuseOtelSpanAttributes",
    "LangfuseAgent",
    "LangfuseTool",
    "LangfuseChain",
    "LangfuseEmbedding",
    "LangfuseEvaluator",
    "LangfuseRetriever",
    "LangfuseGuardrail",
    # Names imported from langfuse.experiment and langfuse.batch_evaluation.
    "Evaluation",
    "EvaluatorInputs",
    "MapperFunction",
    "CompositeEvaluatorFunction",
    "EvaluatorStats",
    "BatchEvaluationResumeToken",
    "BatchEvaluationResult",
    "RunnerContext",
    "RegressionError",
    "__version__",
    # Span-filter helpers (from .span_filter).
    "is_default_export_span",
    "is_langfuse_span",
    "is_genai_span",
    "is_known_llm_instrumentor",
    "KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES",
    # Submodule names. `experiment` is loaded by the `from langfuse.experiment`
    # import above; `api` is not imported directly in this file — presumably it
    # is loaded transitively via `._client` (TODO confirm).
    "experiment",
    "api",
]
class Langfuse:
 142class Langfuse:
 143    """Main client for Langfuse tracing and platform features.
 144
 145    This class provides an interface for creating and managing traces, spans,
 146    and generations in Langfuse as well as interacting with the Langfuse API.
 147
 148    The client features a thread-safe singleton pattern for each unique public API key,
 149    ensuring consistent trace context propagation across your application. It implements
 150    efficient batching of spans with configurable flush settings and includes background
 151    thread management for media uploads and score ingestion.
 152
 153    Configuration is flexible through either direct parameters or environment variables,
 154    with graceful fallbacks and runtime configuration updates.
 155
 156    Attributes:
 157        api: Synchronous API client for Langfuse backend communication
 158        async_api: Asynchronous API client for Langfuse backend communication
 159        _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
 160
 161    Parameters:
 162        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
 163        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
 164        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
 165        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
 166        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
 167        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
 168        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
 169        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
 170        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
 171        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
 172        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
 173        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
 174        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
 175        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
 176        mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
 177        blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:
 178            ```python
 179            from langfuse.span_filter import is_default_export_span
 180            blocked = {"sqlite", "requests"}
 181
 182            should_export_span = lambda span: (
 183                is_default_export_span(span)
 184                and (
 185                    span.instrumentation_scope is None
 186                    or span.instrumentation_scope.name not in blocked
 187                )
 188            )
 189            ```
 190        should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
 191        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If `span_exporter` is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
 192        tracer_provider(Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
 193        span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire `base_url`, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include `x-langfuse-ingestion-version=4` on the exporter to enable real time processing of exported spans.
 194
 195    Example:
 196        ```python
 197        from langfuse import Langfuse
 198
 199        # Initialize the client (reads from env vars if not provided)
 200        langfuse = Langfuse(
 201            public_key="your-public-key",
 202            secret_key="your-secret-key",
 203            base_url="https://cloud.langfuse.com",  # Optional, default shown
 204        )
 205
 206        # Create a trace span
 207        with langfuse.start_as_current_observation(name="process-query") as span:
 208            # Your application code here
 209
 210            # Create a nested generation span for an LLM call
 211            with span.start_as_current_generation(
 212                name="generate-response",
 213                model="gpt-4",
 214                input={"query": "Tell me about AI"},
 215                model_parameters={"temperature": 0.7, "max_tokens": 500}
 216            ) as generation:
 217                # Generate response here
 218                response = "AI is a field of computer science..."
 219
 220                generation.update(
 221                    output=response,
 222                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
 223                    cost_details={"total_cost": 0.0023}
 224                )
 225
 226                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
 227                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
 228        ```
 229    """
 230
 231    _resources: Optional[LangfuseResourceManager] = None
 232    _mask: Optional[MaskFunction] = None
 233    _otel_tracer: otel_trace_api.Tracer
 234
 235    def __init__(
 236        self,
 237        *,
 238        public_key: Optional[str] = None,
 239        secret_key: Optional[str] = None,
 240        base_url: Optional[str] = None,
 241        host: Optional[str] = None,
 242        timeout: Optional[int] = None,
 243        httpx_client: Optional[httpx.Client] = None,
 244        debug: bool = False,
 245        tracing_enabled: Optional[bool] = True,
 246        flush_at: Optional[int] = None,
 247        flush_interval: Optional[float] = None,
 248        environment: Optional[str] = None,
 249        release: Optional[str] = None,
 250        media_upload_thread_count: Optional[int] = None,
 251        sample_rate: Optional[float] = None,
 252        mask: Optional[MaskFunction] = None,
 253        blocked_instrumentation_scopes: Optional[List[str]] = None,
 254        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
 255        additional_headers: Optional[Dict[str, str]] = None,
 256        tracer_provider: Optional[TracerProvider] = None,
 257        span_exporter: Optional[SpanExporter] = None,
 258    ):
 259        self._base_url = (
 260            base_url
 261            or os.environ.get(LANGFUSE_BASE_URL)
 262            or host
 263            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
 264        )
 265        self._environment = environment or cast(
 266            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
 267        )
 268        self._release = (
 269            release
 270            or os.environ.get(LANGFUSE_RELEASE, None)
 271            or get_common_release_envs()
 272        )
 273        self._project_id: Optional[str] = None
 274        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
 275        if not 0.0 <= sample_rate <= 1.0:
 276            raise ValueError(
 277                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
 278            )
 279
 280        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
 281
 282        self._tracing_enabled = (
 283            tracing_enabled
 284            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
 285        )
 286        if not self._tracing_enabled:
 287            langfuse_logger.info(
 288                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
 289            )
 290
 291        debug = (
 292            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
 293        )
 294        if debug:
 295            logging.basicConfig(
 296                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 297            )
 298            langfuse_logger.setLevel(logging.DEBUG)
 299
 300        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
 301        if public_key is None:
 302            langfuse_logger.warning(
 303                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
 304                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
 305            )
 306            self._otel_tracer = otel_trace_api.NoOpTracer()
 307            return
 308
 309        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
 310        if secret_key is None:
 311            langfuse_logger.warning(
 312                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
 313                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
 314            )
 315            self._otel_tracer = otel_trace_api.NoOpTracer()
 316            return
 317
 318        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
 319            langfuse_logger.warning(
 320                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
 321            )
 322
 323        if blocked_instrumentation_scopes is not None:
 324            warnings.warn(
 325                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
 326                "Use `should_export_span` instead. Example: "
 327                "from langfuse.span_filter import is_default_export_span; "
 328                'blocked={"scope"}; should_export_span=lambda span: '
 329                "is_default_export_span(span) and (span.instrumentation_scope is None or "
 330                "span.instrumentation_scope.name not in blocked).",
 331                DeprecationWarning,
 332                stacklevel=2,
 333            )
 334
 335        # Initialize api and tracer if requirements are met
 336        self._resources = LangfuseResourceManager(
 337            public_key=public_key,
 338            secret_key=secret_key,
 339            base_url=self._base_url,
 340            timeout=timeout,
 341            environment=self._environment,
 342            release=release,
 343            flush_at=flush_at,
 344            flush_interval=flush_interval,
 345            httpx_client=httpx_client,
 346            media_upload_thread_count=media_upload_thread_count,
 347            sample_rate=sample_rate,
 348            mask=mask,
 349            tracing_enabled=self._tracing_enabled,
 350            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
 351            should_export_span=should_export_span,
 352            additional_headers=additional_headers,
 353            tracer_provider=tracer_provider,
 354            span_exporter=span_exporter,
 355        )
 356        self._mask = self._resources.mask
 357
 358        self._otel_tracer = (
 359            self._resources.tracer
 360            if self._tracing_enabled and self._resources.tracer is not None
 361            else otel_trace_api.NoOpTracer()
 362        )
 363        self.api = self._resources.api
 364        self.async_api = self._resources.async_api
 365
    # Typing overloads for start_observation: each one maps a literal `as_type`
    # to the concrete observation class returned. Only the "generation" and
    # "embedding" overloads accept the model-related parameters
    # (completion_start_time, model, model_parameters, usage_details,
    # cost_details, prompt).

    # as_type="generation" -> LangfuseGeneration
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseGeneration: ...

    # as_type="span" (the default when as_type is omitted) -> LangfuseSpan
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseSpan: ...

    # as_type="agent" -> LangfuseAgent
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseAgent: ...

    # as_type="tool" -> LangfuseTool
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["tool"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseTool: ...

    # as_type="chain" -> LangfuseChain
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseChain: ...

    # as_type="retriever" -> LangfuseRetriever
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseRetriever: ...

    # as_type="evaluator" -> LangfuseEvaluator
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseEvaluator: ...

    # as_type="embedding" -> LangfuseEmbedding (accepts the same model-related
    # parameters as the "generation" overload)
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseEmbedding: ...

    # as_type="guardrail" -> LangfuseGuardrail
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseGuardrail: ...
 512
 513    def start_observation(
 514        self,
 515        *,
 516        trace_context: Optional[TraceContext] = None,
 517        name: str,
 518        as_type: ObservationTypeLiteralNoEvent = "span",
 519        input: Optional[Any] = None,
 520        output: Optional[Any] = None,
 521        metadata: Optional[Any] = None,
 522        version: Optional[str] = None,
 523        level: Optional[SpanLevel] = None,
 524        status_message: Optional[str] = None,
 525        completion_start_time: Optional[datetime] = None,
 526        model: Optional[str] = None,
 527        model_parameters: Optional[Dict[str, MapValue]] = None,
 528        usage_details: Optional[Dict[str, int]] = None,
 529        cost_details: Optional[Dict[str, float]] = None,
 530        prompt: Optional[PromptClient] = None,
 531    ) -> Union[
 532        LangfuseSpan,
 533        LangfuseGeneration,
 534        LangfuseAgent,
 535        LangfuseTool,
 536        LangfuseChain,
 537        LangfuseRetriever,
 538        LangfuseEvaluator,
 539        LangfuseEmbedding,
 540        LangfuseGuardrail,
 541    ]:
 542        """Create a new observation of the specified type.
 543
 544        This method creates a new observation but does not set it as the current span in the
 545        context. To create and use an observation within a context, use start_as_current_observation().
 546
 547        Args:
 548            trace_context: Optional context for connecting to an existing trace
 549            name: Name of the observation
 550            as_type: Type of observation to create (defaults to "span")
 551            input: Input data for the operation
 552            output: Output data from the operation
 553            metadata: Additional metadata to associate with the observation
 554            version: Version identifier for the code or component
 555            level: Importance level of the observation
 556            status_message: Optional status message for the observation
 557            completion_start_time: When the model started generating (for generation types)
 558            model: Name/identifier of the AI model used (for generation types)
 559            model_parameters: Parameters used for the model (for generation types)
 560            usage_details: Token usage information (for generation types)
 561            cost_details: Cost information (for generation types)
 562            prompt: Associated prompt template (for generation types)
 563
 564        Returns:
 565            An observation object of the appropriate type that must be ended with .end()
 566        """
 567        if trace_context:
 568            trace_id = trace_context.get("trace_id", None)
 569            parent_span_id = trace_context.get("parent_span_id", None)
 570
 571            if trace_id:
 572                remote_parent_span = self._create_remote_parent_span(
 573                    trace_id=trace_id, parent_span_id=parent_span_id
 574                )
 575
 576                with otel_trace_api.use_span(
 577                    cast(otel_trace_api.Span, remote_parent_span)
 578                ):
 579                    otel_span = self._otel_tracer.start_span(name=name)
 580                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
 581
 582                    return self._create_observation_from_otel_span(
 583                        otel_span=otel_span,
 584                        as_type=as_type,
 585                        input=input,
 586                        output=output,
 587                        metadata=metadata,
 588                        version=version,
 589                        level=level,
 590                        status_message=status_message,
 591                        completion_start_time=completion_start_time,
 592                        model=model,
 593                        model_parameters=model_parameters,
 594                        usage_details=usage_details,
 595                        cost_details=cost_details,
 596                        prompt=prompt,
 597                    )
 598
 599        otel_span = self._otel_tracer.start_span(name=name)
 600
 601        return self._create_observation_from_otel_span(
 602            otel_span=otel_span,
 603            as_type=as_type,
 604            input=input,
 605            output=output,
 606            metadata=metadata,
 607            version=version,
 608            level=level,
 609            status_message=status_message,
 610            completion_start_time=completion_start_time,
 611            model=model,
 612            model_parameters=model_parameters,
 613            usage_details=usage_details,
 614            cost_details=cost_details,
 615            prompt=prompt,
 616        )
 617
 618    def _create_observation_from_otel_span(
 619        self,
 620        *,
 621        otel_span: otel_trace_api.Span,
 622        as_type: ObservationTypeLiteralNoEvent,
 623        input: Optional[Any] = None,
 624        output: Optional[Any] = None,
 625        metadata: Optional[Any] = None,
 626        version: Optional[str] = None,
 627        level: Optional[SpanLevel] = None,
 628        status_message: Optional[str] = None,
 629        completion_start_time: Optional[datetime] = None,
 630        model: Optional[str] = None,
 631        model_parameters: Optional[Dict[str, MapValue]] = None,
 632        usage_details: Optional[Dict[str, int]] = None,
 633        cost_details: Optional[Dict[str, float]] = None,
 634        prompt: Optional[PromptClient] = None,
 635    ) -> Union[
 636        LangfuseSpan,
 637        LangfuseGeneration,
 638        LangfuseAgent,
 639        LangfuseTool,
 640        LangfuseChain,
 641        LangfuseRetriever,
 642        LangfuseEvaluator,
 643        LangfuseEmbedding,
 644        LangfuseGuardrail,
 645    ]:
 646        """Create the appropriate observation type from an OTEL span."""
 647        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 648            observation_class = self._get_span_class(as_type)
 649            # Type ignore to prevent overloads of internal _get_span_class function,
 650            # issue is that LangfuseEvent could be returned and that classes have diff. args
 651            return observation_class(  # type: ignore[return-value,call-arg]
 652                otel_span=otel_span,
 653                langfuse_client=self,
 654                environment=self._environment,
 655                release=self._release,
 656                input=input,
 657                output=output,
 658                metadata=metadata,
 659                version=version,
 660                level=level,
 661                status_message=status_message,
 662                completion_start_time=completion_start_time,
 663                model=model,
 664                model_parameters=model_parameters,
 665                usage_details=usage_details,
 666                cost_details=cost_details,
 667                prompt=prompt,
 668            )
 669        else:
 670            # For other types (e.g. span, guardrail), create appropriate class without generation properties
 671            observation_class = self._get_span_class(as_type)
 672            # Type ignore to prevent overloads of internal _get_span_class function,
 673            # issue is that LangfuseEvent could be returned and that classes have diff. args
 674            return observation_class(  # type: ignore[return-value,call-arg]
 675                otel_span=otel_span,
 676                langfuse_client=self,
 677                environment=self._environment,
 678                release=self._release,
 679                input=input,
 680                output=output,
 681                metadata=metadata,
 682                version=version,
 683                level=level,
 684                status_message=status_message,
 685            )
 686            # span._observation_type = as_type
 687            # span._otel_span.set_attribute("langfuse.observation.type", as_type)
 688            # return span
 689
 690    @overload
 691    def start_as_current_observation(
 692        self,
 693        *,
 694        trace_context: Optional[TraceContext] = None,
 695        name: str,
 696        as_type: Literal["generation"],
 697        input: Optional[Any] = None,
 698        output: Optional[Any] = None,
 699        metadata: Optional[Any] = None,
 700        version: Optional[str] = None,
 701        level: Optional[SpanLevel] = None,
 702        status_message: Optional[str] = None,
 703        completion_start_time: Optional[datetime] = None,
 704        model: Optional[str] = None,
 705        model_parameters: Optional[Dict[str, MapValue]] = None,
 706        usage_details: Optional[Dict[str, int]] = None,
 707        cost_details: Optional[Dict[str, float]] = None,
 708        prompt: Optional[PromptClient] = None,
 709        end_on_exit: Optional[bool] = None,
 710    ) -> _AgnosticContextManager[LangfuseGeneration]: ...
 711
 712    @overload
 713    def start_as_current_observation(
 714        self,
 715        *,
 716        trace_context: Optional[TraceContext] = None,
 717        name: str,
 718        as_type: Literal["span"] = "span",
 719        input: Optional[Any] = None,
 720        output: Optional[Any] = None,
 721        metadata: Optional[Any] = None,
 722        version: Optional[str] = None,
 723        level: Optional[SpanLevel] = None,
 724        status_message: Optional[str] = None,
 725        end_on_exit: Optional[bool] = None,
 726    ) -> _AgnosticContextManager[LangfuseSpan]: ...
 727
 728    @overload
 729    def start_as_current_observation(
 730        self,
 731        *,
 732        trace_context: Optional[TraceContext] = None,
 733        name: str,
 734        as_type: Literal["agent"],
 735        input: Optional[Any] = None,
 736        output: Optional[Any] = None,
 737        metadata: Optional[Any] = None,
 738        version: Optional[str] = None,
 739        level: Optional[SpanLevel] = None,
 740        status_message: Optional[str] = None,
 741        end_on_exit: Optional[bool] = None,
 742    ) -> _AgnosticContextManager[LangfuseAgent]: ...
 743
 744    @overload
 745    def start_as_current_observation(
 746        self,
 747        *,
 748        trace_context: Optional[TraceContext] = None,
 749        name: str,
 750        as_type: Literal["tool"],
 751        input: Optional[Any] = None,
 752        output: Optional[Any] = None,
 753        metadata: Optional[Any] = None,
 754        version: Optional[str] = None,
 755        level: Optional[SpanLevel] = None,
 756        status_message: Optional[str] = None,
 757        end_on_exit: Optional[bool] = None,
 758    ) -> _AgnosticContextManager[LangfuseTool]: ...
 759
    # Typing-only overload: with as_type="chain" the returned context manager
    # is parameterized by LangfuseChain.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseChain]: ...
 775
    # Typing-only overload: with as_type="retriever" the returned context
    # manager is parameterized by LangfuseRetriever.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseRetriever]: ...
 791
    # Typing-only overload: with as_type="evaluator" the returned context
    # manager is parameterized by LangfuseEvaluator.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEvaluator]: ...
 807
    # Typing-only overload: with as_type="embedding" the returned context
    # manager is parameterized by LangfuseEmbedding. Unlike the span-like
    # overloads, this signature also accepts the model-related parameters
    # (completion_start_time, model, model_parameters, usage_details,
    # cost_details, prompt).
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEmbedding]: ...
 829
    # Typing-only overload: with as_type="guardrail" the returned context
    # manager is parameterized by LangfuseGuardrail.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGuardrail]: ...
 845
 846    def start_as_current_observation(
 847        self,
 848        *,
 849        trace_context: Optional[TraceContext] = None,
 850        name: str,
 851        as_type: ObservationTypeLiteralNoEvent = "span",
 852        input: Optional[Any] = None,
 853        output: Optional[Any] = None,
 854        metadata: Optional[Any] = None,
 855        version: Optional[str] = None,
 856        level: Optional[SpanLevel] = None,
 857        status_message: Optional[str] = None,
 858        completion_start_time: Optional[datetime] = None,
 859        model: Optional[str] = None,
 860        model_parameters: Optional[Dict[str, MapValue]] = None,
 861        usage_details: Optional[Dict[str, int]] = None,
 862        cost_details: Optional[Dict[str, float]] = None,
 863        prompt: Optional[PromptClient] = None,
 864        end_on_exit: Optional[bool] = None,
 865    ) -> Union[
 866        _AgnosticContextManager[LangfuseGeneration],
 867        _AgnosticContextManager[LangfuseSpan],
 868        _AgnosticContextManager[LangfuseAgent],
 869        _AgnosticContextManager[LangfuseTool],
 870        _AgnosticContextManager[LangfuseChain],
 871        _AgnosticContextManager[LangfuseRetriever],
 872        _AgnosticContextManager[LangfuseEvaluator],
 873        _AgnosticContextManager[LangfuseEmbedding],
 874        _AgnosticContextManager[LangfuseGuardrail],
 875    ]:
 876        """Create a new observation and set it as the current span in a context manager.
 877
 878        This method creates a new observation of the specified type and sets it as the
 879        current span within a context manager. Use this method with a 'with' statement to
 880        automatically handle the observation lifecycle within a code block.
 881
 882        The created observation will be the child of the current span in the context.
 883
 884        Args:
 885            trace_context: Optional context for connecting to an existing trace
 886            name: Name of the observation (e.g., function or operation name)
 887            as_type: Type of observation to create (defaults to "span")
 888            input: Input data for the operation (can be any JSON-serializable object)
 889            output: Output data from the operation (can be any JSON-serializable object)
 890            metadata: Additional metadata to associate with the observation
 891            version: Version identifier for the code or component
 892            level: Importance level of the observation (info, warning, error)
 893            status_message: Optional status message for the observation
 894            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 895
 896            The following parameters are available when as_type is: "generation" or "embedding".
 897            completion_start_time: When the model started generating the response
 898            model: Name/identifier of the AI model used (e.g., "gpt-4")
 899            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 900            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 901            cost_details: Cost information for the model call
 902            prompt: Associated prompt template from Langfuse prompt management
 903
 904        Returns:
 905            A context manager that yields the appropriate observation type based on as_type
 906
 907        Example:
 908            ```python
 909            # Create a span
 910            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
 911                # Do work
 912                result = process_data()
 913                span.update(output=result)
 914
 915                # Create a child span automatically
 916                with span.start_as_current_observation(name="sub-operation") as child_span:
 917                    # Do sub-operation work
 918                    child_span.update(output="sub-result")
 919
 920            # Create a tool observation
 921            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
 922                # Do tool work
 923                results = search_web(query)
 924                tool.update(output=results)
 925
 926            # Create a generation observation
 927            with langfuse.start_as_current_observation(
 928                name="answer-generation",
 929                as_type="generation",
 930                model="gpt-4"
 931            ) as generation:
 932                # Generate answer
 933                response = llm.generate(...)
 934                generation.update(output=response)
 935            ```
 936        """
 937        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 938            if trace_context:
 939                trace_id = trace_context.get("trace_id", None)
 940                parent_span_id = trace_context.get("parent_span_id", None)
 941
 942                if trace_id:
 943                    remote_parent_span = self._create_remote_parent_span(
 944                        trace_id=trace_id, parent_span_id=parent_span_id
 945                    )
 946
 947                    return cast(
 948                        Union[
 949                            _AgnosticContextManager[LangfuseGeneration],
 950                            _AgnosticContextManager[LangfuseEmbedding],
 951                        ],
 952                        self._create_span_with_parent_context(
 953                            as_type=as_type,
 954                            name=name,
 955                            remote_parent_span=remote_parent_span,
 956                            parent=None,
 957                            end_on_exit=end_on_exit,
 958                            input=input,
 959                            output=output,
 960                            metadata=metadata,
 961                            version=version,
 962                            level=level,
 963                            status_message=status_message,
 964                            completion_start_time=completion_start_time,
 965                            model=model,
 966                            model_parameters=model_parameters,
 967                            usage_details=usage_details,
 968                            cost_details=cost_details,
 969                            prompt=prompt,
 970                        ),
 971                    )
 972
 973            return cast(
 974                Union[
 975                    _AgnosticContextManager[LangfuseGeneration],
 976                    _AgnosticContextManager[LangfuseEmbedding],
 977                ],
 978                self._start_as_current_otel_span_with_processed_media(
 979                    as_type=as_type,
 980                    name=name,
 981                    end_on_exit=end_on_exit,
 982                    input=input,
 983                    output=output,
 984                    metadata=metadata,
 985                    version=version,
 986                    level=level,
 987                    status_message=status_message,
 988                    completion_start_time=completion_start_time,
 989                    model=model,
 990                    model_parameters=model_parameters,
 991                    usage_details=usage_details,
 992                    cost_details=cost_details,
 993                    prompt=prompt,
 994                ),
 995            )
 996
 997        if as_type in get_observation_types_list(ObservationTypeSpanLike):
 998            if trace_context:
 999                trace_id = trace_context.get("trace_id", None)
1000                parent_span_id = trace_context.get("parent_span_id", None)
1001
1002                if trace_id:
1003                    remote_parent_span = self._create_remote_parent_span(
1004                        trace_id=trace_id, parent_span_id=parent_span_id
1005                    )
1006
1007                    return cast(
1008                        Union[
1009                            _AgnosticContextManager[LangfuseSpan],
1010                            _AgnosticContextManager[LangfuseAgent],
1011                            _AgnosticContextManager[LangfuseTool],
1012                            _AgnosticContextManager[LangfuseChain],
1013                            _AgnosticContextManager[LangfuseRetriever],
1014                            _AgnosticContextManager[LangfuseEvaluator],
1015                            _AgnosticContextManager[LangfuseGuardrail],
1016                        ],
1017                        self._create_span_with_parent_context(
1018                            as_type=as_type,
1019                            name=name,
1020                            remote_parent_span=remote_parent_span,
1021                            parent=None,
1022                            end_on_exit=end_on_exit,
1023                            input=input,
1024                            output=output,
1025                            metadata=metadata,
1026                            version=version,
1027                            level=level,
1028                            status_message=status_message,
1029                        ),
1030                    )
1031
1032            return cast(
1033                Union[
1034                    _AgnosticContextManager[LangfuseSpan],
1035                    _AgnosticContextManager[LangfuseAgent],
1036                    _AgnosticContextManager[LangfuseTool],
1037                    _AgnosticContextManager[LangfuseChain],
1038                    _AgnosticContextManager[LangfuseRetriever],
1039                    _AgnosticContextManager[LangfuseEvaluator],
1040                    _AgnosticContextManager[LangfuseGuardrail],
1041                ],
1042                self._start_as_current_otel_span_with_processed_media(
1043                    as_type=as_type,
1044                    name=name,
1045                    end_on_exit=end_on_exit,
1046                    input=input,
1047                    output=output,
1048                    metadata=metadata,
1049                    version=version,
1050                    level=level,
1051                    status_message=status_message,
1052                ),
1053            )
1054
1055        # This should never be reached since all valid types are handled above
1056        langfuse_logger.warning(
1057            f"Unknown observation type: {as_type}, falling back to span"
1058        )
1059        return self._start_as_current_otel_span_with_processed_media(
1060            as_type="span",
1061            name=name,
1062            end_on_exit=end_on_exit,
1063            input=input,
1064            output=output,
1065            metadata=metadata,
1066            version=version,
1067            level=level,
1068            status_message=status_message,
1069        )
1070
1071    def _get_span_class(
1072        self,
1073        as_type: ObservationTypeLiteral,
1074    ) -> Union[
1075        Type[LangfuseAgent],
1076        Type[LangfuseTool],
1077        Type[LangfuseChain],
1078        Type[LangfuseRetriever],
1079        Type[LangfuseEvaluator],
1080        Type[LangfuseEmbedding],
1081        Type[LangfuseGuardrail],
1082        Type[LangfuseGeneration],
1083        Type[LangfuseEvent],
1084        Type[LangfuseSpan],
1085    ]:
1086        """Get the appropriate span class based on as_type."""
1087        normalized_type = as_type.lower()
1088
1089        if normalized_type == "agent":
1090            return LangfuseAgent
1091        elif normalized_type == "tool":
1092            return LangfuseTool
1093        elif normalized_type == "chain":
1094            return LangfuseChain
1095        elif normalized_type == "retriever":
1096            return LangfuseRetriever
1097        elif normalized_type == "evaluator":
1098            return LangfuseEvaluator
1099        elif normalized_type == "embedding":
1100            return LangfuseEmbedding
1101        elif normalized_type == "guardrail":
1102            return LangfuseGuardrail
1103        elif normalized_type == "generation":
1104            return LangfuseGeneration
1105        elif normalized_type == "event":
1106            return LangfuseEvent
1107        elif normalized_type == "span":
1108            return LangfuseSpan
1109        else:
1110            return LangfuseSpan
1111
1112    @_agnosticcontextmanager
1113    def _create_span_with_parent_context(
1114        self,
1115        *,
1116        name: str,
1117        parent: Optional[otel_trace_api.Span] = None,
1118        remote_parent_span: Optional[otel_trace_api.Span] = None,
1119        as_type: ObservationTypeLiteralNoEvent,
1120        end_on_exit: Optional[bool] = None,
1121        input: Optional[Any] = None,
1122        output: Optional[Any] = None,
1123        metadata: Optional[Any] = None,
1124        version: Optional[str] = None,
1125        level: Optional[SpanLevel] = None,
1126        status_message: Optional[str] = None,
1127        completion_start_time: Optional[datetime] = None,
1128        model: Optional[str] = None,
1129        model_parameters: Optional[Dict[str, MapValue]] = None,
1130        usage_details: Optional[Dict[str, int]] = None,
1131        cost_details: Optional[Dict[str, float]] = None,
1132        prompt: Optional[PromptClient] = None,
1133    ) -> Any:
1134        parent_span = parent or cast(otel_trace_api.Span, remote_parent_span)
1135
1136        with otel_trace_api.use_span(parent_span):
1137            with self._start_as_current_otel_span_with_processed_media(
1138                name=name,
1139                as_type=as_type,
1140                end_on_exit=end_on_exit,
1141                input=input,
1142                output=output,
1143                metadata=metadata,
1144                version=version,
1145                level=level,
1146                status_message=status_message,
1147                completion_start_time=completion_start_time,
1148                model=model,
1149                model_parameters=model_parameters,
1150                usage_details=usage_details,
1151                cost_details=cost_details,
1152                prompt=prompt,
1153            ) as langfuse_span:
1154                if remote_parent_span is not None:
1155                    langfuse_span._otel_span.set_attribute(
1156                        LangfuseOtelSpanAttributes.AS_ROOT, True
1157                    )
1158
1159                yield langfuse_span
1160
1161    @_agnosticcontextmanager
1162    def _start_as_current_otel_span_with_processed_media(
1163        self,
1164        *,
1165        name: str,
1166        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
1167        end_on_exit: Optional[bool] = None,
1168        input: Optional[Any] = None,
1169        output: Optional[Any] = None,
1170        metadata: Optional[Any] = None,
1171        version: Optional[str] = None,
1172        level: Optional[SpanLevel] = None,
1173        status_message: Optional[str] = None,
1174        completion_start_time: Optional[datetime] = None,
1175        model: Optional[str] = None,
1176        model_parameters: Optional[Dict[str, MapValue]] = None,
1177        usage_details: Optional[Dict[str, int]] = None,
1178        cost_details: Optional[Dict[str, float]] = None,
1179        prompt: Optional[PromptClient] = None,
1180    ) -> Any:
1181        with self._otel_tracer.start_as_current_span(
1182            name=name,
1183            end_on_exit=end_on_exit if end_on_exit is not None else True,
1184        ) as otel_span:
1185            baggage_token = None
1186
1187            if otel_span.is_recording():
1188                context_with_app_root_claim = _set_langfuse_trace_id_in_baggage(
1189                    trace_id=self._get_otel_trace_id(otel_span),
1190                    context=otel_context_api.get_current(),
1191                )
1192                baggage_token = otel_context_api.attach(context_with_app_root_claim)
1193
1194            span_class = self._get_span_class(
1195                as_type or "generation"
1196            )  # default was "generation"
1197
1198            try:
1199                common_args = {
1200                    "otel_span": otel_span,
1201                    "langfuse_client": self,
1202                    "environment": self._environment,
1203                    "release": self._release,
1204                    "input": input,
1205                    "output": output,
1206                    "metadata": metadata,
1207                    "version": version,
1208                    "level": level,
1209                    "status_message": status_message,
1210                }
1211
1212                if span_class in [
1213                    LangfuseGeneration,
1214                    LangfuseEmbedding,
1215                ]:
1216                    common_args.update(
1217                        {
1218                            "completion_start_time": completion_start_time,
1219                            "model": model,
1220                            "model_parameters": model_parameters,
1221                            "usage_details": usage_details,
1222                            "cost_details": cost_details,
1223                            "prompt": prompt,
1224                        }
1225                    )
1226                # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed
1227
1228                yield span_class(**common_args)  # type: ignore[arg-type]
1229
1230            finally:
1231                if baggage_token is not None:
1232                    _detach_context_token_safely(baggage_token)
1233
1234    def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]:
1235        current_span = otel_trace_api.get_current_span()
1236
1237        if current_span is otel_trace_api.INVALID_SPAN:
1238            langfuse_logger.warning(
1239                "Context error: No active span in current context. Operations that depend on an active span will be skipped. "
1240                "Ensure spans are created with start_as_current_observation() or that you're operating within an active span context."
1241            )
1242            return None
1243
1244        return current_span
1245
1246    def update_current_generation(
1247        self,
1248        *,
1249        name: Optional[str] = None,
1250        input: Optional[Any] = None,
1251        output: Optional[Any] = None,
1252        metadata: Optional[Any] = None,
1253        version: Optional[str] = None,
1254        level: Optional[SpanLevel] = None,
1255        status_message: Optional[str] = None,
1256        completion_start_time: Optional[datetime] = None,
1257        model: Optional[str] = None,
1258        model_parameters: Optional[Dict[str, MapValue]] = None,
1259        usage_details: Optional[Dict[str, int]] = None,
1260        cost_details: Optional[Dict[str, float]] = None,
1261        prompt: Optional[PromptClient] = None,
1262    ) -> None:
1263        """Update the current active generation span with new information.
1264
1265        This method updates the current generation span in the active context with
1266        additional information. It's useful for adding output, usage stats, or other
1267        details that become available during or after model generation.
1268
1269        Args:
1270            name: The generation name
1271            input: Updated input data for the model
1272            output: Output from the model (e.g., completions)
1273            metadata: Additional metadata to associate with the generation
1274            version: Version identifier for the model or component
1275            level: Importance level of the generation (info, warning, error)
1276            status_message: Optional status message for the generation
1277            completion_start_time: When the model started generating the response
1278            model: Name/identifier of the AI model used (e.g., "gpt-4")
1279            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1280            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1281            cost_details: Cost information for the model call
1282            prompt: Associated prompt template from Langfuse prompt management
1283
1284        Example:
1285            ```python
1286            with langfuse.start_as_current_generation(name="answer-query") as generation:
1287                # Initial setup and API call
1288                response = llm.generate(...)
1289
1290                # Update with results that weren't available at creation time
1291                langfuse.update_current_generation(
1292                    output=response.text,
1293                    usage_details={
1294                        "prompt_tokens": response.usage.prompt_tokens,
1295                        "completion_tokens": response.usage.completion_tokens
1296                    }
1297                )
1298            ```
1299        """
1300        if not self._tracing_enabled:
1301            langfuse_logger.debug(
1302                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1303            )
1304            return
1305
1306        current_otel_span = self._get_current_otel_span()
1307
1308        if current_otel_span is not None:
1309            generation = LangfuseGeneration(
1310                otel_span=current_otel_span, langfuse_client=self
1311            )
1312
1313            if name:
1314                current_otel_span.update_name(name)
1315
1316            generation.update(
1317                input=input,
1318                output=output,
1319                metadata=metadata,
1320                version=version,
1321                level=level,
1322                status_message=status_message,
1323                completion_start_time=completion_start_time,
1324                model=model,
1325                model_parameters=model_parameters,
1326                usage_details=usage_details,
1327                cost_details=cost_details,
1328                prompt=prompt,
1329            )
1330
1331    def update_current_span(
1332        self,
1333        *,
1334        name: Optional[str] = None,
1335        input: Optional[Any] = None,
1336        output: Optional[Any] = None,
1337        metadata: Optional[Any] = None,
1338        version: Optional[str] = None,
1339        level: Optional[SpanLevel] = None,
1340        status_message: Optional[str] = None,
1341    ) -> None:
1342        """Update the current active span with new information.
1343
1344        This method updates the current span in the active context with
1345        additional information. It's useful for adding outputs or metadata
1346        that become available during execution.
1347
1348        Args:
1349            name: The span name
1350            input: Updated input data for the operation
1351            output: Output data from the operation
1352            metadata: Additional metadata to associate with the span
1353            version: Version identifier for the code or component
1354            level: Importance level of the span (info, warning, error)
1355            status_message: Optional status message for the span
1356
1357        Example:
1358            ```python
1359            with langfuse.start_as_current_observation(name="process-data") as span:
1360                # Initial processing
1361                result = process_first_part()
1362
1363                # Update with intermediate results
1364                langfuse.update_current_span(metadata={"intermediate_result": result})
1365
1366                # Continue processing
1367                final_result = process_second_part(result)
1368
1369                # Final update
1370                langfuse.update_current_span(output=final_result)
1371            ```
1372        """
1373        if not self._tracing_enabled:
1374            langfuse_logger.debug(
1375                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1376            )
1377            return
1378
1379        current_otel_span = self._get_current_otel_span()
1380
1381        if current_otel_span is not None:
1382            span = LangfuseSpan(
1383                otel_span=current_otel_span,
1384                langfuse_client=self,
1385                environment=self._environment,
1386                release=self._release,
1387            )
1388
1389            if name:
1390                current_otel_span.update_name(name)
1391
1392            span.update(
1393                input=input,
1394                output=output,
1395                metadata=metadata,
1396                version=version,
1397                level=level,
1398                status_message=status_message,
1399            )
1400
1401    @deprecated(
1402        "Trace-level input/output is deprecated. "
1403        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
1404        "This method will be removed in a future major version."
1405    )
1406    def set_current_trace_io(
1407        self,
1408        *,
1409        input: Optional[Any] = None,
1410        output: Optional[Any] = None,
1411    ) -> None:
1412        """Set trace-level input and output for the current span's trace.
1413
1414        .. deprecated::
1415            This is a legacy method for backward compatibility with Langfuse platform
1416            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
1417            evaluators). It will be removed in a future major version.
1418
1419            For setting other trace attributes (user_id, session_id, metadata, tags, version),
1420            use :meth:`propagate_attributes` instead.
1421
1422        Args:
1423            input: Input data to associate with the trace.
1424            output: Output data to associate with the trace.
1425        """
1426        if not self._tracing_enabled:
1427            langfuse_logger.debug(
1428                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
1429            )
1430            return
1431
1432        current_otel_span = self._get_current_otel_span()
1433
1434        if current_otel_span is not None and current_otel_span.is_recording():
1435            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1436                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1437            )
1438            # We need to preserve the class to keep the correct observation type
1439            span_class = self._get_span_class(existing_observation_type)
1440            span = span_class(
1441                otel_span=current_otel_span,
1442                langfuse_client=self,
1443                environment=self._environment,
1444                release=self._release,
1445            )
1446
1447            span.set_trace_io(
1448                input=input,
1449                output=output,
1450            )
1451
1452    def set_current_trace_as_public(self) -> None:
1453        """Make the current trace publicly accessible via its URL.
1454
1455        When a trace is published, anyone with the trace link can view the full trace
1456        without needing to be logged in to Langfuse. This action cannot be undone
1457        programmatically - once published, the entire trace becomes public.
1458
1459        This is a convenience method that publishes the trace from the currently
1460        active span context. Use this when you want to make a trace public from
1461        within a traced function without needing direct access to the span object.
1462        """
1463        if not self._tracing_enabled:
1464            langfuse_logger.debug(
1465                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
1466            )
1467            return
1468
1469        current_otel_span = self._get_current_otel_span()
1470
1471        if current_otel_span is not None and current_otel_span.is_recording():
1472            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1473                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1474            )
1475            # We need to preserve the class to keep the correct observation type
1476            span_class = self._get_span_class(existing_observation_type)
1477            span = span_class(
1478                otel_span=current_otel_span,
1479                langfuse_client=self,
1480                environment=self._environment,
1481            )
1482
1483            span.set_trace_as_public()
1484
1485    def create_event(
1486        self,
1487        *,
1488        trace_context: Optional[TraceContext] = None,
1489        name: str,
1490        input: Optional[Any] = None,
1491        output: Optional[Any] = None,
1492        metadata: Optional[Any] = None,
1493        version: Optional[str] = None,
1494        level: Optional[SpanLevel] = None,
1495        status_message: Optional[str] = None,
1496    ) -> LangfuseEvent:
1497        """Create a new Langfuse observation of type 'EVENT'.
1498
1499        The created Langfuse Event observation will be the child of the current span in the context.
1500
1501        Args:
1502            trace_context: Optional context for connecting to an existing trace
1503            name: Name of the span (e.g., function or operation name)
1504            input: Input data for the operation (can be any JSON-serializable object)
1505            output: Output data from the operation (can be any JSON-serializable object)
1506            metadata: Additional metadata to associate with the span
1507            version: Version identifier for the code or component
1508            level: Importance level of the span (info, warning, error)
1509            status_message: Optional status message for the span
1510
1511        Returns:
1512            The Langfuse Event object
1513
1514        Example:
1515            ```python
1516            event = langfuse.create_event(name="process-event")
1517            ```
1518        """
1519        timestamp = time_ns()
1520
1521        if trace_context:
1522            trace_id = trace_context.get("trace_id", None)
1523            parent_span_id = trace_context.get("parent_span_id", None)
1524
1525            if trace_id:
1526                remote_parent_span = self._create_remote_parent_span(
1527                    trace_id=trace_id, parent_span_id=parent_span_id
1528                )
1529
1530                with otel_trace_api.use_span(
1531                    cast(otel_trace_api.Span, remote_parent_span)
1532                ):
1533                    otel_span = self._otel_tracer.start_span(
1534                        name=name, start_time=timestamp
1535                    )
1536                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1537
1538                    return cast(
1539                        LangfuseEvent,
1540                        LangfuseEvent(
1541                            otel_span=otel_span,
1542                            langfuse_client=self,
1543                            environment=self._environment,
1544                            release=self._release,
1545                            input=input,
1546                            output=output,
1547                            metadata=metadata,
1548                            version=version,
1549                            level=level,
1550                            status_message=status_message,
1551                        ).end(end_time=timestamp),
1552                    )
1553
1554        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1555
1556        return cast(
1557            LangfuseEvent,
1558            LangfuseEvent(
1559                otel_span=otel_span,
1560                langfuse_client=self,
1561                environment=self._environment,
1562                release=self._release,
1563                input=input,
1564                output=output,
1565                metadata=metadata,
1566                version=version,
1567                level=level,
1568                status_message=status_message,
1569            ).end(end_time=timestamp),
1570        )
1571
1572    def _create_remote_parent_span(
1573        self, *, trace_id: str, parent_span_id: Optional[str]
1574    ) -> Any:
1575        if not self._is_valid_trace_id(trace_id):
1576            langfuse_logger.warning(
1577                f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID."
1578            )
1579
1580        if parent_span_id and not self._is_valid_span_id(parent_span_id):
1581            langfuse_logger.warning(
1582                f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID."
1583            )
1584
1585        int_trace_id = int(trace_id, 16)
1586        int_parent_span_id = (
1587            int(parent_span_id, 16)
1588            if parent_span_id
1589            else RandomIdGenerator().generate_span_id()
1590        )
1591
1592        span_context = otel_trace_api.SpanContext(
1593            trace_id=int_trace_id,
1594            span_id=int_parent_span_id,
1595            trace_flags=otel_trace_api.TraceFlags(0x01),  # mark span as sampled
1596            is_remote=False,
1597        )
1598
1599        return otel_trace_api.NonRecordingSpan(span_context)
1600
1601    def _is_valid_trace_id(self, trace_id: str) -> bool:
1602        pattern = r"^[0-9a-f]{32}$"
1603
1604        return bool(re.match(pattern, trace_id))
1605
1606    def _is_valid_span_id(self, span_id: str) -> bool:
1607        pattern = r"^[0-9a-f]{16}$"
1608
1609        return bool(re.match(pattern, span_id))
1610
1611    def _create_observation_id(self, *, seed: Optional[str] = None) -> str:
1612        """Create a unique observation ID for use with Langfuse.
1613
1614        This method generates a unique observation ID (span ID in OpenTelemetry terms)
1615        for use with various Langfuse APIs. It can either generate a random ID or
1616        create a deterministic ID based on a seed string.
1617
1618        Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes.
1619        This method ensures the generated ID meets this requirement. If you need to
1620        correlate an external ID with a Langfuse observation ID, use the external ID as
1621        the seed to get a valid, deterministic observation ID.
1622
1623        Args:
1624            seed: Optional string to use as a seed for deterministic ID generation.
1625                 If provided, the same seed will always produce the same ID.
1626                 If not provided, a random ID will be generated.
1627
1628        Returns:
1629            A 16-character lowercase hexadecimal string representing the observation ID.
1630
1631        Example:
1632            ```python
1633            # Generate a random observation ID
1634            obs_id = langfuse.create_observation_id()
1635
1636            # Generate a deterministic ID based on a seed
1637            user_obs_id = langfuse.create_observation_id(seed="user-123-feedback")
1638
1639            # Correlate an external item ID with a Langfuse observation ID
1640            item_id = "item-789012"
1641            correlated_obs_id = langfuse.create_observation_id(seed=item_id)
1642
1643            # Use the ID with Langfuse APIs
1644            langfuse.create_score(
1645                name="relevance",
1646                value=0.95,
1647                trace_id=trace_id,
1648                observation_id=obs_id
1649            )
1650            ```
1651        """
1652        if not seed:
1653            span_id_int = RandomIdGenerator().generate_span_id()
1654
1655            return self._format_otel_span_id(span_id_int)
1656
1657        return sha256(seed.encode("utf-8")).digest()[:8].hex()
1658
1659    @staticmethod
1660    def create_trace_id(*, seed: Optional[str] = None) -> str:
1661        """Create a unique trace ID for use with Langfuse.
1662
1663        This method generates a unique trace ID for use with various Langfuse APIs.
1664        It can either generate a random ID or create a deterministic ID based on
1665        a seed string.
1666
1667        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1668        This method ensures the generated ID meets this requirement. If you need to
1669        correlate an external ID with a Langfuse trace ID, use the external ID as the
1670        seed to get a valid, deterministic Langfuse trace ID.
1671
1672        Args:
1673            seed: Optional string to use as a seed for deterministic ID generation.
1674                 If provided, the same seed will always produce the same ID.
1675                 If not provided, a random ID will be generated.
1676
1677        Returns:
1678            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1679
1680        Example:
1681            ```python
1682            # Generate a random trace ID
1683            trace_id = langfuse.create_trace_id()
1684
1685            # Generate a deterministic ID based on a seed
1686            session_trace_id = langfuse.create_trace_id(seed="session-456")
1687
1688            # Correlate an external ID with a Langfuse trace ID
1689            external_id = "external-system-123456"
1690            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1691
1692            # Use the ID with trace context
1693            with langfuse.start_as_current_observation(
1694                name="process-request",
1695                trace_context={"trace_id": trace_id}
1696            ) as span:
1697                # Operation will be part of the specific trace
1698                pass
1699            ```
1700        """
1701        if not seed:
1702            trace_id_int = RandomIdGenerator().generate_trace_id()
1703
1704            return Langfuse._format_otel_trace_id(trace_id_int)
1705
1706        return sha256(seed.encode("utf-8")).digest()[:16].hex()
1707
1708    def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str:
1709        span_context = otel_span.get_span_context()
1710
1711        return self._format_otel_trace_id(span_context.trace_id)
1712
1713    def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str:
1714        span_context = otel_span.get_span_context()
1715
1716        return self._format_otel_span_id(span_context.span_id)
1717
1718    @staticmethod
1719    def _format_otel_span_id(span_id_int: int) -> str:
1720        """Format an integer span ID to a 16-character lowercase hex string.
1721
1722        Internal method to convert an OpenTelemetry integer span ID to the standard
1723        W3C Trace Context format (16-character lowercase hex string).
1724
1725        Args:
1726            span_id_int: 64-bit integer representing a span ID
1727
1728        Returns:
1729            A 16-character lowercase hexadecimal string
1730        """
1731        return format(span_id_int, "016x")
1732
1733    @staticmethod
1734    def _format_otel_trace_id(trace_id_int: int) -> str:
1735        """Format an integer trace ID to a 32-character lowercase hex string.
1736
1737        Internal method to convert an OpenTelemetry integer trace ID to the standard
1738        W3C Trace Context format (32-character lowercase hex string).
1739
1740        Args:
1741            trace_id_int: 128-bit integer representing a trace ID
1742
1743        Returns:
1744            A 32-character lowercase hexadecimal string
1745        """
1746        return format(trace_id_int, "032x")
1747
1748    @overload
1749    def create_score(
1750        self,
1751        *,
1752        name: str,
1753        value: float,
1754        session_id: Optional[str] = None,
1755        dataset_run_id: Optional[str] = None,
1756        trace_id: Optional[str] = None,
1757        observation_id: Optional[str] = None,
1758        score_id: Optional[str] = None,
1759        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1760        comment: Optional[str] = None,
1761        config_id: Optional[str] = None,
1762        metadata: Optional[Any] = None,
1763        timestamp: Optional[datetime] = None,
1764    ) -> None: ...
1765
1766    @overload
1767    def create_score(
1768        self,
1769        *,
1770        name: str,
1771        value: str,
1772        session_id: Optional[str] = None,
1773        dataset_run_id: Optional[str] = None,
1774        trace_id: Optional[str] = None,
1775        score_id: Optional[str] = None,
1776        observation_id: Optional[str] = None,
1777        data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
1778        comment: Optional[str] = None,
1779        config_id: Optional[str] = None,
1780        metadata: Optional[Any] = None,
1781        timestamp: Optional[datetime] = None,
1782    ) -> None: ...
1783
1784    def create_score(
1785        self,
1786        *,
1787        name: str,
1788        value: Union[float, str],
1789        session_id: Optional[str] = None,
1790        dataset_run_id: Optional[str] = None,
1791        trace_id: Optional[str] = None,
1792        observation_id: Optional[str] = None,
1793        score_id: Optional[str] = None,
1794        data_type: Optional[ScoreDataType] = None,
1795        comment: Optional[str] = None,
1796        config_id: Optional[str] = None,
1797        metadata: Optional[Any] = None,
1798        timestamp: Optional[datetime] = None,
1799    ) -> None:
1800        """Create a score for a specific trace or observation.
1801
1802        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
1803        used to track quality metrics, user feedback, or automated evaluations.
1804
1805        Args:
1806            name: Name of the score (e.g., "relevance", "accuracy")
1807            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
1808            session_id: ID of the Langfuse session to associate the score with
1809            dataset_run_id: ID of the Langfuse dataset run to associate the score with
1810            trace_id: ID of the Langfuse trace to associate the score with
1811            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
1812            score_id: Optional custom ID for the score (auto-generated if not provided)
1813            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
1814            comment: Optional comment or explanation for the score
1815            config_id: Optional ID of a score config defined in Langfuse
1816            metadata: Optional metadata to be attached to the score
1817            timestamp: Optional timestamp for the score (defaults to current UTC time)
1818
1819        Example:
1820            ```python
1821            # Create a numeric score for accuracy
1822            langfuse.create_score(
1823                name="accuracy",
1824                value=0.92,
1825                trace_id="abcdef1234567890abcdef1234567890",
1826                data_type="NUMERIC",
1827                comment="High accuracy with minor irrelevant details"
1828            )
1829
1830            # Create a categorical score for sentiment
1831            langfuse.create_score(
1832                name="sentiment",
1833                value="positive",
1834                trace_id="abcdef1234567890abcdef1234567890",
1835                observation_id="abcdef1234567890",
1836                data_type="CATEGORICAL"
1837            )
1838            ```
1839        """
1840        if not self._tracing_enabled:
1841            return
1842
1843        score_id = score_id or self._create_observation_id()
1844
1845        try:
1846            new_body = ScoreBody(
1847                id=score_id,
1848                sessionId=session_id,
1849                datasetRunId=dataset_run_id,
1850                traceId=trace_id,
1851                observationId=observation_id,
1852                name=name,
1853                value=value,
1854                dataType=data_type,  # type: ignore
1855                comment=comment,
1856                configId=config_id,
1857                environment=self._environment,
1858                metadata=metadata,
1859            )
1860
1861            event = {
1862                "id": self.create_trace_id(),
1863                "type": "score-create",
1864                "timestamp": timestamp or _get_timestamp(),
1865                "body": new_body,
1866            }
1867
1868            if self._resources is not None:
1869                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
1870                force_sample = (
1871                    not self._is_valid_trace_id(trace_id) if trace_id else True
1872                )
1873
1874                self._resources.add_score_task(
1875                    event,
1876                    force_sample=force_sample,
1877                )
1878
1879        except Exception as e:
1880            langfuse_logger.exception(
1881                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
1882            )
1883
1884    def _create_trace_tags_via_ingestion(
1885        self,
1886        *,
1887        trace_id: str,
1888        tags: List[str],
1889    ) -> None:
1890        """Private helper to enqueue trace tag updates via ingestion API events."""
1891        if not self._tracing_enabled:
1892            return
1893
1894        if len(tags) == 0:
1895            return
1896
1897        try:
1898            new_body = TraceBody(
1899                id=trace_id,
1900                tags=tags,
1901            )
1902
1903            event = {
1904                "id": self.create_trace_id(),
1905                "type": "trace-create",
1906                "timestamp": _get_timestamp(),
1907                "body": new_body,
1908            }
1909
1910            if self._resources is not None:
1911                self._resources.add_trace_task(event)
1912        except Exception as e:
1913            langfuse_logger.exception(
1914                f"Error updating trace tags: Failed to process trace update event for trace_id={trace_id}. Error: {e}"
1915            )
1916
1917    @overload
1918    def score_current_span(
1919        self,
1920        *,
1921        name: str,
1922        value: float,
1923        score_id: Optional[str] = None,
1924        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1925        comment: Optional[str] = None,
1926        config_id: Optional[str] = None,
1927        metadata: Optional[Any] = None,
1928    ) -> None: ...
1929
1930    @overload
1931    def score_current_span(
1932        self,
1933        *,
1934        name: str,
1935        value: str,
1936        score_id: Optional[str] = None,
1937        data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
1938        comment: Optional[str] = None,
1939        config_id: Optional[str] = None,
1940        metadata: Optional[Any] = None,
1941    ) -> None: ...
1942
1943    def score_current_span(
1944        self,
1945        *,
1946        name: str,
1947        value: Union[float, str],
1948        score_id: Optional[str] = None,
1949        data_type: Optional[ScoreDataType] = None,
1950        comment: Optional[str] = None,
1951        config_id: Optional[str] = None,
1952        metadata: Optional[Any] = None,
1953    ) -> None:
1954        """Create a score for the current active span.
1955
1956        This method scores the currently active span in the context. It's a convenient
1957        way to score the current operation without needing to know its trace and span IDs.
1958
1959        Args:
1960            name: Name of the score (e.g., "relevance", "accuracy")
1961            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
1962            score_id: Optional custom ID for the score (auto-generated if not provided)
1963            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
1964            comment: Optional comment or explanation for the score
1965            config_id: Optional ID of a score config defined in Langfuse
1966            metadata: Optional metadata to be attached to the score
1967
1968        Example:
1969            ```python
1970            with langfuse.start_as_current_generation(name="answer-query") as generation:
1971                # Generate answer
1972                response = generate_answer(...)
1973                generation.update(output=response)
1974
1975                # Score the generation
1976                langfuse.score_current_span(
1977                    name="relevance",
1978                    value=0.85,
1979                    data_type="NUMERIC",
1980                    comment="Mostly relevant but contains some tangential information",
1981                    metadata={"model": "gpt-4", "prompt_version": "v2"}
1982                )
1983            ```
1984        """
1985        current_span = self._get_current_otel_span()
1986
1987        if current_span is not None:
1988            trace_id = self._get_otel_trace_id(current_span)
1989            observation_id = self._get_otel_span_id(current_span)
1990
1991            langfuse_logger.info(
1992                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
1993            )
1994
1995            self.create_score(
1996                trace_id=trace_id,
1997                observation_id=observation_id,
1998                name=name,
1999                value=cast(str, value),
2000                score_id=score_id,
2001                data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
2002                comment=comment,
2003                config_id=config_id,
2004                metadata=metadata,
2005            )
2006
2007    @overload
2008    def score_current_trace(
2009        self,
2010        *,
2011        name: str,
2012        value: float,
2013        score_id: Optional[str] = None,
2014        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
2015        comment: Optional[str] = None,
2016        config_id: Optional[str] = None,
2017        metadata: Optional[Any] = None,
2018    ) -> None: ...
2019
2020    @overload
2021    def score_current_trace(
2022        self,
2023        *,
2024        name: str,
2025        value: str,
2026        score_id: Optional[str] = None,
2027        data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
2028        comment: Optional[str] = None,
2029        config_id: Optional[str] = None,
2030        metadata: Optional[Any] = None,
2031    ) -> None: ...
2032
2033    def score_current_trace(
2034        self,
2035        *,
2036        name: str,
2037        value: Union[float, str],
2038        score_id: Optional[str] = None,
2039        data_type: Optional[ScoreDataType] = None,
2040        comment: Optional[str] = None,
2041        config_id: Optional[str] = None,
2042        metadata: Optional[Any] = None,
2043    ) -> None:
2044        """Create a score for the current trace.
2045
2046        This method scores the trace of the currently active span. Unlike score_current_span,
2047        this method associates the score with the entire trace rather than a specific span.
2048        It's useful for scoring overall performance or quality of the entire operation.
2049
2050        Args:
2051            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2052            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
2053            score_id: Optional custom ID for the score (auto-generated if not provided)
2054            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
2055            comment: Optional comment or explanation for the score
2056            config_id: Optional ID of a score config defined in Langfuse
2057            metadata: Optional metadata to be attached to the score
2058
2059        Example:
2060            ```python
2061            with langfuse.start_as_current_observation(name="process-user-request") as span:
2062                # Process request
2063                result = process_complete_request()
2064                span.update(output=result)
2065
2066                # Score the overall trace
2067                langfuse.score_current_trace(
2068                    name="overall_quality",
2069                    value=0.95,
2070                    data_type="NUMERIC",
2071                    comment="High quality end-to-end response",
2072                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2073                )
2074            ```
2075        """
2076        current_span = self._get_current_otel_span()
2077
2078        if current_span is not None:
2079            trace_id = self._get_otel_trace_id(current_span)
2080
2081            langfuse_logger.info(
2082                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2083            )
2084
2085            self.create_score(
2086                trace_id=trace_id,
2087                name=name,
2088                value=cast(str, value),
2089                score_id=score_id,
2090                data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
2091                comment=comment,
2092                config_id=config_id,
2093                metadata=metadata,
2094            )
2095
2096    def flush(self) -> None:
2097        """Force flush all pending spans and events to the Langfuse API.
2098
2099        This method manually flushes any pending spans, scores, and other events to the
2100        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2101        before proceeding, without waiting for the automatic flush interval.
2102
2103        Example:
2104            ```python
2105            # Record some spans and scores
2106            with langfuse.start_as_current_observation(name="operation") as span:
2107                # Do work...
2108                pass
2109
2110            # Ensure all data is sent to Langfuse before proceeding
2111            langfuse.flush()
2112
2113            # Continue with other work
2114            ```
2115        """
2116        if self._resources is not None:
2117            self._resources.flush()
2118
2119    def shutdown(self) -> None:
2120        """Shut down the Langfuse client and flush all pending data.
2121
2122        This method cleanly shuts down the Langfuse client, ensuring all pending data
2123        is flushed to the API and all background threads are properly terminated.
2124
2125        It's important to call this method when your application is shutting down to
2126        prevent data loss and resource leaks. For most applications, using the client
2127        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2128
2129        Example:
2130            ```python
2131            # Initialize Langfuse
2132            langfuse = Langfuse(public_key="...", secret_key="...")
2133
2134            # Use Langfuse throughout your application
2135            # ...
2136
2137            # When application is shutting down
2138            langfuse.shutdown()
2139            ```
2140        """
2141        if self._resources is not None:
2142            self._resources.shutdown()
2143
2144    def get_current_trace_id(self) -> Optional[str]:
2145        """Get the trace ID of the current active span.
2146
2147        This method retrieves the trace ID from the currently active span in the context.
2148        It can be used to get the trace ID for referencing in logs, external systems,
2149        or for creating related operations.
2150
2151        Returns:
2152            The current trace ID as a 32-character lowercase hexadecimal string,
2153            or None if there is no active span.
2154
2155        Example:
2156            ```python
2157            with langfuse.start_as_current_observation(name="process-request") as span:
2158                # Get the current trace ID for reference
2159                trace_id = langfuse.get_current_trace_id()
2160
2161                # Use it for external correlation
2162                log.info(f"Processing request with trace_id: {trace_id}")
2163
2164                # Or pass to another system
2165                external_system.process(data, trace_id=trace_id)
2166            ```
2167        """
2168        if not self._tracing_enabled:
2169            langfuse_logger.debug(
2170                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2171            )
2172            return None
2173
2174        current_otel_span = self._get_current_otel_span()
2175
2176        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None
2177
2178    def get_current_observation_id(self) -> Optional[str]:
2179        """Get the observation ID (span ID) of the current active span.
2180
2181        This method retrieves the observation ID from the currently active span in the context.
2182        It can be used to get the observation ID for referencing in logs, external systems,
2183        or for creating scores or other related operations.
2184
2185        Returns:
2186            The current observation ID as a 16-character lowercase hexadecimal string,
2187            or None if there is no active span.
2188
2189        Example:
2190            ```python
2191            with langfuse.start_as_current_observation(name="process-user-query") as span:
2192                # Get the current observation ID
2193                observation_id = langfuse.get_current_observation_id()
2194
2195                # Store it for later reference
2196                cache.set(f"query_{query_id}_observation", observation_id)
2197
2198                # Process the query...
2199            ```
2200        """
2201        if not self._tracing_enabled:
2202            langfuse_logger.debug(
2203                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2204            )
2205            return None
2206
2207        current_otel_span = self._get_current_otel_span()
2208
2209        return self._get_otel_span_id(current_otel_span) if current_otel_span else None
2210
2211    def _get_project_id(self) -> Optional[str]:
2212        """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for api keys."""
2213        if not self._project_id:
2214            proj = self.api.projects.get()
2215            if not proj.data or not proj.data[0].id:
2216                return None
2217
2218            self._project_id = proj.data[0].id
2219
2220        return self._project_id
2221
2222    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2223        """Get the URL to view a trace in the Langfuse UI.
2224
2225        This method generates a URL that links directly to a trace in the Langfuse UI.
2226        It's useful for providing links in logs, notifications, or debugging tools.
2227
2228        Args:
2229            trace_id: Optional trace ID to generate a URL for. If not provided,
2230                     the trace ID of the current active span will be used.
2231
2232        Returns:
2233            A URL string pointing to the trace in the Langfuse UI,
2234            or None if the project ID couldn't be retrieved or no trace ID is available.
2235
2236        Example:
2237            ```python
2238            # Get URL for the current trace
2239            with langfuse.start_as_current_observation(name="process-request") as span:
2240                trace_url = langfuse.get_trace_url()
2241                log.info(f"Processing trace: {trace_url}")
2242
2243            # Get URL for a specific trace
2244            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2245            send_notification(f"Review needed for trace: {specific_trace_url}")
2246            ```
2247        """
2248        final_trace_id = trace_id or self.get_current_trace_id()
2249        if not final_trace_id:
2250            return None
2251
2252        project_id = self._get_project_id()
2253
2254        return (
2255            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2256            if project_id and final_trace_id
2257            else None
2258        )
2259
2260    def get_dataset(
2261        self,
2262        name: str,
2263        *,
2264        fetch_items_page_size: Optional[int] = 50,
2265        version: Optional[datetime] = None,
2266    ) -> "DatasetClient":
2267        """Fetch a dataset by its name.
2268
2269        Args:
2270            name (str): The name of the dataset to fetch.
2271            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2272            version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC).
2273                If provided, returns the state of items at the specified UTC timestamp.
2274                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
2275
2276        Returns:
2277            DatasetClient: The dataset with the given name.
2278        """
2279        try:
2280            langfuse_logger.debug(f"Getting datasets {name}")
2281            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2282
2283            dataset_items = []
2284            page = 1
2285
2286            while True:
2287                new_items = self.api.dataset_items.list(
2288                    dataset_name=self._url_encode(name, is_url_param=True),
2289                    page=page,
2290                    limit=fetch_items_page_size,
2291                    version=version,
2292                )
2293                dataset_items.extend(new_items.data)
2294
2295                if new_items.meta.total_pages <= page:
2296                    break
2297
2298                page += 1
2299
2300            return DatasetClient(
2301                dataset=dataset,
2302                items=dataset_items,
2303                version=version,
2304                langfuse_client=self,
2305            )
2306
2307        except Error as e:
2308            handle_fern_exception(e)
2309            raise e
2310
2311    def get_dataset_run(
2312        self, *, dataset_name: str, run_name: str
2313    ) -> DatasetRunWithItems:
2314        """Fetch a dataset run by dataset name and run name.
2315
2316        Args:
2317            dataset_name (str): The name of the dataset.
2318            run_name (str): The name of the run.
2319
2320        Returns:
2321            DatasetRunWithItems: The dataset run with its items.
2322        """
2323        try:
2324            return cast(
2325                DatasetRunWithItems,
2326                self.api.datasets.get_run(
2327                    dataset_name=self._url_encode(dataset_name),
2328                    run_name=self._url_encode(run_name),
2329                    request_options=None,
2330                ),
2331            )
2332        except Error as e:
2333            handle_fern_exception(e)
2334            raise e
2335
2336    def get_dataset_runs(
2337        self,
2338        *,
2339        dataset_name: str,
2340        page: Optional[int] = None,
2341        limit: Optional[int] = None,
2342    ) -> PaginatedDatasetRuns:
2343        """Fetch all runs for a dataset.
2344
2345        Args:
2346            dataset_name (str): The name of the dataset.
2347            page (Optional[int]): Page number, starts at 1.
2348            limit (Optional[int]): Limit of items per page.
2349
2350        Returns:
2351            PaginatedDatasetRuns: Paginated list of dataset runs.
2352        """
2353        try:
2354            return cast(
2355                PaginatedDatasetRuns,
2356                self.api.datasets.get_runs(
2357                    dataset_name=self._url_encode(dataset_name),
2358                    page=page,
2359                    limit=limit,
2360                    request_options=None,
2361                ),
2362            )
2363        except Error as e:
2364            handle_fern_exception(e)
2365            raise e
2366
2367    def delete_dataset_run(
2368        self, *, dataset_name: str, run_name: str
2369    ) -> DeleteDatasetRunResponse:
2370        """Delete a dataset run and all its run items. This action is irreversible.
2371
2372        Args:
2373            dataset_name (str): The name of the dataset.
2374            run_name (str): The name of the run.
2375
2376        Returns:
2377            DeleteDatasetRunResponse: Confirmation of deletion.
2378        """
2379        try:
2380            return cast(
2381                DeleteDatasetRunResponse,
2382                self.api.datasets.delete_run(
2383                    dataset_name=self._url_encode(dataset_name),
2384                    run_name=self._url_encode(run_name),
2385                    request_options=None,
2386                ),
2387            )
2388        except Error as e:
2389            handle_fern_exception(e)
2390            raise e
2391
2392    def run_experiment(
2393        self,
2394        *,
2395        name: str,
2396        run_name: Optional[str] = None,
2397        description: Optional[str] = None,
2398        data: ExperimentData,
2399        task: TaskFunction,
2400        evaluators: List[EvaluatorFunction] = [],
2401        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2402        run_evaluators: List[RunEvaluatorFunction] = [],
2403        max_concurrency: int = 50,
2404        metadata: Optional[Dict[str, str]] = None,
2405        _dataset_version: Optional[datetime] = None,
2406    ) -> ExperimentResult:
2407        """Run an experiment on a dataset with automatic tracing and evaluation.
2408
2409        This method executes a task function on each item in the provided dataset,
2410        automatically traces all executions with Langfuse for observability, runs
2411        item-level and run-level evaluators on the outputs, and returns comprehensive
2412        results with evaluation metrics.
2413
2414        The experiment system provides:
2415        - Automatic tracing of all task executions
2416        - Concurrent processing with configurable limits
2417        - Comprehensive error handling that isolates failures
2418        - Integration with Langfuse datasets for experiment tracking
2419        - Flexible evaluation framework supporting both sync and async evaluators
2420
2421        Args:
2422            name: Human-readable name for the experiment. Used for identification
2423                in the Langfuse UI.
2424            run_name: Optional exact name for the experiment run. If provided, this will be
2425                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2426                If not provided, this will default to the experiment name appended with an ISO timestamp.
2427            description: Optional description explaining the experiment's purpose,
2428                methodology, or expected outcomes.
2429            data: Array of data items to process. Can be either:
2430                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2431                - List of Langfuse DatasetItem objects from dataset.items
2432            task: Function that processes each data item and returns output.
2433                Must accept 'item' as keyword argument and can return sync or async results.
2434                The task function signature should be: task(*, item, **kwargs) -> Any
2435            evaluators: List of functions to evaluate each item's output individually.
2436                Each evaluator receives input, output, expected_output, and metadata.
2437                Can return single Evaluation dict or list of Evaluation dicts.
2438            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2439                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2440                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2441                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2442            run_evaluators: List of functions to evaluate the entire experiment run.
2443                Each run evaluator receives all item_results and can compute aggregate metrics.
2444                Useful for calculating averages, distributions, or cross-item comparisons.
2445            max_concurrency: Maximum number of concurrent task executions (default: 50).
2446                Controls the number of items processed simultaneously. Adjust based on
2447                API rate limits and system resources.
2448            metadata: Optional metadata dictionary to attach to all experiment traces.
2449                This metadata will be included in every trace created during the experiment.
2450                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2451
2452        Returns:
2453            ExperimentResult containing:
2454            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2455            - item_results: List of results for each processed item with outputs and evaluations
2456            - run_evaluations: List of aggregate evaluation results for the entire run
2457            - experiment_id: Stable identifier for the experiment run across all items
2458            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2459            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2460
2461        Raises:
2462            ValueError: If required parameters are missing or invalid
2463            Exception: If experiment setup fails (individual item failures are handled gracefully)
2464
2465        Examples:
2466            Basic experiment with local data:
2467            ```python
2468            def summarize_text(*, item, **kwargs):
2469                return f"Summary: {item['input'][:50]}..."
2470
2471            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2472                return {
2473                    "name": "output_length",
2474                    "value": len(output),
2475                    "comment": f"Output contains {len(output)} characters"
2476                }
2477
2478            result = langfuse.run_experiment(
2479                name="Text Summarization Test",
2480                description="Evaluate summarization quality and length",
2481                data=[
2482                    {"input": "Long article text...", "expected_output": "Expected summary"},
2483                    {"input": "Another article...", "expected_output": "Another summary"}
2484                ],
2485                task=summarize_text,
2486                evaluators=[length_evaluator]
2487            )
2488
2489            print(f"Processed {len(result.item_results)} items")
2490            for item_result in result.item_results:
2491                print(f"Input: {item_result.item['input']}")
2492                print(f"Output: {item_result.output}")
2493                print(f"Evaluations: {item_result.evaluations}")
2494            ```
2495
2496            Advanced experiment with async task and multiple evaluators:
2497            ```python
2498            async def llm_task(*, item, **kwargs):
2499                # Simulate async LLM call
2500                response = await openai_client.chat.completions.create(
2501                    model="gpt-4",
2502                    messages=[{"role": "user", "content": item["input"]}]
2503                )
2504                return response.choices[0].message.content
2505
2506            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2507                if expected_output and expected_output.lower() in output.lower():
2508                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2509                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2510
2511            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2512                # Simulate toxicity check
2513                toxicity_score = check_toxicity(output)  # Your toxicity checker
2514                return {
2515                    "name": "toxicity",
2516                    "value": toxicity_score,
2517                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2518                }
2519
2520            def average_accuracy(*, item_results, **kwargs):
2521                accuracies = [
2522                    eval.value for result in item_results
2523                    for eval in result.evaluations
2524                    if eval.name == "accuracy"
2525                ]
2526                return {
2527                    "name": "average_accuracy",
2528                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2529                    "comment": f"Average accuracy across {len(accuracies)} items"
2530                }
2531
2532            result = langfuse.run_experiment(
2533                name="LLM Safety and Accuracy Test",
2534                description="Evaluate model accuracy and safety across diverse prompts",
2535                data=test_dataset,  # Your dataset items
2536                task=llm_task,
2537                evaluators=[accuracy_evaluator, toxicity_evaluator],
2538                run_evaluators=[average_accuracy],
2539                max_concurrency=5,  # Limit concurrent API calls
2540                metadata={"model": "gpt-4", "temperature": 0.7}
2541            )
2542            ```
2543
2544            Using with Langfuse datasets:
2545            ```python
2546            # Get dataset from Langfuse
2547            dataset = langfuse.get_dataset("my-eval-dataset")
2548
2549            result = dataset.run_experiment(
2550                name="Production Model Evaluation",
2551                description="Monthly evaluation of production model performance",
2552                task=my_production_task,
2553                evaluators=[accuracy_evaluator, latency_evaluator]
2554            )
2555
2556            # Results automatically linked to dataset in Langfuse UI
2557            print(f"View results: {result['dataset_run_url']}")
2558            ```
2559
2560        Note:
2561            - Task and evaluator functions can be either synchronous or asynchronous
2562            - Individual item failures are logged but don't stop the experiment
2563            - All executions are automatically traced and visible in Langfuse UI
2564            - When using Langfuse datasets, results are automatically linked for easy comparison
2565            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2566            - Async execution is handled automatically with smart event loop detection
2567        """
2568        return cast(
2569            ExperimentResult,
2570            run_async_safely(
2571                self._run_experiment_async(
2572                    name=name,
2573                    run_name=self._create_experiment_run_name(
2574                        name=name, run_name=run_name
2575                    ),
2576                    description=description,
2577                    data=data,
2578                    task=task,
2579                    evaluators=evaluators or [],
2580                    composite_evaluator=composite_evaluator,
2581                    run_evaluators=run_evaluators or [],
2582                    max_concurrency=max_concurrency,
2583                    metadata=metadata,
2584                    dataset_version=_dataset_version,
2585                ),
2586            ),
2587        )
2588
2589    async def _run_experiment_async(
2590        self,
2591        *,
2592        name: str,
2593        run_name: str,
2594        description: Optional[str],
2595        data: ExperimentData,
2596        task: TaskFunction,
2597        evaluators: List[EvaluatorFunction],
2598        composite_evaluator: Optional[CompositeEvaluatorFunction],
2599        run_evaluators: List[RunEvaluatorFunction],
2600        max_concurrency: int,
2601        metadata: Optional[Dict[str, Any]] = None,
2602        dataset_version: Optional[datetime] = None,
2603    ) -> ExperimentResult:
2604        langfuse_logger.debug(
2605            f"Starting experiment '{name}' run '{run_name}' with {len(data)} items"
2606        )
2607
2608        shared_fallback_experiment_id = self._create_observation_id()
2609
2610        # Set up concurrency control
2611        semaphore = asyncio.Semaphore(max_concurrency)
2612
2613        # Process all items
2614        async def process_item(item: ExperimentItem) -> ExperimentItemResult:
2615            async with semaphore:
2616                return await self._process_experiment_item(
2617                    item,
2618                    task,
2619                    evaluators,
2620                    composite_evaluator,
2621                    shared_fallback_experiment_id,
2622                    name,
2623                    run_name,
2624                    description,
2625                    metadata,
2626                    dataset_version,
2627                )
2628
2629        # Run all items concurrently
2630        tasks = [process_item(item) for item in data]
2631        item_results = await asyncio.gather(*tasks, return_exceptions=True)
2632
2633        # Filter out any exceptions and log errors
2634        valid_results: List[ExperimentItemResult] = []
2635        for i, result in enumerate(item_results):
2636            if isinstance(result, Exception):
2637                langfuse_logger.error(f"Item {i} failed: {result}")
2638            elif isinstance(result, ExperimentItemResult):
2639                valid_results.append(result)  # type: ignore
2640
2641        # Run experiment-level evaluators
2642        run_evaluations: List[Evaluation] = []
2643        for run_evaluator in run_evaluators:
2644            try:
2645                evaluations = await _run_evaluator(
2646                    run_evaluator, item_results=valid_results
2647                )
2648                run_evaluations.extend(evaluations)
2649            except Exception as e:
2650                langfuse_logger.error(f"Run evaluator failed: {e}")
2651
2652        # Generate dataset run URL if applicable
2653        dataset_run_id = next(
2654            (
2655                result.dataset_run_id
2656                for result in valid_results
2657                if result.dataset_run_id
2658            ),
2659            None,
2660        )
2661        dataset_run_url = None
2662        if dataset_run_id and data:
2663            try:
2664                # Check if the first item has dataset_id (for DatasetItem objects)
2665                first_item = data[0]
2666                dataset_id = None
2667
2668                if hasattr(first_item, "dataset_id"):
2669                    dataset_id = getattr(first_item, "dataset_id", None)
2670
2671                if dataset_id:
2672                    project_id = self._get_project_id()
2673
2674                    if project_id:
2675                        dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}"
2676
2677            except Exception:
2678                pass  # URL generation is optional
2679
2680        # Store run-level evaluations as scores
2681        for evaluation in run_evaluations:
2682            try:
2683                if dataset_run_id:
2684                    self.create_score(
2685                        dataset_run_id=dataset_run_id,
2686                        name=evaluation.name or "<unknown>",
2687                        value=evaluation.value,  # type: ignore
2688                        comment=evaluation.comment,
2689                        metadata=evaluation.metadata,
2690                        data_type=evaluation.data_type,  # type: ignore
2691                        config_id=evaluation.config_id,
2692                    )
2693
2694            except Exception as e:
2695                langfuse_logger.error(f"Failed to store run evaluation: {e}")
2696
2697        # Flush scores and traces
2698        self.flush()
2699
2700        return ExperimentResult(
2701            name=name,
2702            run_name=run_name,
2703            description=description,
2704            item_results=valid_results,
2705            run_evaluations=run_evaluations,
2706            experiment_id=dataset_run_id or shared_fallback_experiment_id,
2707            dataset_run_id=dataset_run_id,
2708            dataset_run_url=dataset_run_url,
2709        )
2710
    async def _process_experiment_item(
        self,
        item: ExperimentItem,
        task: Callable,
        evaluators: List[Callable],
        composite_evaluator: Optional[CompositeEvaluatorFunction],
        fallback_experiment_id: str,
        experiment_name: str,
        experiment_run_name: str,
        experiment_description: Optional[str],
        experiment_metadata: Optional[Dict[str, Any]] = None,
        dataset_version: Optional[datetime] = None,
    ) -> ExperimentItemResult:
        """Run the task and all evaluators for one experiment item inside its own span.

        The item is processed within an "experiment-item-run" observation. If the
        item exposes `id` and `dataset_id` attributes, a dataset run item is created
        first so the trace is linked to the dataset run. The task is then executed,
        the span is updated with input, output and metadata, and the item-level
        evaluators plus the optional composite evaluator are run; every evaluation
        is stored as a score on the item's trace and span.

        Args:
            item: The experiment item, either a dict or an object with `input`,
                `expected_output` and `metadata` attributes.
            task: Task function executed for the item.
            evaluators: Item-level evaluators run on the task output.
            composite_evaluator: Optional evaluator that receives the item-level
                evaluations; only run when at least one evaluation exists.
            fallback_experiment_id: Experiment id used when no dataset run id is available.
            experiment_name: Name of the experiment (stored in the span metadata).
            experiment_run_name: Name of the experiment run.
            experiment_description: Optional experiment description.
            experiment_metadata: Optional metadata shared by all items of the run.
            dataset_version: Optional dataset version passed when creating the dataset run item.

        Returns:
            ExperimentItemResult with the item, the task output, all evaluations,
            the trace id, and the dataset run id (None if the item was not linked).

        Raises:
            ValueError: If the item has no input.
            Exception: Any error raised while preparing the item or running the task is
                recorded on the span and re-raised. Evaluator failures are only logged.
        """
        span_name = "experiment-item-run"

        with self.start_as_current_observation(name=span_name) as span:
            try:
                # Items are either plain dicts or objects exposing the same fields as attributes.
                input_data = (
                    item.get("input")
                    if isinstance(item, dict)
                    else getattr(item, "input", None)
                )

                if input_data is None:
                    raise ValueError("Experiment Item is missing input. Skipping item.")

                expected_output = (
                    item.get("expected_output")
                    if isinstance(item, dict)
                    else getattr(item, "expected_output", None)
                )

                item_metadata = (
                    item.get("metadata")
                    if isinstance(item, dict)
                    else getattr(item, "metadata", None)
                )

                # Experiment-level metadata may override the two name keys set here.
                final_observation_metadata = {
                    "experiment_name": experiment_name,
                    "experiment_run_name": experiment_run_name,
                    **(experiment_metadata or {}),
                }

                trace_id = span.trace_id
                dataset_id = None
                dataset_item_id = None
                dataset_run_id = None

                # Link to dataset run if this is a dataset item
                if hasattr(item, "id") and hasattr(item, "dataset_id"):
                    try:
                        # Use sync API to avoid event loop issues when run_async_safely
                        # creates multiple event loops across different threads
                        dataset_run_item = await asyncio.to_thread(
                            self.api.dataset_run_items.create,
                            run_name=experiment_run_name,
                            run_description=experiment_description,
                            metadata=experiment_metadata,
                            dataset_item_id=item.id,  # type: ignore
                            trace_id=trace_id,
                            observation_id=span.id,
                            dataset_version=dataset_version,
                        )

                        dataset_run_id = dataset_run_item.dataset_run_id

                    except Exception as e:
                        # Linking is best effort: the item is still processed, but
                        # dataset_run_id stays None for it.
                        langfuse_logger.error(f"Failed to create dataset run item: {e}")

                if (
                    not isinstance(item, dict)
                    and hasattr(item, "dataset_id")
                    and hasattr(item, "id")
                ):
                    dataset_id = item.dataset_id
                    dataset_item_id = item.id

                    final_observation_metadata.update(
                        {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id}
                    )

                # Item metadata is merged last, so it wins on key collisions.
                if isinstance(item_metadata, dict):
                    final_observation_metadata.update(item_metadata)

                experiment_id = dataset_run_id or fallback_experiment_id
                # Without a dataset item id, derive an item id from the serialized
                # input: the first 16 characters of its hash.
                experiment_item_id = (
                    dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16]
                )
                # Only attributes with a non-None value are set on the span.
                span._otel_span.set_attributes(
                    {
                        k: v
                        for k, v in {
                            LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT,
                            LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description,
                            LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize(
                                expected_output
                            ),
                        }.items()
                        if v is not None
                    }
                )

                # NOTE(review): experiment_name receives the *run* name here, not
                # the experiment name - presumably intentional; confirm.
                propagated_experiment_attributes = PropagatedExperimentAttributes(
                    experiment_id=experiment_id,
                    experiment_name=experiment_run_name,
                    experiment_metadata=_flatten_and_serialize_metadata_values(
                        experiment_metadata
                    ),
                    experiment_dataset_id=dataset_id,
                    experiment_item_id=experiment_item_id,
                    experiment_item_metadata=_flatten_and_serialize_metadata_values(
                        item_metadata if isinstance(item_metadata, dict) else None
                    ),
                    experiment_item_root_observation_id=span.id,
                )

                # The task runs with the experiment attributes propagated.
                with _propagate_attributes(experiment=propagated_experiment_attributes):
                    output = await _run_task(task, item)

                span.update(
                    input=input_data,
                    output=output,
                    metadata=final_observation_metadata,
                )

            except Exception as e:
                # Mark the span as failed, then let the caller see the error.
                span.update(
                    output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
                )
                raise e

            # Run evaluators
            evaluations = []

            for evaluator in evaluators:
                try:
                    eval_metadata: Optional[Dict[str, Any]] = None

                    if isinstance(item, dict):
                        eval_metadata = item.get("metadata")
                    elif hasattr(item, "metadata"):
                        eval_metadata = item.metadata

                    with _propagate_attributes(
                        experiment=propagated_experiment_attributes
                    ):
                        eval_results = await _run_evaluator(
                            evaluator,
                            input=input_data,
                            output=output,
                            expected_output=expected_output,
                            metadata=eval_metadata,
                        )
                        evaluations.extend(eval_results)

                        # Store evaluations as scores
                        for evaluation in eval_results:
                            self.create_score(
                                trace_id=trace_id,
                                observation_id=span.id,
                                name=evaluation.name,
                                value=evaluation.value,  # type: ignore
                                comment=evaluation.comment,
                                metadata=evaluation.metadata,
                                config_id=evaluation.config_id,
                                data_type=evaluation.data_type,  # type: ignore
                            )

                except Exception as e:
                    # A failing evaluator (or score creation) must not abort the item.
                    langfuse_logger.error(f"Evaluator failed: {e}")

            # Run composite evaluator if provided and we have evaluations
            if composite_evaluator and evaluations:
                try:
                    composite_eval_metadata: Optional[Dict[str, Any]] = None
                    if isinstance(item, dict):
                        composite_eval_metadata = item.get("metadata")
                    elif hasattr(item, "metadata"):
                        composite_eval_metadata = item.metadata

                    with _propagate_attributes(
                        experiment=propagated_experiment_attributes
                    ):
                        result = composite_evaluator(
                            input=input_data,
                            output=output,
                            expected_output=expected_output,
                            metadata=composite_eval_metadata,
                            evaluations=evaluations,
                        )

                        # Handle async composite evaluators
                        if asyncio.iscoroutine(result):
                            result = await result

                        # Normalize to list; any other return type yields no composite evaluations
                        composite_evals: List[Evaluation] = []
                        if isinstance(result, (dict, Evaluation)):
                            composite_evals = [result]  # type: ignore
                        elif isinstance(result, list):
                            composite_evals = result  # type: ignore

                        # Store composite evaluations as scores and add to evaluations list
                        # NOTE(review): fields are read via attribute access, so a plain
                        # dict result raises AttributeError here and ends up in the
                        # "Composite evaluator failed" log below - confirm this is intended.
                        for composite_evaluation in composite_evals:
                            self.create_score(
                                trace_id=trace_id,
                                observation_id=span.id,
                                name=composite_evaluation.name,
                                value=composite_evaluation.value,  # type: ignore
                                comment=composite_evaluation.comment,
                                metadata=composite_evaluation.metadata,
                                config_id=composite_evaluation.config_id,
                                data_type=composite_evaluation.data_type,  # type: ignore
                            )
                            evaluations.append(composite_evaluation)

                except Exception as e:
                    langfuse_logger.error(f"Composite evaluator failed: {e}")

            return ExperimentItemResult(
                item=item,
                output=output,
                evaluations=evaluations,
                trace_id=trace_id,
                dataset_run_id=dataset_run_id,
            )
2938
2939    def _create_experiment_run_name(
2940        self, *, name: Optional[str] = None, run_name: Optional[str] = None
2941    ) -> str:
2942        if run_name:
2943            return run_name
2944
2945        iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z")
2946
2947        return f"{name} - {iso_timestamp}"
2948
2949    def run_batched_evaluation(
2950        self,
2951        *,
2952        scope: Literal["traces", "observations"],
2953        mapper: MapperFunction,
2954        filter: Optional[str] = None,
2955        fetch_batch_size: int = 50,
2956        fetch_trace_fields: Optional[str] = None,
2957        max_items: Optional[int] = None,
2958        max_retries: int = 3,
2959        evaluators: List[EvaluatorFunction],
2960        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2961        max_concurrency: int = 5,
2962        metadata: Optional[Dict[str, Any]] = None,
2963        _add_observation_scores_to_trace: bool = False,
2964        _additional_trace_tags: Optional[List[str]] = None,
2965        resume_from: Optional[BatchEvaluationResumeToken] = None,
2966        verbose: bool = False,
2967    ) -> BatchEvaluationResult:
2968        """Fetch traces or observations and run evaluations on each item.
2969
2970        This method provides a powerful way to evaluate existing data in Langfuse at scale.
2971        It fetches items based on filters, transforms them using a mapper function, runs
2972        evaluators on each item, and creates scores that are linked back to the original
2973        entities. This is ideal for:
2974
2975        - Running evaluations on production traces after deployment
2976        - Backtesting new evaluation metrics on historical data
2977        - Batch scoring of observations for quality monitoring
2978        - Periodic evaluation runs on recent data
2979
2980        The method uses a streaming/pipeline approach to process items in batches, making
2981        it memory-efficient for large datasets. It includes comprehensive error handling,
2982        retry logic, and resume capability for long-running evaluations.
2983
2984        Args:
2985            scope: The type of items to evaluate. Must be one of:
2986                - "traces": Evaluate complete traces with all their observations
2987                - "observations": Evaluate individual observations (spans, generations, events)
2988            mapper: Function that transforms API response objects into evaluator inputs.
2989                Receives a trace/observation object and returns an EvaluatorInputs
2990                instance with input, output, expected_output, and metadata fields.
2991                Can be sync or async.
2992            evaluators: List of evaluation functions to run on each item. Each evaluator
2993                receives the mapped inputs and returns Evaluation object(s). Evaluator
2994                failures are logged but don't stop the batch evaluation.
2995            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
2996                - '{"tags": ["production"]}'
2997                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
2998                Default: None (fetches all items).
2999            fetch_batch_size: Number of items to fetch per API call and hold in memory.
3000                Larger values may be faster but use more memory. Default: 50.
3001            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
3002            max_items: Maximum total number of items to process. If None, processes all
3003                items matching the filter. Useful for testing or limiting evaluation runs.
3004                Default: None (process all).
3005            max_concurrency: Maximum number of items to evaluate concurrently. Controls
3006                parallelism and resource usage. Default: 5.
3007            composite_evaluator: Optional function that creates a composite score from
3008                item-level evaluations. Receives the original item and its evaluations,
3009                returns a single Evaluation. Useful for weighted averages or combined metrics.
3010                Default: None.
3011            metadata: Optional metadata dict to add to all created scores. Useful for
3012                tracking evaluation runs, versions, or other context. Default: None.
3013            max_retries: Maximum number of retry attempts for failed batch fetches.
3014                Uses exponential backoff (1s, 2s, 4s). Default: 3.
3015            verbose: If True, logs progress information to console. Useful for monitoring
3016                long-running evaluations. Default: False.
3017            resume_from: Optional resume token from a previous incomplete run. Allows
3018                continuing evaluation after interruption or failure. Default: None.
3019
3020
3021        Returns:
3022            BatchEvaluationResult containing:
3023                - total_items_fetched: Number of items fetched from API
3024                - total_items_processed: Number of items successfully evaluated
3025                - total_items_failed: Number of items that failed evaluation
3026                - total_scores_created: Scores created by item-level evaluators
3027                - total_composite_scores_created: Scores created by composite evaluator
3028                - total_evaluations_failed: Individual evaluator failures
3029                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3030                - resume_token: Token for resuming if incomplete (None if completed)
3031                - completed: True if all items processed
3032                - duration_seconds: Total execution time
3033                - failed_item_ids: IDs of items that failed
3034                - error_summary: Error types and counts
3035                - has_more_items: True if max_items reached but more exist
3036
3037        Raises:
3038            ValueError: If invalid scope is provided.
3039
3040        Examples:
3041            Basic trace evaluation:
3042            ```python
3043            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3044
3045            client = Langfuse()
3046
3047            # Define mapper to extract fields from traces
3048            def trace_mapper(trace):
3049                return EvaluatorInputs(
3050                    input=trace.input,
3051                    output=trace.output,
3052                    expected_output=None,
3053                    metadata={"trace_id": trace.id}
3054                )
3055
3056            # Define evaluator
3057            def length_evaluator(*, input, output, expected_output, metadata):
3058                return Evaluation(
3059                    name="output_length",
3060                    value=len(output) if output else 0
3061                )
3062
3063            # Run batch evaluation
3064            result = client.run_batched_evaluation(
3065                scope="traces",
3066                mapper=trace_mapper,
3067                evaluators=[length_evaluator],
3068                filter='{"tags": ["production"]}',
3069                max_items=1000,
3070                verbose=True
3071            )
3072
3073            print(f"Processed {result.total_items_processed} traces")
3074            print(f"Created {result.total_scores_created} scores")
3075            ```
3076
3077            Evaluation with composite scorer:
3078            ```python
3079            def accuracy_evaluator(*, input, output, expected_output, metadata):
3080                # ... evaluation logic
3081                return Evaluation(name="accuracy", value=0.85)
3082
3083            def relevance_evaluator(*, input, output, expected_output, metadata):
3084                # ... evaluation logic
3085                return Evaluation(name="relevance", value=0.92)
3086
3087            def composite_evaluator(*, item, evaluations):
3088                # Weighted average of evaluations
3089                weights = {"accuracy": 0.6, "relevance": 0.4}
3090                total = sum(
3091                    e.value * weights.get(e.name, 0)
3092                    for e in evaluations
3093                    if isinstance(e.value, (int, float))
3094                )
3095                return Evaluation(
3096                    name="composite_score",
3097                    value=total,
3098                    comment=f"Weighted average of {len(evaluations)} metrics"
3099                )
3100
3101            result = client.run_batched_evaluation(
3102                scope="traces",
3103                mapper=trace_mapper,
3104                evaluators=[accuracy_evaluator, relevance_evaluator],
3105                composite_evaluator=composite_evaluator,
3106                filter='{"user_id": "important_user"}',
3107                verbose=True
3108            )
3109            ```
3110
3111            Handling incomplete runs with resume:
3112            ```python
3113            # Initial run that may fail or timeout
3114            result = client.run_batched_evaluation(
3115                scope="observations",
3116                mapper=obs_mapper,
3117                evaluators=[my_evaluator],
3118                max_items=10000,
3119                verbose=True
3120            )
3121
3122            # Check if incomplete
3123            if not result.completed and result.resume_token:
3124                print(f"Processed {result.resume_token.items_processed} items before interruption")
3125
3126                # Resume from where it left off
3127                result = client.run_batched_evaluation(
3128                    scope="observations",
3129                    mapper=obs_mapper,
3130                    evaluators=[my_evaluator],
3131                    resume_from=result.resume_token,
3132                    verbose=True
3133                )
3134
3135            print(f"Total items processed: {result.total_items_processed}")
3136            ```
3137
3138            Monitoring evaluator performance:
3139            ```python
3140            result = client.run_batched_evaluation(...)
3141
3142            for stats in result.evaluator_stats:
3143                success_rate = stats.successful_runs / stats.total_runs
3144                print(f"{stats.name}:")
3145                print(f"  Success rate: {success_rate:.1%}")
3146                print(f"  Scores created: {stats.total_scores_created}")
3147
3148                if stats.failed_runs > 0:
3149                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3150            ```
3151
3152        Note:
3153            - Evaluator failures are logged but don't stop the batch evaluation
3154            - Individual item failures are tracked but don't stop processing
3155            - Fetch failures are retried with exponential backoff
3156            - All scores are automatically flushed to Langfuse at the end
3157            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3158        """
3159        runner = BatchEvaluationRunner(self)
3160
3161        return cast(
3162            BatchEvaluationResult,
3163            run_async_safely(
3164                runner.run_async(
3165                    scope=scope,
3166                    mapper=mapper,
3167                    evaluators=evaluators,
3168                    filter=filter,
3169                    fetch_batch_size=fetch_batch_size,
3170                    fetch_trace_fields=fetch_trace_fields,
3171                    max_items=max_items,
3172                    max_concurrency=max_concurrency,
3173                    composite_evaluator=composite_evaluator,
3174                    metadata=metadata,
3175                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
3176                    _additional_trace_tags=_additional_trace_tags,
3177                    max_retries=max_retries,
3178                    verbose=verbose,
3179                    resume_from=resume_from,
3180                )
3181            ),
3182        )
3183
3184    def auth_check(self) -> bool:
3185        """Check if the provided credentials (public and secret key) are valid.
3186
3187        Raises:
3188            Exception: If no projects were found for the provided credentials.
3189
3190        Note:
3191            This method is blocking. It is discouraged to use it in production code.
3192        """
3193        try:
3194            projects = self.api.projects.get()
3195            langfuse_logger.debug(
3196                f"Auth check successful, found {len(projects.data)} projects"
3197            )
3198            if len(projects.data) == 0:
3199                raise Exception(
3200                    "Auth check failed, no project found for the keys provided."
3201                )
3202            return True
3203
3204        except AttributeError as e:
3205            langfuse_logger.warning(
3206                f"Auth check failed: Client not properly initialized. Error: {e}"
3207            )
3208            return False
3209
3210        except Error as e:
3211            handle_fern_exception(e)
3212            raise e
3213
3214    def create_dataset(
3215        self,
3216        *,
3217        name: str,
3218        description: Optional[str] = None,
3219        metadata: Optional[Any] = None,
3220        input_schema: Optional[Any] = None,
3221        expected_output_schema: Optional[Any] = None,
3222    ) -> Dataset:
3223        """Create a dataset with the given name on Langfuse.
3224
3225        Args:
3226            name: Name of the dataset to create.
3227            description: Description of the dataset. Defaults to None.
3228            metadata: Additional metadata. Defaults to None.
3229            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3230            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3231
3232        Returns:
3233            Dataset: The created dataset as returned by the Langfuse API.
3234        """
3235        try:
3236            langfuse_logger.debug(f"Creating datasets {name}")
3237
3238            result = self.api.datasets.create(
3239                name=name,
3240                description=description,
3241                metadata=metadata,
3242                input_schema=input_schema,
3243                expected_output_schema=expected_output_schema,
3244            )
3245
3246            return cast(Dataset, result)
3247
3248        except Error as e:
3249            handle_fern_exception(e)
3250            raise e
3251
3252    def create_dataset_item(
3253        self,
3254        *,
3255        dataset_name: str,
3256        input: Optional[Any] = None,
3257        expected_output: Optional[Any] = None,
3258        metadata: Optional[Any] = None,
3259        source_trace_id: Optional[str] = None,
3260        source_observation_id: Optional[str] = None,
3261        status: Optional[DatasetStatus] = None,
3262        id: Optional[str] = None,
3263    ) -> DatasetItem:
3264        """Create a dataset item.
3265
3266        Upserts if an item with id already exists.
3267
3268        Args:
3269            dataset_name: Name of the dataset in which the dataset item should be created.
3270            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3271            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3272            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3273            source_trace_id: Id of the source trace. Defaults to None.
3274            source_observation_id: Id of the source observation. Defaults to None.
3275            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3276            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3277
3278        Returns:
3279            DatasetItem: The created dataset item as returned by the Langfuse API.
3280
3281        Example:
3282            ```python
3283            from langfuse import Langfuse
3284
3285            langfuse = Langfuse()
3286
3287            # Uploading items to the Langfuse dataset named "capital_cities"
3288            langfuse.create_dataset_item(
3289                dataset_name="capital_cities",
3290                input={"input": {"country": "Italy"}},
3291                expected_output={"expected_output": "Rome"},
3292                metadata={"foo": "bar"}
3293            )
3294            ```
3295        """
3296        try:
3297            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
3298
3299            result = self.api.dataset_items.create(
3300                dataset_name=dataset_name,
3301                input=input,
3302                expected_output=expected_output,
3303                metadata=metadata,
3304                source_trace_id=source_trace_id,
3305                source_observation_id=source_observation_id,
3306                status=status,
3307                id=id,
3308            )
3309
3310            return cast(DatasetItem, result)
3311        except Error as e:
3312            handle_fern_exception(e)
3313            raise e
3314
3315    def resolve_media_references(
3316        self,
3317        *,
3318        obj: Any,
3319        resolve_with: Literal["base64_data_uri"],
3320        max_depth: int = 10,
3321        content_fetch_timeout_seconds: int = 5,
3322    ) -> Any:
3323        """Replace media reference strings in an object with base64 data URIs.
3324
3325        This method recursively traverses an object (up to max_depth) looking for media reference strings
3326        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3327        the provided Langfuse client and replaces the reference string with a base64 data URI.
3328
3329        If fetching media content fails for a reference string, a warning is logged and the reference
3330        string is left unchanged.
3331
3332        Args:
3333            obj: The object to process. Can be a primitive value, array, or nested object.
3334                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3335            resolve_with: The representation of the media content to replace the media reference string with.
3336                Currently only "base64_data_uri" is supported.
3337            max_depth: int: The maximum depth to traverse the object. Default is 10.
3338            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3339
3340        Returns:
3341            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3342            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3343
3344        Example:
3345            obj = {
3346                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3347                "nested": {
3348                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3349                }
3350            }
3351
3352            result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
3353
3354            # Result:
3355            # {
3356            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3357            #     "nested": {
3358            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3359            #     }
3360            # }
3361        """
3362        return LangfuseMedia.resolve_media_references(
3363            langfuse_client=self,
3364            obj=obj,
3365            resolve_with=resolve_with,
3366            max_depth=max_depth,
3367            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3368        )
3369
3370    @overload
3371    def get_prompt(
3372        self,
3373        name: str,
3374        *,
3375        version: Optional[int] = None,
3376        label: Optional[str] = None,
3377        type: Literal["chat"],
3378        cache_ttl_seconds: Optional[int] = None,
3379        fallback: Optional[List[ChatMessageDict]] = None,
3380        max_retries: Optional[int] = None,
3381        fetch_timeout_seconds: Optional[int] = None,
3382    ) -> ChatPromptClient: ...
3383
3384    @overload
3385    def get_prompt(
3386        self,
3387        name: str,
3388        *,
3389        version: Optional[int] = None,
3390        label: Optional[str] = None,
3391        type: Literal["text"] = "text",
3392        cache_ttl_seconds: Optional[int] = None,
3393        fallback: Optional[str] = None,
3394        max_retries: Optional[int] = None,
3395        fetch_timeout_seconds: Optional[int] = None,
3396    ) -> TextPromptClient: ...
3397
3398    def get_prompt(
3399        self,
3400        name: str,
3401        *,
3402        version: Optional[int] = None,
3403        label: Optional[str] = None,
3404        type: Literal["chat", "text"] = "text",
3405        cache_ttl_seconds: Optional[int] = None,
3406        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3407        max_retries: Optional[int] = None,
3408        fetch_timeout_seconds: Optional[int] = None,
3409    ) -> PromptClient:
3410        """Get a prompt.
3411
3412        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3413        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3414        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3415        return the expired prompt as a fallback.
3416
3417        Args:
3418            name (str): The name of the prompt to retrieve.
3419
3420        Keyword Args:
3421            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3422            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3423            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3424            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3425            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3426            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3427            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3428            fetch_timeout_seconds: Optional[int]: The timeout in milliseconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
3429
3430        Returns:
3431            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
3432            - TextPromptClient, if type argument is 'text'.
3433            - ChatPromptClient, if type argument is 'chat'.
3434
3435        Raises:
3436            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3437            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3438        """
3439        if self._resources is None:
3440            raise Error(
3441                "SDK is not correctly initialized. Check the init logs for more details."
3442            )
3443        if version is not None and label is not None:
3444            raise ValueError("Cannot specify both version and label at the same time.")
3445
3446        if not name:
3447            raise ValueError("Prompt name cannot be empty.")
3448
3449        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3450        bounded_max_retries = self._get_bounded_max_retries(
3451            max_retries, default_max_retries=2, max_retries_upper_bound=4
3452        )
3453
3454        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3455        cached_prompt = self._resources.prompt_cache.get(cache_key)
3456
3457        if cached_prompt is None or cache_ttl_seconds == 0:
3458            langfuse_logger.debug(
3459                f"Prompt '{cache_key}' not found in cache or caching disabled."
3460            )
3461            try:
3462                return self._fetch_prompt_and_update_cache(
3463                    name,
3464                    version=version,
3465                    label=label,
3466                    ttl_seconds=cache_ttl_seconds,
3467                    max_retries=bounded_max_retries,
3468                    fetch_timeout_seconds=fetch_timeout_seconds,
3469                )
3470            except Exception as e:
3471                if fallback:
3472                    langfuse_logger.warning(
3473                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3474                    )
3475
3476                    fallback_client_args: Dict[str, Any] = {
3477                        "name": name,
3478                        "prompt": fallback,
3479                        "type": type,
3480                        "version": version or 0,
3481                        "config": {},
3482                        "labels": [label] if label else [],
3483                        "tags": [],
3484                    }
3485
3486                    if type == "text":
3487                        return TextPromptClient(
3488                            prompt=Prompt_Text(**fallback_client_args),
3489                            is_fallback=True,
3490                        )
3491
3492                    if type == "chat":
3493                        return ChatPromptClient(
3494                            prompt=Prompt_Chat(**fallback_client_args),
3495                            is_fallback=True,
3496                        )
3497
3498                raise e
3499
3500        if cached_prompt.is_expired():
3501            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3502            try:
3503                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3504                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3505
3506                def refresh_task() -> None:
3507                    self._fetch_prompt_and_update_cache(
3508                        name,
3509                        version=version,
3510                        label=label,
3511                        ttl_seconds=cache_ttl_seconds,
3512                        max_retries=bounded_max_retries,
3513                        fetch_timeout_seconds=fetch_timeout_seconds,
3514                    )
3515
3516                self._resources.prompt_cache.add_refresh_prompt_task_if_current(
3517                    cache_key,
3518                    cached_prompt,
3519                    refresh_task,
3520                )
3521                langfuse_logger.debug(
3522                    f"Returning stale prompt '{cache_key}' from cache."
3523                )
3524                # return stale prompt
3525                return cached_prompt.value
3526
3527            except Exception as e:
3528                langfuse_logger.warning(
3529                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3530                )
3531                # creation of refresh prompt task failed, return stale prompt
3532                return cached_prompt.value
3533
3534        return cached_prompt.value
3535
3536    def _fetch_prompt_and_update_cache(
3537        self,
3538        name: str,
3539        *,
3540        version: Optional[int] = None,
3541        label: Optional[str] = None,
3542        ttl_seconds: Optional[int] = None,
3543        max_retries: int,
3544        fetch_timeout_seconds: Optional[int],
3545    ) -> PromptClient:
3546        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3547        langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...")
3548
3549        try:
3550
3551            @backoff.on_exception(
3552                backoff.constant, Exception, max_tries=max_retries + 1, logger=None
3553            )
3554            def fetch_prompts() -> Any:
3555                return self.api.prompts.get(
3556                    self._url_encode(name),
3557                    version=version,
3558                    label=label,
3559                    request_options={
3560                        "timeout_in_seconds": fetch_timeout_seconds,
3561                    }
3562                    if fetch_timeout_seconds is not None
3563                    else None,
3564                )
3565
3566            prompt_response = fetch_prompts()
3567
3568            prompt: PromptClient
3569            if prompt_response.type == "chat":
3570                prompt = ChatPromptClient(prompt_response)
3571            else:
3572                prompt = TextPromptClient(prompt_response)
3573
3574            if self._resources is not None:
3575                self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds)
3576
3577            return prompt
3578
3579        except NotFoundError as not_found_error:
3580            langfuse_logger.warning(
3581                f"Prompt '{cache_key}' not found during refresh, evicting from cache."
3582            )
3583            if self._resources is not None:
3584                self._resources.prompt_cache.delete(cache_key)
3585            raise not_found_error
3586
3587        except Exception as e:
3588            langfuse_logger.error(
3589                f"Error while fetching prompt '{cache_key}': {str(e)}"
3590            )
3591            raise e
3592
3593    def _get_bounded_max_retries(
3594        self,
3595        max_retries: Optional[int],
3596        *,
3597        default_max_retries: int = 2,
3598        max_retries_upper_bound: int = 4,
3599    ) -> int:
3600        if max_retries is None:
3601            return default_max_retries
3602
3603        bounded_max_retries = min(
3604            max(max_retries, 0),
3605            max_retries_upper_bound,
3606        )
3607
3608        return bounded_max_retries
3609
3610    @overload
3611    def create_prompt(
3612        self,
3613        *,
3614        name: str,
3615        prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]],
3616        labels: List[str] = [],
3617        tags: Optional[List[str]] = None,
3618        type: Optional[Literal["chat"]],
3619        config: Optional[Any] = None,
3620        commit_message: Optional[str] = None,
3621    ) -> ChatPromptClient: ...
3622
3623    @overload
3624    def create_prompt(
3625        self,
3626        *,
3627        name: str,
3628        prompt: str,
3629        labels: List[str] = [],
3630        tags: Optional[List[str]] = None,
3631        type: Optional[Literal["text"]] = "text",
3632        config: Optional[Any] = None,
3633        commit_message: Optional[str] = None,
3634    ) -> TextPromptClient: ...
3635
3636    def create_prompt(
3637        self,
3638        *,
3639        name: str,
3640        prompt: Union[
3641            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3642        ],
3643        labels: List[str] = [],
3644        tags: Optional[List[str]] = None,
3645        type: Optional[Literal["chat", "text"]] = "text",
3646        config: Optional[Any] = None,
3647        commit_message: Optional[str] = None,
3648    ) -> PromptClient:
3649        """Create a new prompt in Langfuse.
3650
3651        Keyword Args:
3652            name : The name of the prompt to be created.
3653            prompt : The content of the prompt to be created.
3654            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3655            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3656            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3657            config: Additional structured data to be saved with the prompt. Defaults to None.
3658            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3659            commit_message: Optional string describing the change.
3660
3661        Returns:
3662            TextPromptClient: The prompt if type argument is 'text'.
3663            ChatPromptClient: The prompt if type argument is 'chat'.
3664        """
3665        try:
3666            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3667
3668            if type == "chat":
3669                if not isinstance(prompt, list):
3670                    raise ValueError(
3671                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3672                    )
3673                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
3674                    CreateChatPromptRequest(
3675                        name=name,
3676                        prompt=cast(Any, prompt),
3677                        labels=labels,
3678                        tags=tags,
3679                        config=config or {},
3680                        commit_message=commit_message,
3681                        type=CreateChatPromptType.CHAT,
3682                    )
3683                )
3684                server_prompt = self.api.prompts.create(request=request)
3685
3686                if self._resources is not None:
3687                    self._resources.prompt_cache.invalidate(name)
3688
3689                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3690
3691            if not isinstance(prompt, str):
3692                raise ValueError("For 'text' type, 'prompt' must be a string.")
3693
3694            request = CreateTextPromptRequest(
3695                name=name,
3696                prompt=prompt,
3697                labels=labels,
3698                tags=tags,
3699                config=config or {},
3700                commit_message=commit_message,
3701            )
3702
3703            server_prompt = self.api.prompts.create(request=request)
3704
3705            if self._resources is not None:
3706                self._resources.prompt_cache.invalidate(name)
3707
3708            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3709
3710        except Error as e:
3711            handle_fern_exception(e)
3712            raise e
3713
3714    def update_prompt(
3715        self,
3716        *,
3717        name: str,
3718        version: int,
3719        new_labels: List[str] = [],
3720    ) -> Any:
3721        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name.
3722
3723        Args:
3724            name (str): The name of the prompt to update.
3725            version (int): The version number of the prompt to update.
3726            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3727
3728        Returns:
3729            Prompt: The updated prompt from the Langfuse API.
3730
3731        """
3732        updated_prompt = self.api.prompt_version.update(
3733            name=self._url_encode(name),
3734            version=version,
3735            new_labels=new_labels,
3736        )
3737
3738        if self._resources is not None:
3739            self._resources.prompt_cache.invalidate(name)
3740
3741        return updated_prompt
3742
3743    def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str:
3744        # httpx ≥ 0.28 does its own WHATWG-compliant quoting (eg. encodes bare
3745        # “%”, “?”, “#”, “|”, … in query/path parts).  Re-quoting here would
3746        # double-encode, so we skip when the value is about to be sent straight
3747        # to httpx (`is_url_param=True`) and the installed version is ≥ 0.28.
3748        if is_url_param and Version(httpx.__version__) >= Version("0.28.0"):
3749            return url
3750
3751        # urllib.parse.quote does not escape slashes "/" by default; we need to add safe="" to force escaping
3752        # we need add safe="" to force escaping of slashes
3753        # This is necessary for prompts in prompt folders
3754        return urllib.parse.quote(url, safe="")
3755
3756    def clear_prompt_cache(self) -> None:
3757        """Clear the entire prompt cache, removing all cached prompts.
3758
3759        This method is useful when you want to force a complete refresh of all
3760        cached prompts, for example after major updates or when you need to
3761        ensure the latest versions are fetched from the server.
3762        """
3763        if self._resources is not None:
3764            self._resources.prompt_cache.clear()

Main client for Langfuse tracing and platform features.

This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.

The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.

Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.

Attributes:
  • api: Synchronous API client for Langfuse backend communication
  • async_api: Asynchronous API client for Langfuse backend communication
  • _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
  • public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
  • secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
  • base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
  • host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
  • timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
  • httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
  • debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
  • tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
  • flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
  • flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
  • environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
  • release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
  • media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
  • sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
  • mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
  • blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use should_export_span instead. Equivalent behavior:

    from langfuse.span_filter import is_default_export_span
    blocked = {"sqlite", "requests"}
    
    should_export_span = lambda span: (
        is_default_export_span(span)
        and (
            span.instrumentation_scope is None
            or span.instrumentation_scope.name not in blocked
        )
    )
    
  • should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with gen_ai.* attributes, and known LLM instrumentation scopes).

  • additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If span_exporter is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
  • tracer_provider(Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
  • span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire base_url, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include x-langfuse-ingestion-version=4 on the exporter to enable real time processing of exported spans.
Example:
from langfuse import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    base_url="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_observation(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
Langfuse( *, public_key: Optional[str] = None, secret_key: Optional[str] = None, base_url: Optional[str] = None, host: Optional[str] = None, timeout: Optional[int] = None, httpx_client: Optional[httpx.Client] = None, debug: bool = False, tracing_enabled: Optional[bool] = True, flush_at: Optional[int] = None, flush_interval: Optional[float] = None, environment: Optional[str] = None, release: Optional[str] = None, media_upload_thread_count: Optional[int] = None, sample_rate: Optional[float] = None, mask: Optional[langfuse.types.MaskFunction] = None, blocked_instrumentation_scopes: Optional[List[str]] = None, should_export_span: Optional[Callable[[opentelemetry.sdk.trace.ReadableSpan], bool]] = None, additional_headers: Optional[Dict[str, str]] = None, tracer_provider: Optional[opentelemetry.sdk.trace.TracerProvider] = None, span_exporter: Optional[opentelemetry.sdk.trace.export.SpanExporter] = None)
235    def __init__(
236        self,
237        *,
238        public_key: Optional[str] = None,
239        secret_key: Optional[str] = None,
240        base_url: Optional[str] = None,
241        host: Optional[str] = None,
242        timeout: Optional[int] = None,
243        httpx_client: Optional[httpx.Client] = None,
244        debug: bool = False,
245        tracing_enabled: Optional[bool] = True,
246        flush_at: Optional[int] = None,
247        flush_interval: Optional[float] = None,
248        environment: Optional[str] = None,
249        release: Optional[str] = None,
250        media_upload_thread_count: Optional[int] = None,
251        sample_rate: Optional[float] = None,
252        mask: Optional[MaskFunction] = None,
253        blocked_instrumentation_scopes: Optional[List[str]] = None,
254        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
255        additional_headers: Optional[Dict[str, str]] = None,
256        tracer_provider: Optional[TracerProvider] = None,
257        span_exporter: Optional[SpanExporter] = None,
258    ):
259        self._base_url = (
260            base_url
261            or os.environ.get(LANGFUSE_BASE_URL)
262            or host
263            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
264        )
265        self._environment = environment or cast(
266            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
267        )
268        self._release = (
269            release
270            or os.environ.get(LANGFUSE_RELEASE, None)
271            or get_common_release_envs()
272        )
273        self._project_id: Optional[str] = None
274        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
275        if not 0.0 <= sample_rate <= 1.0:
276            raise ValueError(
277                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
278            )
279
280        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
281
282        self._tracing_enabled = (
283            tracing_enabled
284            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
285        )
286        if not self._tracing_enabled:
287            langfuse_logger.info(
288                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
289            )
290
291        debug = (
292            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
293        )
294        if debug:
295            logging.basicConfig(
296                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
297            )
298            langfuse_logger.setLevel(logging.DEBUG)
299
300        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
301        if public_key is None:
302            langfuse_logger.warning(
303                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
304                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
305            )
306            self._otel_tracer = otel_trace_api.NoOpTracer()
307            return
308
309        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
310        if secret_key is None:
311            langfuse_logger.warning(
312                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
313                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
314            )
315            self._otel_tracer = otel_trace_api.NoOpTracer()
316            return
317
318        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
319            langfuse_logger.warning(
320                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
321            )
322
323        if blocked_instrumentation_scopes is not None:
324            warnings.warn(
325                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
326                "Use `should_export_span` instead. Example: "
327                "from langfuse.span_filter import is_default_export_span; "
328                'blocked={"scope"}; should_export_span=lambda span: '
329                "is_default_export_span(span) and (span.instrumentation_scope is None or "
330                "span.instrumentation_scope.name not in blocked).",
331                DeprecationWarning,
332                stacklevel=2,
333            )
334
335        # Initialize api and tracer if requirements are met
336        self._resources = LangfuseResourceManager(
337            public_key=public_key,
338            secret_key=secret_key,
339            base_url=self._base_url,
340            timeout=timeout,
341            environment=self._environment,
342            release=release,
343            flush_at=flush_at,
344            flush_interval=flush_interval,
345            httpx_client=httpx_client,
346            media_upload_thread_count=media_upload_thread_count,
347            sample_rate=sample_rate,
348            mask=mask,
349            tracing_enabled=self._tracing_enabled,
350            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
351            should_export_span=should_export_span,
352            additional_headers=additional_headers,
353            tracer_provider=tracer_provider,
354            span_exporter=span_exporter,
355        )
356        self._mask = self._resources.mask
357
358        self._otel_tracer = (
359            self._resources.tracer
360            if self._tracing_enabled and self._resources.tracer is not None
361            else otel_trace_api.NoOpTracer()
362        )
363        self.api = self._resources.api
364        self.async_api = self._resources.async_api
api
async_api
def start_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> Union[LangfuseSpan, LangfuseGeneration, LangfuseAgent, LangfuseTool, LangfuseChain, LangfuseRetriever, LangfuseEvaluator, LangfuseEmbedding, LangfuseGuardrail]:
513    def start_observation(
514        self,
515        *,
516        trace_context: Optional[TraceContext] = None,
517        name: str,
518        as_type: ObservationTypeLiteralNoEvent = "span",
519        input: Optional[Any] = None,
520        output: Optional[Any] = None,
521        metadata: Optional[Any] = None,
522        version: Optional[str] = None,
523        level: Optional[SpanLevel] = None,
524        status_message: Optional[str] = None,
525        completion_start_time: Optional[datetime] = None,
526        model: Optional[str] = None,
527        model_parameters: Optional[Dict[str, MapValue]] = None,
528        usage_details: Optional[Dict[str, int]] = None,
529        cost_details: Optional[Dict[str, float]] = None,
530        prompt: Optional[PromptClient] = None,
531    ) -> Union[
532        LangfuseSpan,
533        LangfuseGeneration,
534        LangfuseAgent,
535        LangfuseTool,
536        LangfuseChain,
537        LangfuseRetriever,
538        LangfuseEvaluator,
539        LangfuseEmbedding,
540        LangfuseGuardrail,
541    ]:
542        """Create a new observation of the specified type.
543
544        This method creates a new observation but does not set it as the current span in the
545        context. To create and use an observation within a context, use start_as_current_observation().
546
547        Args:
548            trace_context: Optional context for connecting to an existing trace
549            name: Name of the observation
550            as_type: Type of observation to create (defaults to "span")
551            input: Input data for the operation
552            output: Output data from the operation
553            metadata: Additional metadata to associate with the observation
554            version: Version identifier for the code or component
555            level: Importance level of the observation
556            status_message: Optional status message for the observation
557            completion_start_time: When the model started generating (for generation types)
558            model: Name/identifier of the AI model used (for generation types)
559            model_parameters: Parameters used for the model (for generation types)
560            usage_details: Token usage information (for generation types)
561            cost_details: Cost information (for generation types)
562            prompt: Associated prompt template (for generation types)
563
564        Returns:
565            An observation object of the appropriate type that must be ended with .end()
566        """
567        if trace_context:
568            trace_id = trace_context.get("trace_id", None)
569            parent_span_id = trace_context.get("parent_span_id", None)
570
571            if trace_id:
572                remote_parent_span = self._create_remote_parent_span(
573                    trace_id=trace_id, parent_span_id=parent_span_id
574                )
575
576                with otel_trace_api.use_span(
577                    cast(otel_trace_api.Span, remote_parent_span)
578                ):
579                    otel_span = self._otel_tracer.start_span(name=name)
580                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
581
582                    return self._create_observation_from_otel_span(
583                        otel_span=otel_span,
584                        as_type=as_type,
585                        input=input,
586                        output=output,
587                        metadata=metadata,
588                        version=version,
589                        level=level,
590                        status_message=status_message,
591                        completion_start_time=completion_start_time,
592                        model=model,
593                        model_parameters=model_parameters,
594                        usage_details=usage_details,
595                        cost_details=cost_details,
596                        prompt=prompt,
597                    )
598
599        otel_span = self._otel_tracer.start_span(name=name)
600
601        return self._create_observation_from_otel_span(
602            otel_span=otel_span,
603            as_type=as_type,
604            input=input,
605            output=output,
606            metadata=metadata,
607            version=version,
608            level=level,
609            status_message=status_message,
610            completion_start_time=completion_start_time,
611            model=model,
612            model_parameters=model_parameters,
613            usage_details=usage_details,
614            cost_details=cost_details,
615            prompt=prompt,
616        )

Create a new observation of the specified type.

This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation
  • status_message: Optional status message for the observation
  • completion_start_time: When the model started generating (for generation types)
  • model: Name/identifier of the AI model used (for generation types)
  • model_parameters: Parameters used for the model (for generation types)
  • usage_details: Token usage information (for generation types)
  • cost_details: Cost information (for generation types)
  • prompt: Associated prompt template (for generation types)
Returns:

An observation object of the appropriate type that must be ended with .end()

def start_as_current_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, end_on_exit: Optional[bool] = None) -> Union[opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration], opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan], opentelemetry.util._decorator._AgnosticContextManager[LangfuseAgent], opentelemetry.util._decorator._AgnosticContextManager[LangfuseTool], opentelemetry.util._decorator._AgnosticContextManager[LangfuseChain], opentelemetry.util._decorator._AgnosticContextManager[LangfuseRetriever], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEvaluator], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEmbedding], opentelemetry.util._decorator._AgnosticContextManager[LangfuseGuardrail]]:
 846    def start_as_current_observation(
 847        self,
 848        *,
 849        trace_context: Optional[TraceContext] = None,
 850        name: str,
 851        as_type: ObservationTypeLiteralNoEvent = "span",
 852        input: Optional[Any] = None,
 853        output: Optional[Any] = None,
 854        metadata: Optional[Any] = None,
 855        version: Optional[str] = None,
 856        level: Optional[SpanLevel] = None,
 857        status_message: Optional[str] = None,
 858        completion_start_time: Optional[datetime] = None,
 859        model: Optional[str] = None,
 860        model_parameters: Optional[Dict[str, MapValue]] = None,
 861        usage_details: Optional[Dict[str, int]] = None,
 862        cost_details: Optional[Dict[str, float]] = None,
 863        prompt: Optional[PromptClient] = None,
 864        end_on_exit: Optional[bool] = None,
 865    ) -> Union[
 866        _AgnosticContextManager[LangfuseGeneration],
 867        _AgnosticContextManager[LangfuseSpan],
 868        _AgnosticContextManager[LangfuseAgent],
 869        _AgnosticContextManager[LangfuseTool],
 870        _AgnosticContextManager[LangfuseChain],
 871        _AgnosticContextManager[LangfuseRetriever],
 872        _AgnosticContextManager[LangfuseEvaluator],
 873        _AgnosticContextManager[LangfuseEmbedding],
 874        _AgnosticContextManager[LangfuseGuardrail],
 875    ]:
 876        """Create a new observation and set it as the current span in a context manager.
 877
 878        This method creates a new observation of the specified type and sets it as the
 879        current span within a context manager. Use this method with a 'with' statement to
 880        automatically handle the observation lifecycle within a code block.
 881
 882        The created observation will be the child of the current span in the context.
 883
 884        Args:
 885            trace_context: Optional context for connecting to an existing trace
 886            name: Name of the observation (e.g., function or operation name)
 887            as_type: Type of observation to create (defaults to "span")
 888            input: Input data for the operation (can be any JSON-serializable object)
 889            output: Output data from the operation (can be any JSON-serializable object)
 890            metadata: Additional metadata to associate with the observation
 891            version: Version identifier for the code or component
 892            level: Importance level of the observation (info, warning, error)
 893            status_message: Optional status message for the observation
 894            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 895
 896            The following parameters are available when as_type is: "generation" or "embedding".
 897            completion_start_time: When the model started generating the response
 898            model: Name/identifier of the AI model used (e.g., "gpt-4")
 899            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 900            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 901            cost_details: Cost information for the model call
 902            prompt: Associated prompt template from Langfuse prompt management
 903
 904        Returns:
 905            A context manager that yields the appropriate observation type based on as_type
 906
 907        Example:
 908            ```python
 909            # Create a span
 910            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
 911                # Do work
 912                result = process_data()
 913                span.update(output=result)
 914
 915                # Create a child span automatically
 916                with span.start_as_current_observation(name="sub-operation") as child_span:
 917                    # Do sub-operation work
 918                    child_span.update(output="sub-result")
 919
 920            # Create a tool observation
 921            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
 922                # Do tool work
 923                results = search_web(query)
 924                tool.update(output=results)
 925
 926            # Create a generation observation
 927            with langfuse.start_as_current_observation(
 928                name="answer-generation",
 929                as_type="generation",
 930                model="gpt-4"
 931            ) as generation:
 932                # Generate answer
 933                response = llm.generate(...)
 934                generation.update(output=response)
 935            ```
 936        """
 937        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 938            if trace_context:
 939                trace_id = trace_context.get("trace_id", None)
 940                parent_span_id = trace_context.get("parent_span_id", None)
 941
 942                if trace_id:
 943                    remote_parent_span = self._create_remote_parent_span(
 944                        trace_id=trace_id, parent_span_id=parent_span_id
 945                    )
 946
 947                    return cast(
 948                        Union[
 949                            _AgnosticContextManager[LangfuseGeneration],
 950                            _AgnosticContextManager[LangfuseEmbedding],
 951                        ],
 952                        self._create_span_with_parent_context(
 953                            as_type=as_type,
 954                            name=name,
 955                            remote_parent_span=remote_parent_span,
 956                            parent=None,
 957                            end_on_exit=end_on_exit,
 958                            input=input,
 959                            output=output,
 960                            metadata=metadata,
 961                            version=version,
 962                            level=level,
 963                            status_message=status_message,
 964                            completion_start_time=completion_start_time,
 965                            model=model,
 966                            model_parameters=model_parameters,
 967                            usage_details=usage_details,
 968                            cost_details=cost_details,
 969                            prompt=prompt,
 970                        ),
 971                    )
 972
 973            return cast(
 974                Union[
 975                    _AgnosticContextManager[LangfuseGeneration],
 976                    _AgnosticContextManager[LangfuseEmbedding],
 977                ],
 978                self._start_as_current_otel_span_with_processed_media(
 979                    as_type=as_type,
 980                    name=name,
 981                    end_on_exit=end_on_exit,
 982                    input=input,
 983                    output=output,
 984                    metadata=metadata,
 985                    version=version,
 986                    level=level,
 987                    status_message=status_message,
 988                    completion_start_time=completion_start_time,
 989                    model=model,
 990                    model_parameters=model_parameters,
 991                    usage_details=usage_details,
 992                    cost_details=cost_details,
 993                    prompt=prompt,
 994                ),
 995            )
 996
 997        if as_type in get_observation_types_list(ObservationTypeSpanLike):
 998            if trace_context:
 999                trace_id = trace_context.get("trace_id", None)
1000                parent_span_id = trace_context.get("parent_span_id", None)
1001
1002                if trace_id:
1003                    remote_parent_span = self._create_remote_parent_span(
1004                        trace_id=trace_id, parent_span_id=parent_span_id
1005                    )
1006
1007                    return cast(
1008                        Union[
1009                            _AgnosticContextManager[LangfuseSpan],
1010                            _AgnosticContextManager[LangfuseAgent],
1011                            _AgnosticContextManager[LangfuseTool],
1012                            _AgnosticContextManager[LangfuseChain],
1013                            _AgnosticContextManager[LangfuseRetriever],
1014                            _AgnosticContextManager[LangfuseEvaluator],
1015                            _AgnosticContextManager[LangfuseGuardrail],
1016                        ],
1017                        self._create_span_with_parent_context(
1018                            as_type=as_type,
1019                            name=name,
1020                            remote_parent_span=remote_parent_span,
1021                            parent=None,
1022                            end_on_exit=end_on_exit,
1023                            input=input,
1024                            output=output,
1025                            metadata=metadata,
1026                            version=version,
1027                            level=level,
1028                            status_message=status_message,
1029                        ),
1030                    )
1031
1032            return cast(
1033                Union[
1034                    _AgnosticContextManager[LangfuseSpan],
1035                    _AgnosticContextManager[LangfuseAgent],
1036                    _AgnosticContextManager[LangfuseTool],
1037                    _AgnosticContextManager[LangfuseChain],
1038                    _AgnosticContextManager[LangfuseRetriever],
1039                    _AgnosticContextManager[LangfuseEvaluator],
1040                    _AgnosticContextManager[LangfuseGuardrail],
1041                ],
1042                self._start_as_current_otel_span_with_processed_media(
1043                    as_type=as_type,
1044                    name=name,
1045                    end_on_exit=end_on_exit,
1046                    input=input,
1047                    output=output,
1048                    metadata=metadata,
1049                    version=version,
1050                    level=level,
1051                    status_message=status_message,
1052                ),
1053            )
1054
1055        # This should never be reached since all valid types are handled above
1056        langfuse_logger.warning(
1057            f"Unknown observation type: {as_type}, falling back to span"
1058        )
1059        return self._start_as_current_otel_span_with_processed_media(
1060            as_type="span",
1061            name=name,
1062            end_on_exit=end_on_exit,
1063            input=input,
1064            output=output,
1065            metadata=metadata,
1066            version=version,
1067            level=level,
1068            status_message=status_message,
1069        )

Create a new observation and set it as the current span in a context manager.

This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.

The created observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation (e.g., function or operation name)
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation (one of "DEBUG", "DEFAULT", "WARNING", "ERROR")
  • status_message: Optional status message for the observation
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
The following parameters apply only when as_type is "generation" or "embedding":
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A context manager that yields the appropriate observation type based on as_type

Example:
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_observation(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
def update_current_generation( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> None:
1246    def update_current_generation(
1247        self,
1248        *,
1249        name: Optional[str] = None,
1250        input: Optional[Any] = None,
1251        output: Optional[Any] = None,
1252        metadata: Optional[Any] = None,
1253        version: Optional[str] = None,
1254        level: Optional[SpanLevel] = None,
1255        status_message: Optional[str] = None,
1256        completion_start_time: Optional[datetime] = None,
1257        model: Optional[str] = None,
1258        model_parameters: Optional[Dict[str, MapValue]] = None,
1259        usage_details: Optional[Dict[str, int]] = None,
1260        cost_details: Optional[Dict[str, float]] = None,
1261        prompt: Optional[PromptClient] = None,
1262    ) -> None:
1263        """Update the current active generation span with new information.
1264
1265        This method updates the current generation span in the active context with
1266        additional information. It's useful for adding output, usage stats, or other
1267        details that become available during or after model generation.
1268
1269        Args:
1270            name: The generation name
1271            input: Updated input data for the model
1272            output: Output from the model (e.g., completions)
1273            metadata: Additional metadata to associate with the generation
1274            version: Version identifier for the model or component
1275            level: Importance level of the generation (info, warning, error)
1276            status_message: Optional status message for the generation
1277            completion_start_time: When the model started generating the response
1278            model: Name/identifier of the AI model used (e.g., "gpt-4")
1279            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1280            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1281            cost_details: Cost information for the model call
1282            prompt: Associated prompt template from Langfuse prompt management
1283
1284        Example:
1285            ```python
1286            with langfuse.start_as_current_generation(name="answer-query") as generation:
1287                # Initial setup and API call
1288                response = llm.generate(...)
1289
1290                # Update with results that weren't available at creation time
1291                langfuse.update_current_generation(
1292                    output=response.text,
1293                    usage_details={
1294                        "prompt_tokens": response.usage.prompt_tokens,
1295                        "completion_tokens": response.usage.completion_tokens
1296                    }
1297                )
1298            ```
1299        """
1300        if not self._tracing_enabled:
1301            langfuse_logger.debug(
1302                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1303            )
1304            return
1305
1306        current_otel_span = self._get_current_otel_span()
1307
1308        if current_otel_span is not None:
1309            generation = LangfuseGeneration(
1310                otel_span=current_otel_span, langfuse_client=self
1311            )
1312
1313            if name:
1314                current_otel_span.update_name(name)
1315
1316            generation.update(
1317                input=input,
1318                output=output,
1319                metadata=metadata,
1320                version=version,
1321                level=level,
1322                status_message=status_message,
1323                completion_start_time=completion_start_time,
1324                model=model,
1325                model_parameters=model_parameters,
1326                usage_details=usage_details,
1327                cost_details=cost_details,
1328                prompt=prompt,
1329            )

Update the current active generation span with new information.

This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.

Arguments:
  • name: The generation name
  • input: Updated input data for the model
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (one of "DEBUG", "DEFAULT", "WARNING", "ERROR")
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Example:
with langfuse.start_as_current_observation(name="answer-query", as_type="generation") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
def update_current_span( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> None:
1331    def update_current_span(
1332        self,
1333        *,
1334        name: Optional[str] = None,
1335        input: Optional[Any] = None,
1336        output: Optional[Any] = None,
1337        metadata: Optional[Any] = None,
1338        version: Optional[str] = None,
1339        level: Optional[SpanLevel] = None,
1340        status_message: Optional[str] = None,
1341    ) -> None:
1342        """Update the current active span with new information.
1343
1344        This method updates the current span in the active context with
1345        additional information. It's useful for adding outputs or metadata
1346        that become available during execution.
1347
1348        Args:
1349            name: The span name
1350            input: Updated input data for the operation
1351            output: Output data from the operation
1352            metadata: Additional metadata to associate with the span
1353            version: Version identifier for the code or component
1354            level: Importance level of the span (info, warning, error)
1355            status_message: Optional status message for the span
1356
1357        Example:
1358            ```python
1359            with langfuse.start_as_current_observation(name="process-data") as span:
1360                # Initial processing
1361                result = process_first_part()
1362
1363                # Update with intermediate results
1364                langfuse.update_current_span(metadata={"intermediate_result": result})
1365
1366                # Continue processing
1367                final_result = process_second_part(result)
1368
1369                # Final update
1370                langfuse.update_current_span(output=final_result)
1371            ```
1372        """
1373        if not self._tracing_enabled:
1374            langfuse_logger.debug(
1375                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1376            )
1377            return
1378
1379        current_otel_span = self._get_current_otel_span()
1380
1381        if current_otel_span is not None:
1382            span = LangfuseSpan(
1383                otel_span=current_otel_span,
1384                langfuse_client=self,
1385                environment=self._environment,
1386                release=self._release,
1387            )
1388
1389            if name:
1390                current_otel_span.update_name(name)
1391
1392            span.update(
1393                input=input,
1394                output=output,
1395                metadata=metadata,
1396                version=version,
1397                level=level,
1398                status_message=status_message,
1399            )

Update the current active span with new information.

This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.

Arguments:
  • name: The span name
  • input: Updated input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (one of "DEBUG", "DEFAULT", "WARNING", "ERROR")
  • status_message: Optional status message for the span
Example:
with langfuse.start_as_current_observation(name="process-data") as span:
    # Initial processing
    result = process_first_part()

    # Update with intermediate results
    langfuse.update_current_span(metadata={"intermediate_result": result})

    # Continue processing
    final_result = process_second_part(result)

    # Final update
    langfuse.update_current_span(output=final_result)
@deprecated('Trace-level input/output is deprecated. For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. This method will be removed in a future major version.')
def set_current_trace_io( self, *, input: Optional[Any] = None, output: Optional[Any] = None) -> None:
1401    @deprecated(
1402        "Trace-level input/output is deprecated. "
1403        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
1404        "This method will be removed in a future major version."
1405    )
1406    def set_current_trace_io(
1407        self,
1408        *,
1409        input: Optional[Any] = None,
1410        output: Optional[Any] = None,
1411    ) -> None:
1412        """Set trace-level input and output for the current span's trace.
1413
1414        .. deprecated::
1415            This is a legacy method for backward compatibility with Langfuse platform
1416            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
1417            evaluators). It will be removed in a future major version.
1418
1419            For setting other trace attributes (user_id, session_id, metadata, tags, version),
1420            use :meth:`propagate_attributes` instead.
1421
1422        Args:
1423            input: Input data to associate with the trace.
1424            output: Output data to associate with the trace.
1425        """
1426        if not self._tracing_enabled:
1427            langfuse_logger.debug(
1428                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
1429            )
1430            return
1431
1432        current_otel_span = self._get_current_otel_span()
1433
1434        if current_otel_span is not None and current_otel_span.is_recording():
1435            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1436                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1437            )
1438            # We need to preserve the class to keep the correct observation type
1439            span_class = self._get_span_class(existing_observation_type)
1440            span = span_class(
1441                otel_span=current_otel_span,
1442                langfuse_client=self,
1443                environment=self._environment,
1444                release=self._release,
1445            )
1446
1447            span.set_trace_io(
1448                input=input,
1449                output=output,
1450            )

Set trace-level input and output for the current span's trace.

Deprecated: This is a legacy method for backward compatibility with Langfuse platform features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future major version.

For setting other trace attributes (user_id, session_id, metadata, tags, version), use propagate_attributes() instead.

Arguments:
  • input: Input data to associate with the trace.
  • output: Output data to associate with the trace.
def set_current_trace_as_public(self) -> None:
1452    def set_current_trace_as_public(self) -> None:
1453        """Make the current trace publicly accessible via its URL.
1454
1455        When a trace is published, anyone with the trace link can view the full trace
1456        without needing to be logged in to Langfuse. This action cannot be undone
1457        programmatically - once published, the entire trace becomes public.
1458
1459        This is a convenience method that publishes the trace from the currently
1460        active span context. Use this when you want to make a trace public from
1461        within a traced function without needing direct access to the span object.
1462        """
1463        if not self._tracing_enabled:
1464            langfuse_logger.debug(
1465                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
1466            )
1467            return
1468
1469        current_otel_span = self._get_current_otel_span()
1470
1471        if current_otel_span is not None and current_otel_span.is_recording():
1472            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1473                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1474            )
1475            # We need to preserve the class to keep the correct observation type
1476            span_class = self._get_span_class(existing_observation_type)
1477            span = span_class(
1478                otel_span=current_otel_span,
1479                langfuse_client=self,
1480                environment=self._environment,
1481            )
1482
1483            span.set_trace_as_public()

Make the current trace publicly accessible via its URL.

When a trace is published, anyone with the trace link can view the full trace without needing to be logged in to Langfuse. This action cannot be undone programmatically - once published, the entire trace becomes public.

This is a convenience method that publishes the trace from the currently active span context. Use this when you want to make a trace public from within a traced function without needing direct access to the span object.

def create_event( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseEvent:
1485    def create_event(
1486        self,
1487        *,
1488        trace_context: Optional[TraceContext] = None,
1489        name: str,
1490        input: Optional[Any] = None,
1491        output: Optional[Any] = None,
1492        metadata: Optional[Any] = None,
1493        version: Optional[str] = None,
1494        level: Optional[SpanLevel] = None,
1495        status_message: Optional[str] = None,
1496    ) -> LangfuseEvent:
1497        """Create a new Langfuse observation of type 'EVENT'.
1498
1499        The created Langfuse Event observation will be the child of the current span in the context.
1500
1501        Args:
1502            trace_context: Optional context for connecting to an existing trace
1503            name: Name of the span (e.g., function or operation name)
1504            input: Input data for the operation (can be any JSON-serializable object)
1505            output: Output data from the operation (can be any JSON-serializable object)
1506            metadata: Additional metadata to associate with the span
1507            version: Version identifier for the code or component
1508            level: Importance level of the span (info, warning, error)
1509            status_message: Optional status message for the span
1510
1511        Returns:
1512            The Langfuse Event object
1513
1514        Example:
1515            ```python
1516            event = langfuse.create_event(name="process-event")
1517            ```
1518        """
1519        timestamp = time_ns()
1520
1521        if trace_context:
1522            trace_id = trace_context.get("trace_id", None)
1523            parent_span_id = trace_context.get("parent_span_id", None)
1524
1525            if trace_id:
1526                remote_parent_span = self._create_remote_parent_span(
1527                    trace_id=trace_id, parent_span_id=parent_span_id
1528                )
1529
1530                with otel_trace_api.use_span(
1531                    cast(otel_trace_api.Span, remote_parent_span)
1532                ):
1533                    otel_span = self._otel_tracer.start_span(
1534                        name=name, start_time=timestamp
1535                    )
1536                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1537
1538                    return cast(
1539                        LangfuseEvent,
1540                        LangfuseEvent(
1541                            otel_span=otel_span,
1542                            langfuse_client=self,
1543                            environment=self._environment,
1544                            release=self._release,
1545                            input=input,
1546                            output=output,
1547                            metadata=metadata,
1548                            version=version,
1549                            level=level,
1550                            status_message=status_message,
1551                        ).end(end_time=timestamp),
1552                    )
1553
1554        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1555
1556        return cast(
1557            LangfuseEvent,
1558            LangfuseEvent(
1559                otel_span=otel_span,
1560                langfuse_client=self,
1561                environment=self._environment,
1562                release=self._release,
1563                input=input,
1564                output=output,
1565                metadata=metadata,
1566                version=version,
1567                level=level,
1568                status_message=status_message,
1569            ).end(end_time=timestamp),
1570        )

Create a new Langfuse observation of type 'EVENT'.

The created Langfuse Event observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the span
Returns:

The Langfuse Event object

Example:
event = langfuse.create_event(name="process-event")
@staticmethod
def create_trace_id(*, seed: Optional[str] = None) -> str:
1659    @staticmethod
1660    def create_trace_id(*, seed: Optional[str] = None) -> str:
1661        """Create a unique trace ID for use with Langfuse.
1662
1663        This method generates a unique trace ID for use with various Langfuse APIs.
1664        It can either generate a random ID or create a deterministic ID based on
1665        a seed string.
1666
1667        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1668        This method ensures the generated ID meets this requirement. If you need to
1669        correlate an external ID with a Langfuse trace ID, use the external ID as the
1670        seed to get a valid, deterministic Langfuse trace ID.
1671
1672        Args:
1673            seed: Optional string to use as a seed for deterministic ID generation.
1674                 If provided, the same seed will always produce the same ID.
1675                 If not provided, a random ID will be generated.
1676
1677        Returns:
1678            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1679
1680        Example:
1681            ```python
1682            # Generate a random trace ID
1683            trace_id = langfuse.create_trace_id()
1684
1685            # Generate a deterministic ID based on a seed
1686            session_trace_id = langfuse.create_trace_id(seed="session-456")
1687
1688            # Correlate an external ID with a Langfuse trace ID
1689            external_id = "external-system-123456"
1690            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1691
1692            # Use the ID with trace context
1693            with langfuse.start_as_current_observation(
1694                name="process-request",
1695                trace_context={"trace_id": trace_id}
1696            ) as span:
1697                # Operation will be part of the specific trace
1698                pass
1699            ```
1700        """
1701        if not seed:
1702            trace_id_int = RandomIdGenerator().generate_trace_id()
1703
1704            return Langfuse._format_otel_trace_id(trace_id_int)
1705
1706        return sha256(seed.encode("utf-8")).digest()[:16].hex()

Create a unique trace ID for use with Langfuse.

This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.

Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.

Arguments:
  • seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:

A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

Example:
# Generate a random trace ID
trace_id = langfuse.create_trace_id()

# Generate a deterministic ID based on a seed
session_trace_id = langfuse.create_trace_id(seed="session-456")

# Correlate an external ID with a Langfuse trace ID
external_id = "external-system-123456"
correlated_trace_id = langfuse.create_trace_id(seed=external_id)

# Use the ID with trace context
with langfuse.start_as_current_observation(
    name="process-request",
    trace_context={"trace_id": trace_id}
) as span:
    # Operation will be part of the specific trace
    pass
def create_score( self, *, name: str, value: Union[float, str], session_id: Optional[str] = None, dataset_run_id: Optional[str] = None, trace_id: Optional[str] = None, observation_id: Optional[str] = None, score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None, timestamp: Optional[datetime.datetime] = None) -> None:
1784    def create_score(
1785        self,
1786        *,
1787        name: str,
1788        value: Union[float, str],
1789        session_id: Optional[str] = None,
1790        dataset_run_id: Optional[str] = None,
1791        trace_id: Optional[str] = None,
1792        observation_id: Optional[str] = None,
1793        score_id: Optional[str] = None,
1794        data_type: Optional[ScoreDataType] = None,
1795        comment: Optional[str] = None,
1796        config_id: Optional[str] = None,
1797        metadata: Optional[Any] = None,
1798        timestamp: Optional[datetime] = None,
1799    ) -> None:
1800        """Create a score for a specific trace or observation.
1801
1802        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
1803        used to track quality metrics, user feedback, or automated evaluations.
1804
1805        Args:
1806            name: Name of the score (e.g., "relevance", "accuracy")
1807            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
1808            session_id: ID of the Langfuse session to associate the score with
1809            dataset_run_id: ID of the Langfuse dataset run to associate the score with
1810            trace_id: ID of the Langfuse trace to associate the score with
1811            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
1812            score_id: Optional custom ID for the score (auto-generated if not provided)
1813            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
1814            comment: Optional comment or explanation for the score
1815            config_id: Optional ID of a score config defined in Langfuse
1816            metadata: Optional metadata to be attached to the score
1817            timestamp: Optional timestamp for the score (defaults to current UTC time)
1818
1819        Example:
1820            ```python
1821            # Create a numeric score for accuracy
1822            langfuse.create_score(
1823                name="accuracy",
1824                value=0.92,
1825                trace_id="abcdef1234567890abcdef1234567890",
1826                data_type="NUMERIC",
1827                comment="High accuracy with minor irrelevant details"
1828            )
1829
1830            # Create a categorical score for sentiment
1831            langfuse.create_score(
1832                name="sentiment",
1833                value="positive",
1834                trace_id="abcdef1234567890abcdef1234567890",
1835                observation_id="abcdef1234567890",
1836                data_type="CATEGORICAL"
1837            )
1838            ```
1839        """
1840        if not self._tracing_enabled:
1841            return
1842
1843        score_id = score_id or self._create_observation_id()
1844
1845        try:
1846            new_body = ScoreBody(
1847                id=score_id,
1848                sessionId=session_id,
1849                datasetRunId=dataset_run_id,
1850                traceId=trace_id,
1851                observationId=observation_id,
1852                name=name,
1853                value=value,
1854                dataType=data_type,  # type: ignore
1855                comment=comment,
1856                configId=config_id,
1857                environment=self._environment,
1858                metadata=metadata,
1859            )
1860
1861            event = {
1862                "id": self.create_trace_id(),
1863                "type": "score-create",
1864                "timestamp": timestamp or _get_timestamp(),
1865                "body": new_body,
1866            }
1867
1868            if self._resources is not None:
1869                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
1870                force_sample = (
1871                    not self._is_valid_trace_id(trace_id) if trace_id else True
1872                )
1873
1874                self._resources.add_score_task(
1875                    event,
1876                    force_sample=force_sample,
1877                )
1878
1879        except Exception as e:
1880            langfuse_logger.exception(
1881                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
1882            )

Create a score for a specific trace or observation.

This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
  • session_id: ID of the Langfuse session to associate the score with
  • dataset_run_id: ID of the Langfuse dataset run to associate the score with
  • trace_id: ID of the Langfuse trace to associate the score with
  • observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
  • timestamp: Optional timestamp for the score (defaults to current UTC time)
Example:
# Create a numeric score for accuracy
langfuse.create_score(
    name="accuracy",
    value=0.92,
    trace_id="abcdef1234567890abcdef1234567890",
    data_type="NUMERIC",
    comment="High accuracy with minor irrelevant details"
)

# Create a categorical score for sentiment
langfuse.create_score(
    name="sentiment",
    value="positive",
    trace_id="abcdef1234567890abcdef1234567890",
    observation_id="abcdef1234567890",
    data_type="CATEGORICAL"
)
def score_current_span( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
1943    def score_current_span(
1944        self,
1945        *,
1946        name: str,
1947        value: Union[float, str],
1948        score_id: Optional[str] = None,
1949        data_type: Optional[ScoreDataType] = None,
1950        comment: Optional[str] = None,
1951        config_id: Optional[str] = None,
1952        metadata: Optional[Any] = None,
1953    ) -> None:
1954        """Create a score for the current active span.
1955
1956        This method scores the currently active span in the context. It's a convenient
1957        way to score the current operation without needing to know its trace and span IDs.
1958
1959        Args:
1960            name: Name of the score (e.g., "relevance", "accuracy")
1961            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
1962            score_id: Optional custom ID for the score (auto-generated if not provided)
1963            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
1964            comment: Optional comment or explanation for the score
1965            config_id: Optional ID of a score config defined in Langfuse
1966            metadata: Optional metadata to be attached to the score
1967
1968        Example:
1969            ```python
1970            with langfuse.start_as_current_generation(name="answer-query") as generation:
1971                # Generate answer
1972                response = generate_answer(...)
1973                generation.update(output=response)
1974
1975                # Score the generation
1976                langfuse.score_current_span(
1977                    name="relevance",
1978                    value=0.85,
1979                    data_type="NUMERIC",
1980                    comment="Mostly relevant but contains some tangential information",
1981                    metadata={"model": "gpt-4", "prompt_version": "v2"}
1982                )
1983            ```
1984        """
1985        current_span = self._get_current_otel_span()
1986
1987        if current_span is not None:
1988            trace_id = self._get_otel_trace_id(current_span)
1989            observation_id = self._get_otel_span_id(current_span)
1990
1991            langfuse_logger.info(
1992                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
1993            )
1994
1995            self.create_score(
1996                trace_id=trace_id,
1997                observation_id=observation_id,
1998                name=name,
1999                value=cast(str, value),
2000                score_id=score_id,
2001                data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
2002                comment=comment,
2003                config_id=config_id,
2004                metadata=metadata,
2005            )

Create a score for the current active span.

This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Generate answer
    response = generate_answer(...)
    generation.update(output=response)

    # Score the generation
    langfuse.score_current_span(
        name="relevance",
        value=0.85,
        data_type="NUMERIC",
        comment="Mostly relevant but contains some tangential information",
        metadata={"model": "gpt-4", "prompt_version": "v2"}
    )
def score_current_trace( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN', 'TEXT']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
2033    def score_current_trace(
2034        self,
2035        *,
2036        name: str,
2037        value: Union[float, str],
2038        score_id: Optional[str] = None,
2039        data_type: Optional[ScoreDataType] = None,
2040        comment: Optional[str] = None,
2041        config_id: Optional[str] = None,
2042        metadata: Optional[Any] = None,
2043    ) -> None:
2044        """Create a score for the current trace.
2045
2046        This method scores the trace of the currently active span. Unlike score_current_span,
2047        this method associates the score with the entire trace rather than a specific span.
2048        It's useful for scoring overall performance or quality of the entire operation.
2049
2050        Args:
2051            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2052            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
2053            score_id: Optional custom ID for the score (auto-generated if not provided)
2054            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
2055            comment: Optional comment or explanation for the score
2056            config_id: Optional ID of a score config defined in Langfuse
2057            metadata: Optional metadata to be attached to the score
2058
2059        Example:
2060            ```python
2061            with langfuse.start_as_current_observation(name="process-user-request") as span:
2062                # Process request
2063                result = process_complete_request()
2064                span.update(output=result)
2065
2066                # Score the overall trace
2067                langfuse.score_current_trace(
2068                    name="overall_quality",
2069                    value=0.95,
2070                    data_type="NUMERIC",
2071                    comment="High quality end-to-end response",
2072                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2073                )
2074            ```
2075        """
2076        current_span = self._get_current_otel_span()
2077
2078        if current_span is not None:
2079            trace_id = self._get_otel_trace_id(current_span)
2080
2081            langfuse_logger.info(
2082                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2083            )
2084
2085            self.create_score(
2086                trace_id=trace_id,
2087                name=name,
2088                value=cast(str, value),
2089                score_id=score_id,
2090                data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
2091                comment=comment,
2092                config_id=config_id,
2093                metadata=metadata,
2094            )

Create a score for the current trace.

This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation.

Arguments:
  • name: Name of the score (e.g., "user_satisfaction", "overall_quality")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_observation(name="process-user-request") as span:
    # Process request
    result = process_complete_request()
    span.update(output=result)

    # Score the overall trace
    langfuse.score_current_trace(
        name="overall_quality",
        value=0.95,
        data_type="NUMERIC",
        comment="High quality end-to-end response",
        metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
    )
def flush(self) -> None:
2096    def flush(self) -> None:
2097        """Force flush all pending spans and events to the Langfuse API.
2098
2099        This method manually flushes any pending spans, scores, and other events to the
2100        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2101        before proceeding, without waiting for the automatic flush interval.
2102
2103        Example:
2104            ```python
2105            # Record some spans and scores
2106            with langfuse.start_as_current_observation(name="operation") as span:
2107                # Do work...
2108                pass
2109
2110            # Ensure all data is sent to Langfuse before proceeding
2111            langfuse.flush()
2112
2113            # Continue with other work
2114            ```
2115        """
2116        if self._resources is not None:
2117            self._resources.flush()

Force flush all pending spans and events to the Langfuse API.

This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.

Example:
# Record some spans and scores
with langfuse.start_as_current_observation(name="operation") as span:
    # Do work...
    pass

# Ensure all data is sent to Langfuse before proceeding
langfuse.flush()

# Continue with other work
def shutdown(self) -> None:
2119    def shutdown(self) -> None:
2120        """Shut down the Langfuse client and flush all pending data.
2121
2122        This method cleanly shuts down the Langfuse client, ensuring all pending data
2123        is flushed to the API and all background threads are properly terminated.
2124
2125        It's important to call this method when your application is shutting down to
2126        prevent data loss and resource leaks. For most applications, using the client
2127        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2128
2129        Example:
2130            ```python
2131            # Initialize Langfuse
2132            langfuse = Langfuse(public_key="...", secret_key="...")
2133
2134            # Use Langfuse throughout your application
2135            # ...
2136
2137            # When application is shutting down
2138            langfuse.shutdown()
2139            ```
2140        """
2141        if self._resources is not None:
2142            self._resources.shutdown()

Shut down the Langfuse client and flush all pending data.

This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.

It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.

Example:
# Initialize Langfuse
langfuse = Langfuse(public_key="...", secret_key="...")

# Use Langfuse throughout your application
# ...

# When application is shutting down
langfuse.shutdown()
def get_current_trace_id(self) -> Optional[str]:
2144    def get_current_trace_id(self) -> Optional[str]:
2145        """Get the trace ID of the current active span.
2146
2147        This method retrieves the trace ID from the currently active span in the context.
2148        It can be used to get the trace ID for referencing in logs, external systems,
2149        or for creating related operations.
2150
2151        Returns:
2152            The current trace ID as a 32-character lowercase hexadecimal string,
2153            or None if there is no active span.
2154
2155        Example:
2156            ```python
2157            with langfuse.start_as_current_observation(name="process-request") as span:
2158                # Get the current trace ID for reference
2159                trace_id = langfuse.get_current_trace_id()
2160
2161                # Use it for external correlation
2162                log.info(f"Processing request with trace_id: {trace_id}")
2163
2164                # Or pass to another system
2165                external_system.process(data, trace_id=trace_id)
2166            ```
2167        """
2168        if not self._tracing_enabled:
2169            langfuse_logger.debug(
2170                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2171            )
2172            return None
2173
2174        current_otel_span = self._get_current_otel_span()
2175
2176        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None

Get the trace ID of the current active span.

This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.

Returns:

The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_observation(name="process-request") as span:
    # Get the current trace ID for reference
    trace_id = langfuse.get_current_trace_id()

    # Use it for external correlation
    log.info(f"Processing request with trace_id: {trace_id}")

    # Or pass to another system
    external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
2178    def get_current_observation_id(self) -> Optional[str]:
2179        """Get the observation ID (span ID) of the current active span.
2180
2181        This method retrieves the observation ID from the currently active span in the context.
2182        It can be used to get the observation ID for referencing in logs, external systems,
2183        or for creating scores or other related operations.
2184
2185        Returns:
2186            The current observation ID as a 16-character lowercase hexadecimal string,
2187            or None if there is no active span.
2188
2189        Example:
2190            ```python
2191            with langfuse.start_as_current_observation(name="process-user-query") as span:
2192                # Get the current observation ID
2193                observation_id = langfuse.get_current_observation_id()
2194
2195                # Store it for later reference
2196                cache.set(f"query_{query_id}_observation", observation_id)
2197
2198                # Process the query...
2199            ```
2200        """
2201        if not self._tracing_enabled:
2202            langfuse_logger.debug(
2203                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2204            )
2205            return None
2206
2207        current_otel_span = self._get_current_otel_span()
2208
2209        return self._get_otel_span_id(current_otel_span) if current_otel_span else None

Get the observation ID (span ID) of the current active span.

This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.

Returns:

The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_observation(name="process-user-query") as span:
    # Get the current observation ID
    observation_id = langfuse.get_current_observation_id()

    # Store it for later reference
    cache.set(f"query_{query_id}_observation", observation_id)

    # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2222    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2223        """Get the URL to view a trace in the Langfuse UI.
2224
2225        This method generates a URL that links directly to a trace in the Langfuse UI.
2226        It's useful for providing links in logs, notifications, or debugging tools.
2227
2228        Args:
2229            trace_id: Optional trace ID to generate a URL for. If not provided,
2230                     the trace ID of the current active span will be used.
2231
2232        Returns:
2233            A URL string pointing to the trace in the Langfuse UI,
2234            or None if the project ID couldn't be retrieved or no trace ID is available.
2235
2236        Example:
2237            ```python
2238            # Get URL for the current trace
2239            with langfuse.start_as_current_observation(name="process-request") as span:
2240                trace_url = langfuse.get_trace_url()
2241                log.info(f"Processing trace: {trace_url}")
2242
2243            # Get URL for a specific trace
2244            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2245            send_notification(f"Review needed for trace: {specific_trace_url}")
2246            ```
2247        """
2248        final_trace_id = trace_id or self.get_current_trace_id()
2249        if not final_trace_id:
2250            return None
2251
2252        project_id = self._get_project_id()
2253
2254        return (
2255            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2256            if project_id and final_trace_id
2257            else None
2258        )

Get the URL to view a trace in the Langfuse UI.

This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.

Arguments:
  • trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:

A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.

Example:
# Get URL for the current trace
with langfuse.start_as_current_observation(name="process-request") as span:
    trace_url = langfuse.get_trace_url()
    log.info(f"Processing trace: {trace_url}")

# Get URL for a specific trace
specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
send_notification(f"Review needed for trace: {specific_trace_url}")
def get_dataset( self, name: str, *, fetch_items_page_size: Optional[int] = 50, version: Optional[datetime.datetime] = None) -> langfuse._client.datasets.DatasetClient:
2260    def get_dataset(
2261        self,
2262        name: str,
2263        *,
2264        fetch_items_page_size: Optional[int] = 50,
2265        version: Optional[datetime] = None,
2266    ) -> "DatasetClient":
2267        """Fetch a dataset by its name.
2268
2269        Args:
2270            name (str): The name of the dataset to fetch.
2271            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2272            version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC).
2273                If provided, returns the state of items at the specified UTC timestamp.
2274                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
2275
2276        Returns:
2277            DatasetClient: The dataset with the given name.
2278        """
2279        try:
2280            langfuse_logger.debug(f"Getting datasets {name}")
2281            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2282
2283            dataset_items = []
2284            page = 1
2285
2286            while True:
2287                new_items = self.api.dataset_items.list(
2288                    dataset_name=self._url_encode(name, is_url_param=True),
2289                    page=page,
2290                    limit=fetch_items_page_size,
2291                    version=version,
2292                )
2293                dataset_items.extend(new_items.data)
2294
2295                if new_items.meta.total_pages <= page:
2296                    break
2297
2298                page += 1
2299
2300            return DatasetClient(
2301                dataset=dataset,
2302                items=dataset_items,
2303                version=version,
2304                langfuse_client=self,
2305            )
2306
2307        except Error as e:
2308            handle_fern_exception(e)
2309            raise e

Fetch a dataset by its name.

Arguments:
  • name (str): The name of the dataset to fetch.
  • fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
  • version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC). If provided, returns the state of items at the specified UTC timestamp. If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
Returns:

DatasetClient: The dataset with the given name.

def get_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DatasetRunWithItems:
2311    def get_dataset_run(
2312        self, *, dataset_name: str, run_name: str
2313    ) -> DatasetRunWithItems:
2314        """Fetch a dataset run by dataset name and run name.
2315
2316        Args:
2317            dataset_name (str): The name of the dataset.
2318            run_name (str): The name of the run.
2319
2320        Returns:
2321            DatasetRunWithItems: The dataset run with its items.
2322        """
2323        try:
2324            return cast(
2325                DatasetRunWithItems,
2326                self.api.datasets.get_run(
2327                    dataset_name=self._url_encode(dataset_name),
2328                    run_name=self._url_encode(run_name),
2329                    request_options=None,
2330                ),
2331            )
2332        except Error as e:
2333            handle_fern_exception(e)
2334            raise e

Fetch a dataset run by dataset name and run name.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DatasetRunWithItems: The dataset run with its items.

def get_dataset_runs( self, *, dataset_name: str, page: Optional[int] = None, limit: Optional[int] = None) -> langfuse.api.PaginatedDatasetRuns:
2336    def get_dataset_runs(
2337        self,
2338        *,
2339        dataset_name: str,
2340        page: Optional[int] = None,
2341        limit: Optional[int] = None,
2342    ) -> PaginatedDatasetRuns:
2343        """Fetch all runs for a dataset.
2344
2345        Args:
2346            dataset_name (str): The name of the dataset.
2347            page (Optional[int]): Page number, starts at 1.
2348            limit (Optional[int]): Limit of items per page.
2349
2350        Returns:
2351            PaginatedDatasetRuns: Paginated list of dataset runs.
2352        """
2353        try:
2354            return cast(
2355                PaginatedDatasetRuns,
2356                self.api.datasets.get_runs(
2357                    dataset_name=self._url_encode(dataset_name),
2358                    page=page,
2359                    limit=limit,
2360                    request_options=None,
2361                ),
2362            )
2363        except Error as e:
2364            handle_fern_exception(e)
2365            raise e

Fetch all runs for a dataset.

Arguments:
  • dataset_name (str): The name of the dataset.
  • page (Optional[int]): Page number, starts at 1.
  • limit (Optional[int]): Limit of items per page.
Returns:

PaginatedDatasetRuns: Paginated list of dataset runs.

def delete_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DeleteDatasetRunResponse:
2367    def delete_dataset_run(
2368        self, *, dataset_name: str, run_name: str
2369    ) -> DeleteDatasetRunResponse:
2370        """Delete a dataset run and all its run items. This action is irreversible.
2371
2372        Args:
2373            dataset_name (str): The name of the dataset.
2374            run_name (str): The name of the run.
2375
2376        Returns:
2377            DeleteDatasetRunResponse: Confirmation of deletion.
2378        """
2379        try:
2380            return cast(
2381                DeleteDatasetRunResponse,
2382                self.api.datasets.delete_run(
2383                    dataset_name=self._url_encode(dataset_name),
2384                    run_name=self._url_encode(run_name),
2385                    request_options=None,
2386                ),
2387            )
2388        except Error as e:
2389            handle_fern_exception(e)
2390            raise e

Delete a dataset run and all its run items. This action is irreversible.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DeleteDatasetRunResponse: Confirmation of deletion.

def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem]], task: langfuse.experiment.TaskFunction, evaluators: List[langfuse.experiment.EvaluatorFunction] = [], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, run_evaluators: List[langfuse.experiment.RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None, _dataset_version: Optional[datetime.datetime] = None) -> langfuse.experiment.ExperimentResult:
2392    def run_experiment(
2393        self,
2394        *,
2395        name: str,
2396        run_name: Optional[str] = None,
2397        description: Optional[str] = None,
2398        data: ExperimentData,
2399        task: TaskFunction,
2400        evaluators: List[EvaluatorFunction] = [],
2401        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2402        run_evaluators: List[RunEvaluatorFunction] = [],
2403        max_concurrency: int = 50,
2404        metadata: Optional[Dict[str, str]] = None,
2405        _dataset_version: Optional[datetime] = None,
2406    ) -> ExperimentResult:
2407        """Run an experiment on a dataset with automatic tracing and evaluation.
2408
2409        This method executes a task function on each item in the provided dataset,
2410        automatically traces all executions with Langfuse for observability, runs
2411        item-level and run-level evaluators on the outputs, and returns comprehensive
2412        results with evaluation metrics.
2413
2414        The experiment system provides:
2415        - Automatic tracing of all task executions
2416        - Concurrent processing with configurable limits
2417        - Comprehensive error handling that isolates failures
2418        - Integration with Langfuse datasets for experiment tracking
2419        - Flexible evaluation framework supporting both sync and async evaluators
2420
2421        Args:
2422            name: Human-readable name for the experiment. Used for identification
2423                in the Langfuse UI.
2424            run_name: Optional exact name for the experiment run. If provided, this will be
2425                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2426                If not provided, this will default to the experiment name appended with an ISO timestamp.
2427            description: Optional description explaining the experiment's purpose,
2428                methodology, or expected outcomes.
2429            data: Array of data items to process. Can be either:
2430                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2431                - List of Langfuse DatasetItem objects from dataset.items
2432            task: Function that processes each data item and returns output.
2433                Must accept 'item' as keyword argument and can return sync or async results.
2434                The task function signature should be: task(*, item, **kwargs) -> Any
2435            evaluators: List of functions to evaluate each item's output individually.
2436                Each evaluator receives input, output, expected_output, and metadata.
2437                Can return single Evaluation dict or list of Evaluation dicts.
2438            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2439                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2440                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2441                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2442            run_evaluators: List of functions to evaluate the entire experiment run.
2443                Each run evaluator receives all item_results and can compute aggregate metrics.
2444                Useful for calculating averages, distributions, or cross-item comparisons.
2445            max_concurrency: Maximum number of concurrent task executions (default: 50).
2446                Controls the number of items processed simultaneously. Adjust based on
2447                API rate limits and system resources.
2448            metadata: Optional metadata dictionary to attach to all experiment traces.
2449                This metadata will be included in every trace created during the experiment.
2450                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2451
2452        Returns:
2453            ExperimentResult containing:
2454            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2455            - item_results: List of results for each processed item with outputs and evaluations
2456            - run_evaluations: List of aggregate evaluation results for the entire run
2457            - experiment_id: Stable identifier for the experiment run across all items
2458            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2459            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2460
2461        Raises:
2462            ValueError: If required parameters are missing or invalid
2463            Exception: If experiment setup fails (individual item failures are handled gracefully)
2464
2465        Examples:
2466            Basic experiment with local data:
2467            ```python
2468            def summarize_text(*, item, **kwargs):
2469                return f"Summary: {item['input'][:50]}..."
2470
2471            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2472                return {
2473                    "name": "output_length",
2474                    "value": len(output),
2475                    "comment": f"Output contains {len(output)} characters"
2476                }
2477
2478            result = langfuse.run_experiment(
2479                name="Text Summarization Test",
2480                description="Evaluate summarization quality and length",
2481                data=[
2482                    {"input": "Long article text...", "expected_output": "Expected summary"},
2483                    {"input": "Another article...", "expected_output": "Another summary"}
2484                ],
2485                task=summarize_text,
2486                evaluators=[length_evaluator]
2487            )
2488
2489            print(f"Processed {len(result.item_results)} items")
2490            for item_result in result.item_results:
2491                print(f"Input: {item_result.item['input']}")
2492                print(f"Output: {item_result.output}")
2493                print(f"Evaluations: {item_result.evaluations}")
2494            ```
2495
2496            Advanced experiment with async task and multiple evaluators:
2497            ```python
2498            async def llm_task(*, item, **kwargs):
2499                # Simulate async LLM call
2500                response = await openai_client.chat.completions.create(
2501                    model="gpt-4",
2502                    messages=[{"role": "user", "content": item["input"]}]
2503                )
2504                return response.choices[0].message.content
2505
2506            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2507                if expected_output and expected_output.lower() in output.lower():
2508                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2509                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2510
2511            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2512                # Simulate toxicity check
2513                toxicity_score = check_toxicity(output)  # Your toxicity checker
2514                return {
2515                    "name": "toxicity",
2516                    "value": toxicity_score,
2517                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2518                }
2519
2520            def average_accuracy(*, item_results, **kwargs):
2521                accuracies = [
2522                    eval.value for result in item_results
2523                    for eval in result.evaluations
2524                    if eval.name == "accuracy"
2525                ]
2526                return {
2527                    "name": "average_accuracy",
2528                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2529                    "comment": f"Average accuracy across {len(accuracies)} items"
2530                }
2531
2532            result = langfuse.run_experiment(
2533                name="LLM Safety and Accuracy Test",
2534                description="Evaluate model accuracy and safety across diverse prompts",
2535                data=test_dataset,  # Your dataset items
2536                task=llm_task,
2537                evaluators=[accuracy_evaluator, toxicity_evaluator],
2538                run_evaluators=[average_accuracy],
2539                max_concurrency=5,  # Limit concurrent API calls
2540                metadata={"model": "gpt-4", "temperature": 0.7}
2541            )
2542            ```
2543
2544            Using with Langfuse datasets:
2545            ```python
2546            # Get dataset from Langfuse
2547            dataset = langfuse.get_dataset("my-eval-dataset")
2548
2549            result = dataset.run_experiment(
2550                name="Production Model Evaluation",
2551                description="Monthly evaluation of production model performance",
2552                task=my_production_task,
2553                evaluators=[accuracy_evaluator, latency_evaluator]
2554            )
2555
2556            # Results automatically linked to dataset in Langfuse UI
2557            print(f"View results: {result['dataset_run_url']}")
2558            ```
2559
2560        Note:
2561            - Task and evaluator functions can be either synchronous or asynchronous
2562            - Individual item failures are logged but don't stop the experiment
2563            - All executions are automatically traced and visible in Langfuse UI
2564            - When using Langfuse datasets, results are automatically linked for easy comparison
2565            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2566            - Async execution is handled automatically with smart event loop detection
2567        """
2568        return cast(
2569            ExperimentResult,
2570            run_async_safely(
2571                self._run_experiment_async(
2572                    name=name,
2573                    run_name=self._create_experiment_run_name(
2574                        name=name, run_name=run_name
2575                    ),
2576                    description=description,
2577                    data=data,
2578                    task=task,
2579                    evaluators=evaluators or [],
2580                    composite_evaluator=composite_evaluator,
2581                    run_evaluators=run_evaluators or [],
2582                    max_concurrency=max_concurrency,
2583                    metadata=metadata,
2584                    dataset_version=_dataset_version,
2585                ),
2586            ),
2587        )

Run an experiment on a dataset with automatic tracing and evaluation.

This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.

The experiment system provides:

  • Automatic tracing of all task executions
  • Concurrent processing with configurable limits
  • Comprehensive error handling that isolates failures
  • Integration with Langfuse datasets for experiment tracking
  • Flexible evaluation framework supporting both sync and async evaluators
Arguments:
  • name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
  • run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the data contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
  • description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
  • data: Array of data items to process. Can be either:
    • List of dict-like items with 'input', 'expected_output', 'metadata' keys
    • List of Langfuse DatasetItem objects from dataset.items
  • task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
  • evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
  • composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
  • run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
  • max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
  • metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If data are Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:

ExperimentResult containing:

  • run_name: The experiment run name. This is equal to the dataset run name if the experiment was run on a Langfuse dataset.
  • item_results: List of results for each processed item with outputs and evaluations
  • run_evaluations: List of aggregate evaluation results for the entire run
  • experiment_id: Stable identifier for the experiment run across all items
  • dataset_run_id: ID of the dataset run (if using Langfuse datasets)
  • dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
  • ValueError: If required parameters are missing or invalid
  • Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:

Basic experiment with local data:

def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")

Advanced experiment with async task and multiple evaluators:

async def llm_task(*, item, **kwargs):
    # Simulate async LLM call
    response = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": item["input"]}]
    )
    return response.choices[0].message.content

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output and expected_output.lower() in output.lower():
        return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
    return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
    # Simulate toxicity check
    toxicity_score = check_toxicity(output)  # Your toxicity checker
    return {
        "name": "toxicity",
        "value": toxicity_score,
        "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
    }

def average_accuracy(*, item_results, **kwargs):
    accuracies = [
        eval.value for result in item_results
        for eval in result.evaluations
        if eval.name == "accuracy"
    ]
    return {
        "name": "average_accuracy",
        "value": sum(accuracies) / len(accuracies) if accuracies else 0,
        "comment": f"Average accuracy across {len(accuracies)} items"
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    description="Evaluate model accuracy and safety across diverse prompts",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    run_evaluators=[average_accuracy],
    max_concurrency=5,  # Limit concurrent API calls
    metadata={"model": "gpt-4", "temperature": 0.7}
)

Using with Langfuse datasets:

# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result.dataset_run_url}")
Note:
  • Task and evaluator functions can be either synchronous or asynchronous
  • Individual item failures are logged but don't stop the experiment
  • All executions are automatically traced and visible in Langfuse UI
  • When using Langfuse datasets, results are automatically linked for easy comparison
  • This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
  • Async execution is handled automatically with smart event loop detection
def run_batched_evaluation( self, *, scope: Literal['traces', 'observations'], mapper: MapperFunction, filter: Optional[str] = None, fetch_batch_size: int = 50, fetch_trace_fields: Optional[str] = None, max_items: Optional[int] = None, max_retries: int = 3, evaluators: List[langfuse.experiment.EvaluatorFunction], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, max_concurrency: int = 5, metadata: Optional[Dict[str, Any]] = None, _add_observation_scores_to_trace: bool = False, _additional_trace_tags: Optional[List[str]] = None, resume_from: Optional[BatchEvaluationResumeToken] = None, verbose: bool = False) -> BatchEvaluationResult:
2949    def run_batched_evaluation(
2950        self,
2951        *,
2952        scope: Literal["traces", "observations"],
2953        mapper: MapperFunction,
2954        filter: Optional[str] = None,
2955        fetch_batch_size: int = 50,
2956        fetch_trace_fields: Optional[str] = None,
2957        max_items: Optional[int] = None,
2958        max_retries: int = 3,
2959        evaluators: List[EvaluatorFunction],
2960        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2961        max_concurrency: int = 5,
2962        metadata: Optional[Dict[str, Any]] = None,
2963        _add_observation_scores_to_trace: bool = False,
2964        _additional_trace_tags: Optional[List[str]] = None,
2965        resume_from: Optional[BatchEvaluationResumeToken] = None,
2966        verbose: bool = False,
2967    ) -> BatchEvaluationResult:
2968        """Fetch traces or observations and run evaluations on each item.
2969
2970        This method provides a powerful way to evaluate existing data in Langfuse at scale.
2971        It fetches items based on filters, transforms them using a mapper function, runs
2972        evaluators on each item, and creates scores that are linked back to the original
2973        entities. This is ideal for:
2974
2975        - Running evaluations on production traces after deployment
2976        - Backtesting new evaluation metrics on historical data
2977        - Batch scoring of observations for quality monitoring
2978        - Periodic evaluation runs on recent data
2979
2980        The method uses a streaming/pipeline approach to process items in batches, making
2981        it memory-efficient for large datasets. It includes comprehensive error handling,
2982        retry logic, and resume capability for long-running evaluations.
2983
2984        Args:
2985            scope: The type of items to evaluate. Must be one of:
2986                - "traces": Evaluate complete traces with all their observations
2987                - "observations": Evaluate individual observations (spans, generations, events)
2988            mapper: Function that transforms API response objects into evaluator inputs.
2989                Receives a trace/observation object and returns an EvaluatorInputs
2990                instance with input, output, expected_output, and metadata fields.
2991                Can be sync or async.
2992            evaluators: List of evaluation functions to run on each item. Each evaluator
2993                receives the mapped inputs and returns Evaluation object(s). Evaluator
2994                failures are logged but don't stop the batch evaluation.
2995            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
2996                - '{"tags": ["production"]}'
2997                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
2998                Default: None (fetches all items).
2999            fetch_batch_size: Number of items to fetch per API call and hold in memory.
3000                Larger values may be faster but use more memory. Default: 50.
3001            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
3002            max_items: Maximum total number of items to process. If None, processes all
3003                items matching the filter. Useful for testing or limiting evaluation runs.
3004                Default: None (process all).
3005            max_concurrency: Maximum number of items to evaluate concurrently. Controls
3006                parallelism and resource usage. Default: 5.
3007            composite_evaluator: Optional function that creates a composite score from
3008                item-level evaluations. Receives the original item and its evaluations,
3009                returns a single Evaluation. Useful for weighted averages or combined metrics.
3010                Default: None.
3011            metadata: Optional metadata dict to add to all created scores. Useful for
3012                tracking evaluation runs, versions, or other context. Default: None.
3013            max_retries: Maximum number of retry attempts for failed batch fetches.
3014                Uses exponential backoff (1s, 2s, 4s). Default: 3.
3015            verbose: If True, logs progress information to console. Useful for monitoring
3016                long-running evaluations. Default: False.
3017            resume_from: Optional resume token from a previous incomplete run. Allows
3018                continuing evaluation after interruption or failure. Default: None.
3019
3020
3021        Returns:
3022            BatchEvaluationResult containing:
3023                - total_items_fetched: Number of items fetched from API
3024                - total_items_processed: Number of items successfully evaluated
3025                - total_items_failed: Number of items that failed evaluation
3026                - total_scores_created: Scores created by item-level evaluators
3027                - total_composite_scores_created: Scores created by composite evaluator
3028                - total_evaluations_failed: Individual evaluator failures
3029                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3030                - resume_token: Token for resuming if incomplete (None if completed)
3031                - completed: True if all items processed
3032                - duration_seconds: Total execution time
3033                - failed_item_ids: IDs of items that failed
3034                - error_summary: Error types and counts
3035                - has_more_items: True if max_items reached but more exist
3036
3037        Raises:
3038            ValueError: If invalid scope is provided.
3039
3040        Examples:
3041            Basic trace evaluation:
3042            ```python
3043            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3044
3045            client = Langfuse()
3046
3047            # Define mapper to extract fields from traces
3048            def trace_mapper(trace):
3049                return EvaluatorInputs(
3050                    input=trace.input,
3051                    output=trace.output,
3052                    expected_output=None,
3053                    metadata={"trace_id": trace.id}
3054                )
3055
3056            # Define evaluator
3057            def length_evaluator(*, input, output, expected_output, metadata):
3058                return Evaluation(
3059                    name="output_length",
3060                    value=len(output) if output else 0
3061                )
3062
3063            # Run batch evaluation
3064            result = client.run_batched_evaluation(
3065                scope="traces",
3066                mapper=trace_mapper,
3067                evaluators=[length_evaluator],
3068                filter='{"tags": ["production"]}',
3069                max_items=1000,
3070                verbose=True
3071            )
3072
3073            print(f"Processed {result.total_items_processed} traces")
3074            print(f"Created {result.total_scores_created} scores")
3075            ```
3076
3077            Evaluation with composite scorer:
3078            ```python
3079            def accuracy_evaluator(*, input, output, expected_output, metadata):
3080                # ... evaluation logic
3081                return Evaluation(name="accuracy", value=0.85)
3082
3083            def relevance_evaluator(*, input, output, expected_output, metadata):
3084                # ... evaluation logic
3085                return Evaluation(name="relevance", value=0.92)
3086
3087            def composite_evaluator(*, item, evaluations):
3088                # Weighted average of evaluations
3089                weights = {"accuracy": 0.6, "relevance": 0.4}
3090                total = sum(
3091                    e.value * weights.get(e.name, 0)
3092                    for e in evaluations
3093                    if isinstance(e.value, (int, float))
3094                )
3095                return Evaluation(
3096                    name="composite_score",
3097                    value=total,
3098                    comment=f"Weighted average of {len(evaluations)} metrics"
3099                )
3100
3101            result = client.run_batched_evaluation(
3102                scope="traces",
3103                mapper=trace_mapper,
3104                evaluators=[accuracy_evaluator, relevance_evaluator],
3105                composite_evaluator=composite_evaluator,
3106                filter='{"user_id": "important_user"}',
3107                verbose=True
3108            )
3109            ```
3110
3111            Handling incomplete runs with resume:
3112            ```python
3113            # Initial run that may fail or timeout
3114            result = client.run_batched_evaluation(
3115                scope="observations",
3116                mapper=obs_mapper,
3117                evaluators=[my_evaluator],
3118                max_items=10000,
3119                verbose=True
3120            )
3121
3122            # Check if incomplete
3123            if not result.completed and result.resume_token:
3124                print(f"Processed {result.resume_token.items_processed} items before interruption")
3125
3126                # Resume from where it left off
3127                result = client.run_batched_evaluation(
3128                    scope="observations",
3129                    mapper=obs_mapper,
3130                    evaluators=[my_evaluator],
3131                    resume_from=result.resume_token,
3132                    verbose=True
3133                )
3134
3135            print(f"Total items processed: {result.total_items_processed}")
3136            ```
3137
3138            Monitoring evaluator performance:
3139            ```python
3140            result = client.run_batched_evaluation(...)
3141
3142            for stats in result.evaluator_stats:
3143                success_rate = stats.successful_runs / stats.total_runs if stats.total_runs else 0.0
3144                print(f"{stats.name}:")
3145                print(f"  Success rate: {success_rate:.1%}")
3146                print(f"  Scores created: {stats.total_scores_created}")
3147
3148                if stats.failed_runs > 0:
3149                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3150            ```
3151
3152        Note:
3153            - Evaluator failures are logged but don't stop the batch evaluation
3154            - Individual item failures are tracked but don't stop processing
3155            - Fetch failures are retried with exponential backoff
3156            - All scores are automatically flushed to Langfuse at the end
3157            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3158        """
3159        runner = BatchEvaluationRunner(self)  # the runner is constructed around this client instance
3160
3161        return cast(  # cast() only informs the type checker; the value is returned unchanged
3162            BatchEvaluationResult,
3163            run_async_safely(  # NOTE(review): presumably runs the coroutine to completion from this sync method — confirm
3164                runner.run_async(
3165                    scope=scope,
3166                    mapper=mapper,
3167                    evaluators=evaluators,
3168                    filter=filter,
3169                    fetch_batch_size=fetch_batch_size,
3170                    fetch_trace_fields=fetch_trace_fields,
3171                    max_items=max_items,
3172                    max_concurrency=max_concurrency,
3173                    composite_evaluator=composite_evaluator,
3174                    metadata=metadata,
3175                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
3176                    _additional_trace_tags=_additional_trace_tags,
3177                    max_retries=max_retries,
3178                    verbose=verbose,
3179                    resume_from=resume_from,
3180                )
3181            ),
3182        )

Fetch traces or observations and run evaluations on each item.

This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:

  • Running evaluations on production traces after deployment
  • Backtesting new evaluation metrics on historical data
  • Batch scoring of observations for quality monitoring
  • Periodic evaluation runs on recent data

The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.

Arguments:
  • scope: The type of items to evaluate. Must be one of:
    • "traces": Evaluate complete traces with all their observations
    • "observations": Evaluate individual observations (spans, generations, events)
  • mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
  • evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
  • filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
    • '{"tags": ["production"]}'
    • '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
    Default: None (fetches all items).
  • fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
  • fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
  • max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
  • max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 5.
  • composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
  • metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
  • max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
  • verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
  • resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:

BatchEvaluationResult containing:
  • total_items_fetched: Number of items fetched from API
  • total_items_processed: Number of items successfully evaluated
  • total_items_failed: Number of items that failed evaluation
  • total_scores_created: Scores created by item-level evaluators
  • total_composite_scores_created: Scores created by composite evaluator
  • total_evaluations_failed: Individual evaluator failures
  • evaluator_stats: Per-evaluator statistics (success rate, scores created)
  • resume_token: Token for resuming if incomplete (None if completed)
  • completed: True if all items processed
  • duration_seconds: Total execution time
  • failed_item_ids: IDs of items that failed
  • error_summary: Error types and counts
  • has_more_items: True if max_items reached but more exist

Raises:
  • ValueError: If invalid scope is provided.
Examples:

Basic trace evaluation:

from langfuse import Langfuse, EvaluatorInputs, Evaluation

client = Langfuse()

# Define mapper to extract fields from traces
def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,
        metadata={"trace_id": trace.id}
    )

# Define evaluator
def length_evaluator(*, input, output, expected_output, metadata):
    return Evaluation(
        name="output_length",
        value=len(output) if output else 0
    )

# Run batch evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    filter='{"tags": ["production"]}',
    max_items=1000,
    verbose=True
)

print(f"Processed {result.total_items_processed} traces")
print(f"Created {result.total_scores_created} scores")

Evaluation with composite scorer:

def accuracy_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="accuracy", value=0.85)

def relevance_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="relevance", value=0.92)

def composite_evaluator(*, item, evaluations):
    # Weighted average of evaluations
    weights = {"accuracy": 0.6, "relevance": 0.4}
    total = sum(
        e.value * weights.get(e.name, 0)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(
        name="composite_score",
        value=total,
        comment=f"Weighted average of {len(evaluations)} metrics"
    )

result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[accuracy_evaluator, relevance_evaluator],
    composite_evaluator=composite_evaluator,
    filter='{"user_id": "important_user"}',
    verbose=True
)

Handling incomplete runs with resume:

# Initial run that may fail or timeout
result = client.run_batched_evaluation(
    scope="observations",
    mapper=obs_mapper,
    evaluators=[my_evaluator],
    max_items=10000,
    verbose=True
)

# Check if incomplete
if not result.completed and result.resume_token:
    print(f"Processed {result.resume_token.items_processed} items before interruption")

    # Resume from where it left off
    result = client.run_batched_evaluation(
        scope="observations",
        mapper=obs_mapper,
        evaluators=[my_evaluator],
        resume_from=result.resume_token,
        verbose=True
    )

print(f"Total items processed: {result.total_items_processed}")

Monitoring evaluator performance:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs
    print(f"{stats.name}:")
    print(f"  Success rate: {success_rate:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️  Failed {stats.failed_runs} times")
Note:
  • Evaluator failures are logged but don't stop the batch evaluation
  • Individual item failures are tracked but don't stop processing
  • Fetch failures are retried with exponential backoff
  • All scores are automatically flushed to Langfuse at the end
  • The resume mechanism uses timestamp-based filtering to avoid duplicates
def auth_check(self) -> bool:
3184    def auth_check(self) -> bool:
3185        """Check if the provided credentials (public and secret key) are valid.
3186
3187        Raises:
3188            Exception: If no projects were found for the provided credentials.
3189
3190        Note:
3191            This method is blocking. It is discouraged to use it in production code. It returns True on success and False (with a logged warning) when an AttributeError indicates the client is not properly initialized.
3192        """
3193        try:
3194            projects = self.api.projects.get()
3195            langfuse_logger.debug(
3196                f"Auth check successful, found {len(projects.data)} projects"
3197            )
3198            if len(projects.data) == 0:  # the request succeeded but returned no project: treated as a failed check
3199                raise Exception(
3200                    "Auth check failed, no project found for the keys provided."
3201                )
3202            return True
3203
3204        except AttributeError as e:  # reported as False instead of being raised to the caller
3205            langfuse_logger.warning(
3206                f"Auth check failed: Client not properly initialized. Error: {e}"
3207            )
3208            return False
3209
3210        except Error as e:  # passed to handle_fern_exception first, then re-raised unchanged
3211            handle_fern_exception(e)
3212            raise e

Check if the provided credentials (public and secret key) are valid.

Raises:
  • Exception: If no projects were found for the provided credentials.
Note:

This method is blocking. It is discouraged to use it in production code.

def create_dataset( self, *, name: str, description: Optional[str] = None, metadata: Optional[Any] = None, input_schema: Optional[Any] = None, expected_output_schema: Optional[Any] = None) -> langfuse.api.Dataset:
3214    def create_dataset(
3215        self,
3216        *,
3217        name: str,
3218        description: Optional[str] = None,
3219        metadata: Optional[Any] = None,
3220        input_schema: Optional[Any] = None,
3221        expected_output_schema: Optional[Any] = None,
3222    ) -> Dataset:
3223        """Create a dataset with the given name on Langfuse.
3224
3225        Args:
3226            name: Name of the dataset to create.
3227            description: Description of the dataset. Defaults to None.
3228            metadata: Additional metadata. Defaults to None.
3229            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3230            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3231
3232        Returns:
3233            Dataset: The created dataset as returned by the Langfuse API. Errors of type Error are passed to handle_fern_exception and then re-raised.
3234        """
3235        try:
3236            langfuse_logger.debug(f"Creating datasets {name}")
3237
3238            result = self.api.datasets.create(  # all arguments are forwarded unchanged to the API client
3239                name=name,
3240                description=description,
3241                metadata=metadata,
3242                input_schema=input_schema,
3243                expected_output_schema=expected_output_schema,
3244            )
3245
3246            return cast(Dataset, result)  # cast() only informs the type checker; result is returned as-is
3247
3248        except Error as e:  # passed to handle_fern_exception first, then re-raised unchanged
3249            handle_fern_exception(e)
3250            raise e

Create a dataset with the given name on Langfuse.

Arguments:
  • name: Name of the dataset to create.
  • description: Description of the dataset. Defaults to None.
  • metadata: Additional metadata. Defaults to None.
  • input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
  • expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:

Dataset: The created dataset as returned by the Langfuse API.

def create_dataset_item( self, *, dataset_name: str, input: Optional[Any] = None, expected_output: Optional[Any] = None, metadata: Optional[Any] = None, source_trace_id: Optional[str] = None, source_observation_id: Optional[str] = None, status: Optional[langfuse.api.DatasetStatus] = None, id: Optional[str] = None) -> langfuse.api.DatasetItem:
3252    def create_dataset_item(
3253        self,
3254        *,
3255        dataset_name: str,
3256        input: Optional[Any] = None,
3257        expected_output: Optional[Any] = None,
3258        metadata: Optional[Any] = None,
3259        source_trace_id: Optional[str] = None,
3260        source_observation_id: Optional[str] = None,
3261        status: Optional[DatasetStatus] = None,
3262        id: Optional[str] = None,
3263    ) -> DatasetItem:
3264        """Create a dataset item.
3265
3266        Upserts if an item with id already exists.
3267
3268        Args:
3269            dataset_name: Name of the dataset in which the dataset item should be created.
3270            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3271            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3272            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3273            source_trace_id: Id of the source trace. Defaults to None.
3274            source_observation_id: Id of the source observation. Defaults to None.
3275            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3276            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3277
3278        Returns:
3279            DatasetItem: The created dataset item as returned by the Langfuse API. Errors of type Error are passed to handle_fern_exception and then re-raised.
3280
3281        Example:
3282            ```python
3283            from langfuse import Langfuse
3284
3285            langfuse = Langfuse()
3286
3287            # Uploading items to the Langfuse dataset named "capital_cities"
3288            langfuse.create_dataset_item(
3289                dataset_name="capital_cities",
3290                input={"input": {"country": "Italy"}},
3291                expected_output={"expected_output": "Rome"},
3292                metadata={"foo": "bar"}
3293            )
3294            ```
3295        """
3296        try:
3297            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
3298
3299            result = self.api.dataset_items.create(  # all arguments are forwarded unchanged to the API client
3300                dataset_name=dataset_name,
3301                input=input,
3302                expected_output=expected_output,
3303                metadata=metadata,
3304                source_trace_id=source_trace_id,
3305                source_observation_id=source_observation_id,
3306                status=status,
3307                id=id,
3308            )
3309
3310            return cast(DatasetItem, result)  # cast() only informs the type checker; result is returned as-is
3311        except Error as e:  # passed to handle_fern_exception first, then re-raised unchanged
3312            handle_fern_exception(e)
3313            raise e

Create a dataset item.

Upserts if an item with id already exists.

Arguments:
  • dataset_name: Name of the dataset in which the dataset item should be created.
  • input: Input data. Defaults to None. Can contain any dict, list or scalar.
  • expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
  • metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
  • source_trace_id: Id of the source trace. Defaults to None.
  • source_observation_id: Id of the source observation. Defaults to None.
  • status: Status of the dataset item. Defaults to ACTIVE for newly created items.
  • id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:

DatasetItem: The created dataset item as returned by the Langfuse API.

Example:
from langfuse import Langfuse

langfuse = Langfuse()

# Uploading items to the Langfuse dataset named "capital_cities"
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"input": {"country": "Italy"}},
    expected_output={"expected_output": "Rome"},
    metadata={"foo": "bar"}
)
def resolve_media_references( self, *, obj: Any, resolve_with: Literal['base64_data_uri'], max_depth: int = 10, content_fetch_timeout_seconds: int = 5) -> Any:
3315    def resolve_media_references(
3316        self,
3317        *,
3318        obj: Any,
3319        resolve_with: Literal["base64_data_uri"],
3320        max_depth: int = 10,
3321        content_fetch_timeout_seconds: int = 5,
3322    ) -> Any:
3323        """Replace media reference strings in an object with base64 data URIs.
3324
3325        This method recursively traverses an object (up to max_depth) looking for media reference strings
3326        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3327        the provided Langfuse client and replaces the reference string with a base64 data URI.
3328
3329        If fetching media content fails for a reference string, a warning is logged and the reference
3330        string is left unchanged.
3331
3332        Args:
3333            obj: The object to process. Can be a primitive value, array, or nested object.
3334                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3335            resolve_with: The representation of the media content to replace the media reference string with.
3336                Currently only "base64_data_uri" is supported.
3337            max_depth: The maximum depth to traverse the object. Default is 10.
3338            content_fetch_timeout_seconds: The timeout in seconds for fetching media content. Default is 5.
3339
3340        Returns:
3341            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3342            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3343
3344        Example:
3345            ```python
3346            obj = {
3347                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3348                "nested": {"pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"},
3349            }
3350
3351            result = langfuse_client.resolve_media_references(obj=obj, resolve_with="base64_data_uri")
3352
3353            # Result:
3354            # {
3355            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3356            #     "nested": {
3357            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3358            #     }
3359            # }
3360            ```
3361        """
3362        return LangfuseMedia.resolve_media_references(  # pure delegation; this client is passed as langfuse_client
3363            langfuse_client=self,
3364            obj=obj,
3365            resolve_with=resolve_with,
3366            max_depth=max_depth,
3367            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3368        )

Replace media reference strings in an object with base64 data URIs.

This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.

If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.

Arguments:
  • obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
  • resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
  • max_depth: The maximum depth to traverse the object. Default is 10.
  • content_fetch_timeout_seconds: The timeout in seconds for fetching media content. Default is 5.
Returns:

A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

Example:

obj = {
    "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
    "nested": {
        "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
    }
}

result = langfuse_client.resolve_media_references(obj=obj, resolve_with="base64_data_uri")

# Result:
# {
#     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
#     "nested": {
#         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
#     }
# }

def get_prompt( self, name: str, *, version: Optional[int] = None, label: Optional[str] = None, type: Literal['chat', 'text'] = 'text', cache_ttl_seconds: Optional[int] = None, fallback: Union[List[langfuse.model.ChatMessageDict], NoneType, str] = None, max_retries: Optional[int] = None, fetch_timeout_seconds: Optional[int] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
    def get_prompt(
        self,
        name: str,
        *,
        version: Optional[int] = None,
        label: Optional[str] = None,
        type: Literal["chat", "text"] = "text",
        cache_ttl_seconds: Optional[int] = None,
        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
        max_retries: Optional[int] = None,
        fetch_timeout_seconds: Optional[int] = None,
    ) -> PromptClient:
        """Get a prompt, preferring the local prompt cache over a server round trip.

        Lookup order:

        1. No cache entry, or ``cache_ttl_seconds == 0``: the prompt is fetched from the
           server and the cache is updated. If that fetch fails and a non-empty
           ``fallback`` was supplied, a fallback prompt client is returned instead;
           otherwise the fetch error is re-raised.
        2. Cache entry present but expired: a background refresh task is scheduled and
           the stale prompt is returned immediately. If scheduling the refresh fails, a
           warning is logged and the stale prompt is still returned.
        3. Cache entry present and fresh: the cached prompt is returned.

        Args:
            name (str): The name of the prompt to retrieve. Must not be empty.

        Keyword Args:
            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
            label (Optional[str]): The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
            type (Literal["chat", "text"]): The type of the prompt to retrieve. Defaults to "text". Only used here to pick the client class for a fallback prompt.
            cache_ttl_seconds (Optional[int]): Time-to-live in seconds for caching the prompt. Must be specified as a
                keyword argument. If not set, defaults to 60 seconds. Setting it to 0 bypasses the cached value and
                always fetches from the server.
            fallback (Union[Optional[List[ChatMessageDict]], Optional[str]]): The prompt content to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. An empty string or empty list is treated like no fallback. Defaults to None.
            max_retries (Optional[int]): The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
            fetch_timeout_seconds (Optional[int]): The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.

        Returns:
            The prompt object retrieved from the cache or directly fetched if not cached, of type
            - TextPromptClient, if type argument is 'text'.
            - ChatPromptClient, if type argument is 'chat'.
            A prompt built from ``fallback`` has ``is_fallback=True``, version ``version or 0``,
            an empty config and no tags.

        Raises:
            Error: If the SDK resources are not initialized.
            ValueError: If both ``version`` and ``label`` are given, or if ``name`` is empty.
            Exception: Any exception raised while fetching a prompt that is not served from the
                cache, when no usable fallback was supplied.
        """
        if self._resources is None:
            raise Error(
                "SDK is not correctly initialized. Check the init logs for more details."
            )
        if version is not None and label is not None:
            raise ValueError("Cannot specify both version and label at the same time.")

        if not name:
            raise ValueError("Prompt name cannot be empty.")

        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
        # Clamp the caller-supplied retry count (default 2, upper bound 4).
        bounded_max_retries = self._get_bounded_max_retries(
            max_retries, default_max_retries=2, max_retries_upper_bound=4
        )

        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
        cached_prompt = self._resources.prompt_cache.get(cache_key)

        # Cache miss, or caching explicitly disabled: fetch synchronously.
        if cached_prompt is None or cache_ttl_seconds == 0:
            langfuse_logger.debug(
                f"Prompt '{cache_key}' not found in cache or caching disabled."
            )
            try:
                return self._fetch_prompt_and_update_cache(
                    name,
                    version=version,
                    label=label,
                    ttl_seconds=cache_ttl_seconds,
                    max_retries=bounded_max_retries,
                    fetch_timeout_seconds=fetch_timeout_seconds,
                )
            except Exception as e:
                # Truthiness check: an empty string / empty list fallback is ignored.
                if fallback:
                    langfuse_logger.warning(
                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
                    )

                    fallback_client_args: Dict[str, Any] = {
                        "name": name,
                        "prompt": fallback,
                        "type": type,
                        "version": version or 0,
                        "config": {},
                        "labels": [label] if label else [],
                        "tags": [],
                    }

                    if type == "text":
                        return TextPromptClient(
                            prompt=Prompt_Text(**fallback_client_args),
                            is_fallback=True,
                        )

                    if type == "chat":
                        return ChatPromptClient(
                            prompt=Prompt_Chat(**fallback_client_args),
                            is_fallback=True,
                        )

                # No usable fallback (or an unrecognized type): surface the fetch error.
                raise e

        if cached_prompt.is_expired():
            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
            try:
                # refresh prompt in background thread, refresh_prompt deduplicates tasks
                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")

                def refresh_task() -> None:
                    self._fetch_prompt_and_update_cache(
                        name,
                        version=version,
                        label=label,
                        ttl_seconds=cache_ttl_seconds,
                        max_retries=bounded_max_retries,
                        fetch_timeout_seconds=fetch_timeout_seconds,
                    )

                self._resources.prompt_cache.add_refresh_prompt_task_if_current(
                    cache_key,
                    cached_prompt,
                    refresh_task,
                )
                langfuse_logger.debug(
                    f"Returning stale prompt '{cache_key}' from cache."
                )
                # return stale prompt
                return cached_prompt.value

            except Exception as e:
                langfuse_logger.warning(
                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
                )
                # creation of refresh prompt task failed, return stale prompt
                return cached_prompt.value

        # Fresh cache hit.
        return cached_prompt.value

Get a prompt.

This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.

Arguments:
  • name (str): The name of the prompt to retrieve.
Keyword Args:
  • version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
  • type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
  • fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
  • max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
  • fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
Returns:

The prompt object retrieved from the cache or directly fetched if not cached or expired of type

  • TextPromptClient, if type argument is 'text'.
  • ChatPromptClient, if type argument is 'chat'.
Raises:
  • Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
def create_prompt( self, *, name: str, prompt: Union[str, List[Union[langfuse.model.ChatMessageDict, langfuse.model.ChatMessageWithPlaceholdersDict_Message, langfuse.model.ChatMessageWithPlaceholdersDict_Placeholder]]], labels: List[str] = [], tags: Optional[List[str]] = None, type: Optional[Literal['chat', 'text']] = 'text', config: Optional[Any] = None, commit_message: Optional[str] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3636    def create_prompt(
3637        self,
3638        *,
3639        name: str,
3640        prompt: Union[
3641            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3642        ],
3643        labels: List[str] = [],
3644        tags: Optional[List[str]] = None,
3645        type: Optional[Literal["chat", "text"]] = "text",
3646        config: Optional[Any] = None,
3647        commit_message: Optional[str] = None,
3648    ) -> PromptClient:
3649        """Create a new prompt in Langfuse.
3650
3651        Keyword Args:
3652            name : The name of the prompt to be created.
3653            prompt : The content of the prompt to be created.
3654            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3655            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3656            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3657            config: Additional structured data to be saved with the prompt. Defaults to None.
3658            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3659            commit_message: Optional string describing the change.
3660
3661        Returns:
3662            TextPromptClient: The prompt if type argument is 'text'.
3663            ChatPromptClient: The prompt if type argument is 'chat'.
3664        """
3665        try:
3666            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3667
3668            if type == "chat":
3669                if not isinstance(prompt, list):
3670                    raise ValueError(
3671                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3672                    )
3673                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
3674                    CreateChatPromptRequest(
3675                        name=name,
3676                        prompt=cast(Any, prompt),
3677                        labels=labels,
3678                        tags=tags,
3679                        config=config or {},
3680                        commit_message=commit_message,
3681                        type=CreateChatPromptType.CHAT,
3682                    )
3683                )
3684                server_prompt = self.api.prompts.create(request=request)
3685
3686                if self._resources is not None:
3687                    self._resources.prompt_cache.invalidate(name)
3688
3689                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3690
3691            if not isinstance(prompt, str):
3692                raise ValueError("For 'text' type, 'prompt' must be a string.")
3693
3694            request = CreateTextPromptRequest(
3695                name=name,
3696                prompt=prompt,
3697                labels=labels,
3698                tags=tags,
3699                config=config or {},
3700                commit_message=commit_message,
3701            )
3702
3703            server_prompt = self.api.prompts.create(request=request)
3704
3705            if self._resources is not None:
3706                self._resources.prompt_cache.invalidate(name)
3707
3708            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3709
3710        except Error as e:
3711            handle_fern_exception(e)
3712            raise e

Create a new prompt in Langfuse.

Keyword Args:
  • name : The name of the prompt to be created.
  • prompt : The content of the prompt to be created.
  • is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
  • labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
  • tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
  • config: Additional structured data to be saved with the prompt. Defaults to None.
  • type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
  • commit_message: Optional string describing the change.
Returns:

TextPromptClient: The prompt if type argument is 'text'. ChatPromptClient: The prompt if type argument is 'chat'.

def update_prompt(self, *, name: str, version: int, new_labels: List[str] = []) -> Any:
3714    def update_prompt(
3715        self,
3716        *,
3717        name: str,
3718        version: int,
3719        new_labels: List[str] = [],
3720    ) -> Any:
3721        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name.
3722
3723        Args:
3724            name (str): The name of the prompt to update.
3725            version (int): The version number of the prompt to update.
3726            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3727
3728        Returns:
3729            Prompt: The updated prompt from the Langfuse API.
3730
3731        """
3732        updated_prompt = self.api.prompt_version.update(
3733            name=self._url_encode(name),
3734            version=version,
3735            new_labels=new_labels,
3736        )
3737
3738        if self._resources is not None:
3739            self._resources.prompt_cache.invalidate(name)
3740
3741        return updated_prompt

Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.

Arguments:
  • name (str): The name of the prompt to update.
  • version (int): The version number of the prompt to update.
  • new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:

Prompt: The updated prompt from the Langfuse API.

def clear_prompt_cache(self) -> None:
3756    def clear_prompt_cache(self) -> None:
3757        """Clear the entire prompt cache, removing all cached prompts.
3758
3759        This method is useful when you want to force a complete refresh of all
3760        cached prompts, for example after major updates or when you need to
3761        ensure the latest versions are fetched from the server.
3762        """
3763        if self._resources is not None:
3764            self._resources.prompt_cache.clear()

Clear the entire prompt cache, removing all cached prompts.

This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.

def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 63def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 64    """Get or create a Langfuse client instance.
 65
 66    Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups,
 67    providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
 68
 69    Behavior:
 70    - Single project: Returns existing client or creates new one
 71    - Multi-project: Requires public_key to return specific client
 72    - No public_key in multi-project: Returns disabled client to prevent data leakage
 73
 74    The function uses a singleton pattern per public_key to conserve resources and maintain state.
 75
 76    Args:
 77        public_key (Optional[str]): Project identifier
 78            - With key: Returns client for that project
 79            - Without key: Returns single client or disabled client if multiple exist
 80
 81    Returns:
 82        Langfuse: Client instance in one of three states:
 83            1. Client for specified public_key
 84            2. Default client for single-project setup
 85            3. Disabled client when multiple projects exist without key
 86
 87    Security:
 88        Disables tracing when multiple projects exist without explicit key to prevent
 89        cross-project data leakage. Multi-project setups are experimental.
 90
 91    Example:
 92        ```python
 93        # Single project
 94        client = get_client()  # Default client
 95
 96        # In multi-project usage:
 97        client_a = get_client(public_key="project_a_key")  # Returns project A's client
 98        client_b = get_client(public_key="project_b_key")  # Returns project B's client
 99
100        # Without specific key in multi-project setup:
101        client = get_client()  # Returns disabled client for safety
102        ```
103    """
104    with LangfuseResourceManager._lock:
105        active_instances = LangfuseResourceManager._instances
106
107        # If no explicit public_key provided, check execution context
108        if not public_key:
109            public_key = _current_public_key.get(None)
110
111        if not public_key:
112            if len(active_instances) == 0:
113                # No clients initialized yet, create default instance
114                return Langfuse()
115
116            if len(active_instances) == 1:
117                # Only one client exists, safe to use without specifying key
118                instance = list(active_instances.values())[0]
119
120                # Initialize with the credentials bound to the instance
121                # This is important if the original instance was instantiated
122                # via constructor arguments
123                return _create_client_from_instance(instance)
124
125            else:
126                # Multiple clients exist but no key specified - disable tracing
127                # to prevent cross-project data leakage
128                langfuse_logger.warning(
129                    "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
130                )
131                return Langfuse(
132                    tracing_enabled=False, public_key="fake", secret_key="fake"
133                )
134
135        else:
136            # Specific key provided, look up existing instance
137            target_instance: Optional[LangfuseResourceManager] = active_instances.get(
138                public_key, None
139            )
140
141            if target_instance is None:
142                # No instance found with this key - client not initialized properly
143                langfuse_logger.warning(
144                    f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function."
145                )
146                return Langfuse(
147                    tracing_enabled=False, public_key="fake", secret_key="fake"
148                )
149
150            # target_instance is guaranteed to be not None at this point
151            return _create_client_from_instance(target_instance, public_key)

Get or create a Langfuse client instance.

Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.

Behavior:

  • Single project: Returns existing client or creates new one
  • Multi-project: Requires public_key to return specific client
  • No public_key in multi-project: Returns disabled client to prevent data leakage

The function uses a singleton pattern per public_key to conserve resources and maintain state.

Arguments:
  • public_key (Optional[str]): Project identifier
    • With key: Returns client for that project
    • Without key: Returns single client or disabled client if multiple exist
Returns:

Langfuse: Client instance in one of three states: 1. Client for specified public_key 2. Default client for single-project setup 3. Disabled client when multiple projects exist without key

Security:

Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.

Example:
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
def observe( func: Optional[~F] = None, *, name: Optional[str] = None, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], NoneType] = None, capture_input: Optional[bool] = None, capture_output: Optional[bool] = None, transform_to_string: Optional[Callable[[Iterable], str]] = None) -> Union[~F, Callable[[~F], ~F]]:
 88    def observe(
 89        self,
 90        func: Optional[F] = None,
 91        *,
 92        name: Optional[str] = None,
 93        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
 94        capture_input: Optional[bool] = None,
 95        capture_output: Optional[bool] = None,
 96        transform_to_string: Optional[Callable[[Iterable], str]] = None,
 97    ) -> Union[F, Callable[[F], F]]:
 98        """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
 99
100        This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates
101        spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator
102        intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
103
104        Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application,
105        enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
106
107        Args:
108            func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
109            name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
110            as_type (Optional[Literal]): Set the observation type. Supported values:
111                    "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail".
112                    Observation types are highlighted in the Langfuse UI for filtering and visualization.
113                    The types "generation" and "embedding" create a span on which additional attributes such as model metrics
114                    can be set.
115
116        Returns:
117            Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
118
119        Example:
120            For general function tracing with automatic naming:
121            ```python
122            @observe()
123            def process_user_request(user_id, query):
124                # Function is automatically traced with name "process_user_request"
125                return get_response(query)
126            ```
127
128            For language model generation tracking:
129            ```python
130            @observe(name="answer-generation", as_type="generation")
131            async def generate_answer(query):
132                # Creates a generation-type span with extended LLM metrics
133                response = await openai.chat.completions.create(
134                    model="gpt-4",
135                    messages=[{"role": "user", "content": query}]
136                )
137                return response.choices[0].message.content
138            ```
139
140            For trace context propagation between functions:
141            ```python
142            @observe()
143            def main_process():
144                # Parent span is created
145                return sub_process()  # Child span automatically connected to parent
146
147            @observe()
148            def sub_process():
149                # Automatically becomes a child span of main_process
150                return "result"
151            ```
152
153        Raises:
154            Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
155
156        Notes:
157            - The decorator preserves the original function's signature, docstring, and return type.
158            - Proper parent-child relationships between spans are automatically maintained.
159            - Special keyword arguments can be passed to control tracing:
160              - langfuse_trace_id: Explicitly set the trace ID for this function call
161              - langfuse_parent_observation_id: Explicitly set the parent span ID
162              - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
163            - For async functions, the decorator returns an async function wrapper.
164            - For sync functions, the decorator returns a synchronous wrapper.
165        """
166        valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
167        if as_type is not None and as_type not in valid_types:
168            logger.warning(
169                f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'."
170            )
171            as_type = "span"
172
173        function_io_capture_enabled = os.environ.get(
174            LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True"
175        ).lower() not in ("false", "0")
176
177        should_capture_input = (
178            capture_input if capture_input is not None else function_io_capture_enabled
179        )
180
181        should_capture_output = (
182            capture_output
183            if capture_output is not None
184            else function_io_capture_enabled
185        )
186
187        def decorator(func: F) -> F:
188            return (
189                self._async_observe(
190                    func,
191                    name=name,
192                    as_type=as_type,
193                    capture_input=should_capture_input,
194                    capture_output=should_capture_output,
195                    transform_to_string=transform_to_string,
196                )
197                if asyncio.iscoroutinefunction(func)
198                else self._sync_observe(
199                    func,
200                    name=name,
201                    as_type=as_type,
202                    capture_input=should_capture_input,
203                    capture_output=should_capture_output,
204                    transform_to_string=transform_to_string,
205                )
206            )
207
208        """Handle decorator with or without parentheses.
209
210        This logic enables the decorator to work both with and without parentheses:
211        - @observe - Python passes the function directly to the decorator
212        - @observe() - Python calls the decorator first, which must return a function decorator
213
214        When called without arguments (@observe), the func parameter contains the function to decorate,
215        so we directly apply the decorator to it. When called with parentheses (@observe()),
216        func is None, so we return the decorator function itself for Python to apply in the next step.
217        """
218        if func is None:
219            return decorator
220        else:
221            return decorator(func)

Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.

This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.

Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.

Arguments:
  • func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
  • name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
  • as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:

Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.

Example:

For general function tracing with automatic naming:

@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)

For language model generation tracking:

@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content

For trace context propagation between functions:

@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
Raises:
  • Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
  • The decorator preserves the original function's signature, docstring, and return type.
  • Proper parent-child relationships between spans are automatically maintained.
  • Special keyword arguments can be passed to control tracing:
    • langfuse_trace_id: Explicitly set the trace ID for this function call
    • langfuse_parent_observation_id: Explicitly set the parent span ID
    • langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
  • For async functions, the decorator returns an async function wrapper.
  • For sync functions, the decorator returns a synchronous wrapper.
def propagate_attributes( *, user_id: Optional[str] = None, session_id: Optional[str] = None, metadata: Optional[Dict[str, str]] = None, version: Optional[str] = None, tags: Optional[List[str]] = None, trace_name: Optional[str] = None, as_baggage: bool = False) -> opentelemetry.util._decorator._AgnosticContextManager[typing.Any]:
 95def propagate_attributes(
 96    *,
 97    user_id: Optional[str] = None,
 98    session_id: Optional[str] = None,
 99    metadata: Optional[Dict[str, str]] = None,
100    version: Optional[str] = None,
101    tags: Optional[List[str]] = None,
102    trace_name: Optional[str] = None,
103    as_baggage: bool = False,
104) -> _AgnosticContextManager[Any]:
105    """Propagate trace-level attributes to all spans created within this context.
106
107    This context manager sets attributes on the currently active span AND automatically
108    propagates them to all new child spans created within the context. This is the
109    recommended way to set trace-level attributes like user_id, session_id, and metadata
110    dimensions that should be consistently applied across all observations in a trace.
111
112    **IMPORTANT**: Call this as early as possible within your trace/workflow. Only the
113    currently active span and spans created after entering this context will have these
114    attributes. Pre-existing spans will NOT be retroactively updated.
115
116    **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id,
117    filtering by session_id) only include observations that have the attribute set.
118    If you call `propagate_attributes` late in your workflow, earlier spans won't be
119    included in aggregations for that attribute.
120
121    Args:
122        user_id: User identifier to associate with all spans in this context.
123            Must be US-ASCII string, ≤200 characters. Use this to track which user
124            generated each trace and enable e.g. per-user cost/performance analysis.
125        session_id: Session identifier to associate with all spans in this context.
126            Must be US-ASCII string, ≤200 characters. Use this to group related traces
127            within a user session (e.g., a conversation thread, multi-turn interaction).
128        metadata: Additional key-value metadata to propagate to all spans.
129            - Keys and values must be US-ASCII strings
130            - All values must be ≤200 characters
131            - Use for dimensions like internal correlating identifiers
132            - AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
133        version: Version identifier for parts of your application that are independently versioned, e.g. agents
134        tags: List of tags to categorize the group of observations
135        trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters.
136            Use this to set a consistent trace name for all spans created within this context.
137        as_baggage: If True, propagates attributes using OpenTelemetry baggage for
138            cross-process/service propagation. **Security warning**: When enabled,
139            attribute values are added to HTTP headers on ALL outbound requests.
140            Only enable if values are safe to transmit via HTTP headers and you need
141            cross-service tracing. Default: False.
142
143    Returns:
144        Context manager that propagates attributes to all child spans.
145
146    Example:
147        Basic usage with user and session tracking:
148
149        ```python
150        from langfuse import Langfuse
151
152        langfuse = Langfuse()
153
154        # Set attributes early in the trace
155        with langfuse.start_as_current_observation(name="user_workflow") as span:
156            with langfuse.propagate_attributes(
157                user_id="user_123",
158                session_id="session_abc",
159                metadata={"experiment": "variant_a", "environment": "production"}
160            ):
161                # All spans created here will have user_id, session_id, and metadata
162                with langfuse.start_observation(name="llm_call") as llm_span:
163                    # This span inherits: user_id, session_id, experiment, environment
164                    ...
165
166                with langfuse.start_generation(name="completion") as gen:
167                    # This span also inherits all attributes
168                    ...
169        ```
170
171        Late propagation (anti-pattern):
172
173        ```python
174        with langfuse.start_as_current_observation(name="workflow") as span:
175            # These spans WON'T have user_id
176            early_span = langfuse.start_observation(name="early_work")
177            early_span.end()
178
179            # Set attributes in the middle
180            with langfuse.propagate_attributes(user_id="user_123"):
181                # Only spans created AFTER this point will have user_id
182                late_span = langfuse.start_observation(name="late_work")
183                late_span.end()
184
185            # Result: Aggregations by user_id will miss "early_work" span
186        ```
187
188        Cross-service propagation with baggage (advanced):
189
190        ```python
191        # Service A - originating service
192        with langfuse.start_as_current_observation(name="api_request"):
193            with langfuse.propagate_attributes(
194                user_id="user_123",
195                session_id="session_abc",
196                as_baggage=True  # Propagate via HTTP headers
197            ):
198                # Make HTTP request to Service B
199                response = requests.get("https://service-b.example.com/api")
200                # user_id and session_id are now in HTTP headers
201
202        # Service B - downstream service
203        # OpenTelemetry will automatically extract baggage from HTTP headers
204        # and propagate to spans in Service B
205        ```
206
207    Note:
208        - **Validation**: All attribute values (user_id, session_id, metadata values)
209          must be strings ≤200 characters. Invalid values will be dropped with a
210          warning logged. Ensure values meet constraints before calling.
211        - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood,
212          making it compatible with other OTel-instrumented libraries.
213
214    Raises:
215        No exceptions are raised. Invalid values are logged as warnings and dropped.
216    """
217    return _propagate_attributes(
218        user_id=user_id,
219        session_id=session_id,
220        metadata=metadata,
221        version=version,
222        tags=tags,
223        trace_name=trace_name,
224        as_baggage=as_baggage,
225    )

Propagate trace-level attributes to all spans created within this context.

This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, and metadata dimensions that should be consistently applied across all observations in a trace.

IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.

Why this matters: Langfuse aggregation queries (e.g., total cost by user_id, filtering by session_id) only include observations that have the attribute set. If you call propagate_attributes late in your workflow, earlier spans won't be included in aggregations for that attribute.

Arguments:
  • user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
  • session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
  • metadata: Additional key-value metadata to propagate to all spans.
    • Keys and values must be US-ASCII strings
    • All values must be ≤200 characters
    • Use for dimensions like internal correlating identifiers
    • AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
  • version: Version identifier for parts of your application that are independently versioned, e.g. agents
  • tags: List of tags to categorize the group of observations
  • trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
  • as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:

Context manager that propagates attributes to all child spans.

Example:

Basic usage with user and session tracking:

from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_observation(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        metadata={"experiment": "variant_a", "environment": "production"}
    ):
        # All spans created here will have user_id, session_id, and metadata
        with langfuse.start_observation(name="llm_call") as llm_span:
            # This span inherits: user_id, session_id, experiment, environment
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...

Late propagation (anti-pattern):

with langfuse.start_as_current_observation(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_observation(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_observation(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span

Cross-service propagation with baggage (advanced):

# Service A - originating service
with langfuse.start_as_current_observation(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id and session_id are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate to spans in Service B
Note:
  • Validation: All attribute values (user_id, session_id, metadata values) must be strings ≤200 characters. Invalid values will be dropped with a warning logged. Ensure values meet constraints before calling.
  • OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
  • No exceptions are raised. Invalid values are logged as warnings and dropped.
ObservationTypeLiteral = typing.Union[typing.Literal['generation', 'embedding'], typing.Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], typing.Literal['event']]
class LangfuseSpan(langfuse._client.span.LangfuseObservationWrapper):
1247class LangfuseSpan(LangfuseObservationWrapper):
1248    """Standard span implementation for general operations in Langfuse.
1249
1250    This class represents a general-purpose span that can be used to trace
1251    any operation in your application. It specializes the base
1252    LangfuseObservationWrapper by fixing the observation type to "span";
1253    all other behavior is inherited. If possible, use a more specific type for
1254    better observability and insights.
1255    """
1256
1257    def __init__(
1258        self,
1259        *,
1260        otel_span: otel_trace_api.Span,
1261        langfuse_client: "Langfuse",
1262        input: Optional[Any] = None,
1263        output: Optional[Any] = None,
1264        metadata: Optional[Any] = None,
1265        environment: Optional[str] = None,
1266        release: Optional[str] = None,
1267        version: Optional[str] = None,
1268        level: Optional[SpanLevel] = None,
1269        status_message: Optional[str] = None,
1270    ):
1271        """Initialize a new LangfuseSpan.
1272
1273        Args:
1274            otel_span: The OpenTelemetry span to wrap
1275            langfuse_client: Reference to the parent Langfuse client
1276            input: Input data for the span (any JSON-serializable object)
1277            output: Output data from the span (any JSON-serializable object)
1278            metadata: Additional metadata to associate with the span
1279            environment: The tracing environment
1280            release: Release identifier for the application
1281            version: Version identifier for the code or component
1282            level: Importance level of the span (DEBUG, DEFAULT, WARNING, ERROR)
1283            status_message: Optional status message for the span
1284        """
1285        super().__init__(
1286            otel_span=otel_span,
1287            as_type="span",
1288            langfuse_client=langfuse_client,
1289            input=input,
1290            output=output,
1291            metadata=metadata,
1292            environment=environment,
1293            release=release,
1294            version=version,
1295            level=level,
1296            status_message=status_message,
1297        )

Standard span implementation for general operations in Langfuse.

This class represents a general-purpose span that can be used to trace any operation in your application. It specializes the base LangfuseObservationWrapper by fixing the observation type to "span"; all other behavior is inherited. If possible, use a more specific type for better observability and insights.

LangfuseSpan( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1257    def __init__(
1258        self,
1259        *,
1260        otel_span: otel_trace_api.Span,
1261        langfuse_client: "Langfuse",
1262        input: Optional[Any] = None,
1263        output: Optional[Any] = None,
1264        metadata: Optional[Any] = None,
1265        environment: Optional[str] = None,
1266        release: Optional[str] = None,
1267        version: Optional[str] = None,
1268        level: Optional[SpanLevel] = None,
1269        status_message: Optional[str] = None,
1270    ):
1271        """Initialize a new LangfuseSpan.
1272
1273        Args:
1274            otel_span: The OpenTelemetry span to wrap
1275            langfuse_client: Reference to the parent Langfuse client
1276            input: Input data for the span (any JSON-serializable object)
1277            output: Output data from the span (any JSON-serializable object)
1278            metadata: Additional metadata to associate with the span
1279            environment: The tracing environment
1280            release: Release identifier for the application
1281            version: Version identifier for the code or component
1282            level: Importance level of the span (DEBUG, DEFAULT, WARNING, ERROR)
1283            status_message: Optional status message for the span
1284        """
1285        super().__init__(
1286            otel_span=otel_span,
1287            as_type="span",
1288            langfuse_client=langfuse_client,
1289            input=input,
1290            output=output,
1291            metadata=metadata,
1292            environment=environment,
1293            release=release,
1294            version=version,
1295            level=level,
1296            status_message=status_message,
1297        )

Initialize a new LangfuseSpan.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the span (any JSON-serializable object)
  • output: Output data from the span (any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the code or component
  • level: Importance level of the span (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the span
class LangfuseGeneration(langfuse._client.span.LangfuseObservationWrapper):
1300class LangfuseGeneration(LangfuseObservationWrapper):
1301    """Specialized span implementation for AI model generations in Langfuse.
1302
1303    This class represents a generation span specifically designed for tracking
1304    AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized
1305    attributes for model details, token usage, and costs.
1306    """
1307
1308    def __init__(
1309        self,
1310        *,
1311        otel_span: otel_trace_api.Span,
1312        langfuse_client: "Langfuse",
1313        input: Optional[Any] = None,
1314        output: Optional[Any] = None,
1315        metadata: Optional[Any] = None,
1316        environment: Optional[str] = None,
1317        release: Optional[str] = None,
1318        version: Optional[str] = None,
1319        level: Optional[SpanLevel] = None,
1320        status_message: Optional[str] = None,
1321        completion_start_time: Optional[datetime] = None,
1322        model: Optional[str] = None,
1323        model_parameters: Optional[Dict[str, MapValue]] = None,
1324        usage_details: Optional[Dict[str, int]] = None,
1325        cost_details: Optional[Dict[str, float]] = None,
1326        prompt: Optional[PromptClient] = None,
1327    ):
1328        """Initialize a new LangfuseGeneration span.
1329
1330        Args:
1331            otel_span: The OpenTelemetry span to wrap
1332            langfuse_client: Reference to the parent Langfuse client
1333            input: Input data for the generation (e.g., prompts)
1334            output: Output from the generation (e.g., completions)
1335            metadata: Additional metadata to associate with the generation
1336            environment: The tracing environment
1337            release: Release identifier for the application
1338            version: Version identifier for the model or component
1339            level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
1340            status_message: Optional status message for the generation
1341            completion_start_time: When the model started generating the response
1342            model: Name/identifier of the AI model used (e.g., "gpt-4")
1343            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1344            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1345            cost_details: Cost information for the model call
1346            prompt: Associated prompt template from Langfuse prompt management
1347        """
1348        super().__init__(
1349            as_type="generation",
1350            otel_span=otel_span,
1351            langfuse_client=langfuse_client,
1352            input=input,
1353            output=output,
1354            metadata=metadata,
1355            environment=environment,
1356            release=release,
1357            version=version,
1358            level=level,
1359            status_message=status_message,
1360            completion_start_time=completion_start_time,
1361            model=model,
1362            model_parameters=model_parameters,
1363            usage_details=usage_details,
1364            cost_details=cost_details,
1365            prompt=prompt,
1366        )

Specialized span implementation for AI model generations in Langfuse.

This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.

LangfuseGeneration( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None)
1308    def __init__(
1309        self,
1310        *,
1311        otel_span: otel_trace_api.Span,
1312        langfuse_client: "Langfuse",
1313        input: Optional[Any] = None,
1314        output: Optional[Any] = None,
1315        metadata: Optional[Any] = None,
1316        environment: Optional[str] = None,
1317        release: Optional[str] = None,
1318        version: Optional[str] = None,
1319        level: Optional[SpanLevel] = None,
1320        status_message: Optional[str] = None,
1321        completion_start_time: Optional[datetime] = None,
1322        model: Optional[str] = None,
1323        model_parameters: Optional[Dict[str, MapValue]] = None,
1324        usage_details: Optional[Dict[str, int]] = None,
1325        cost_details: Optional[Dict[str, float]] = None,
1326        prompt: Optional[PromptClient] = None,
1327    ):
1328        """Initialize a new LangfuseGeneration span.
1329
1330        Args:
1331            otel_span: The OpenTelemetry span to wrap
1332            langfuse_client: Reference to the parent Langfuse client
1333            input: Input data for the generation (e.g., prompts)
1334            output: Output from the generation (e.g., completions)
1335            metadata: Additional metadata to associate with the generation
1336            environment: The tracing environment
1337            release: Release identifier for the application
1338            version: Version identifier for the model or component
1339            level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
1340            status_message: Optional status message for the generation
1341            completion_start_time: When the model started generating the response
1342            model: Name/identifier of the AI model used (e.g., "gpt-4")
1343            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1344            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1345            cost_details: Cost information for the model call
1346            prompt: Associated prompt template from Langfuse prompt management
1347        """
1348        super().__init__(
1349            as_type="generation",
1350            otel_span=otel_span,
1351            langfuse_client=langfuse_client,
1352            input=input,
1353            output=output,
1354            metadata=metadata,
1355            environment=environment,
1356            release=release,
1357            version=version,
1358            level=level,
1359            status_message=status_message,
1360            completion_start_time=completion_start_time,
1361            model=model,
1362            model_parameters=model_parameters,
1363            usage_details=usage_details,
1364            cost_details=cost_details,
1365            prompt=prompt,
1366        )

Initialize a new LangfuseGeneration span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the generation (e.g., prompts)
  • output: Output from the generation (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the model or component
  • level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
class LangfuseEvent(langfuse._client.span.LangfuseObservationWrapper):
1369class LangfuseEvent(LangfuseObservationWrapper):
1370    """Specialized span implementation for Langfuse Events."""
1371
1372    def __init__(
1373        self,
1374        *,
1375        otel_span: otel_trace_api.Span,
1376        langfuse_client: "Langfuse",
1377        input: Optional[Any] = None,
1378        output: Optional[Any] = None,
1379        metadata: Optional[Any] = None,
1380        environment: Optional[str] = None,
1381        release: Optional[str] = None,
1382        version: Optional[str] = None,
1383        level: Optional[SpanLevel] = None,
1384        status_message: Optional[str] = None,
1385    ):
1386        """Initialize a new LangfuseEvent span.
1387
1388        Args:
1389            otel_span: The OpenTelemetry span to wrap
1390            langfuse_client: Reference to the parent Langfuse client
1391            input: Input data for the event
1392            output: Output from the event
1393            metadata: Additional metadata to associate with the event
1394            environment: The tracing environment
1395            release: Release identifier for the application
1396            version: Version identifier for the code or component
1397            level: Importance level of the event (DEBUG, DEFAULT, WARNING, ERROR)
1398            status_message: Optional status message for the event
1399        """
1400        super().__init__(
1401            otel_span=otel_span,
1402            as_type="event",
1403            langfuse_client=langfuse_client,
1404            input=input,
1405            output=output,
1406            metadata=metadata,
1407            environment=environment,
1408            release=release,
1409            version=version,
1410            level=level,
1411            status_message=status_message,
1412        )
1413
1414    def update(
1415        self,
1416        *,
1417        name: Optional[str] = None,
1418        input: Optional[Any] = None,
1419        output: Optional[Any] = None,
1420        metadata: Optional[Any] = None,
1421        version: Optional[str] = None,
1422        level: Optional[SpanLevel] = None,
1423        status_message: Optional[str] = None,
1424        completion_start_time: Optional[datetime] = None,
1425        model: Optional[str] = None,
1426        model_parameters: Optional[Dict[str, MapValue]] = None,
1427        usage_details: Optional[Dict[str, int]] = None,
1428        cost_details: Optional[Dict[str, float]] = None,
1429        prompt: Optional[PromptClient] = None,
1430        **kwargs: Any,
1431    ) -> "LangfuseEvent":
1432        """Update is not allowed for LangfuseEvent because events cannot be updated.
1433
1434        All arguments are ignored: this method only logs a warning and returns self.
1435
1436        Returns:
1437            self: Returns the unchanged LangfuseEvent instance
1438        """
1439        langfuse_logger.warning(
1440            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1441        )
1442        return self

Specialized span implementation for Langfuse Events.

LangfuseEvent( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, release: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1372    def __init__(
1373        self,
1374        *,
1375        otel_span: otel_trace_api.Span,
1376        langfuse_client: "Langfuse",
1377        input: Optional[Any] = None,
1378        output: Optional[Any] = None,
1379        metadata: Optional[Any] = None,
1380        environment: Optional[str] = None,
1381        release: Optional[str] = None,
1382        version: Optional[str] = None,
1383        level: Optional[SpanLevel] = None,
1384        status_message: Optional[str] = None,
1385    ):
1386        """Initialize a new LangfuseEvent span.
1387
1388        Args:
1389            otel_span: The OpenTelemetry span to wrap
1390            langfuse_client: Reference to the parent Langfuse client
1391            input: Input data for the event
1392            output: Output from the event
1393            metadata: Additional metadata to associate with the event
1394            environment: The tracing environment
1395            release: Release identifier for the application
1396            version: Version identifier for the code or component
1397            level: Importance level of the event (DEBUG, DEFAULT, WARNING, ERROR)
1398            status_message: Optional status message for the event
1399        """
1400        super().__init__(
1401            otel_span=otel_span,
1402            as_type="event",
1403            langfuse_client=langfuse_client,
1404            input=input,
1405            output=output,
1406            metadata=metadata,
1407            environment=environment,
1408            release=release,
1409            version=version,
1410            level=level,
1411            status_message=status_message,
1412        )

Initialize a new LangfuseEvent span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the event
  • output: Output from the event
  • metadata: Additional metadata to associate with the event
  • environment: The tracing environment
  • release: Release identifier for the application
  • version: Version identifier for the code or component
  • level: Importance level of the event (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the event
def update( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, float, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, **kwargs: Any) -> LangfuseEvent:
1414    def update(
1415        self,
1416        *,
1417        name: Optional[str] = None,
1418        input: Optional[Any] = None,
1419        output: Optional[Any] = None,
1420        metadata: Optional[Any] = None,
1421        version: Optional[str] = None,
1422        level: Optional[SpanLevel] = None,
1423        status_message: Optional[str] = None,
1424        completion_start_time: Optional[datetime] = None,
1425        model: Optional[str] = None,
1426        model_parameters: Optional[Dict[str, MapValue]] = None,
1427        usage_details: Optional[Dict[str, int]] = None,
1428        cost_details: Optional[Dict[str, float]] = None,
1429        prompt: Optional[PromptClient] = None,
1430        **kwargs: Any,
1431    ) -> "LangfuseEvent":
1432        """Update is not allowed for LangfuseEvent because events cannot be updated.
1433
1434        This method logs a warning and returns self without making changes.
1435
1436        Returns:
1437            self: Returns the unchanged LangfuseEvent instance
1438        """
1439        langfuse_logger.warning(
1440            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1441        )
1442        return self

Update is not allowed for LangfuseEvent because events cannot be updated.

This method logs a warning and returns self without making changes.

Returns:

self: Returns the unchanged LangfuseEvent instance

class LangfuseOtelSpanAttributes:
class LangfuseOtelSpanAttributes:
    """Namespace of attribute-key string constants, grouped by what they describe."""

    # --- Trace-level keys ---
    # Note: the user and session keys are not prefixed with "langfuse.".
    TRACE_NAME = "langfuse.trace.name"
    TRACE_USER_ID = "user.id"
    TRACE_SESSION_ID = "session.id"
    TRACE_TAGS = "langfuse.trace.tags"
    TRACE_PUBLIC = "langfuse.trace.public"
    TRACE_METADATA = "langfuse.trace.metadata"
    TRACE_INPUT = "langfuse.trace.input"
    TRACE_OUTPUT = "langfuse.trace.output"

    # --- Keys shared by every observation ---
    OBSERVATION_TYPE = "langfuse.observation.type"
    OBSERVATION_METADATA = "langfuse.observation.metadata"
    OBSERVATION_LEVEL = "langfuse.observation.level"
    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
    OBSERVATION_INPUT = "langfuse.observation.input"
    OBSERVATION_OUTPUT = "langfuse.observation.output"

    # --- Keys specific to observations of type Generation ---
    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
    OBSERVATION_MODEL = "langfuse.observation.model.name"
    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"

    # --- General keys ---
    ENVIRONMENT = "langfuse.environment"
    RELEASE = "langfuse.release"
    VERSION = "langfuse.version"

    # --- Internal keys (under the "langfuse.internal." prefix) ---
    AS_ROOT = "langfuse.internal.as_root"
    IS_APP_ROOT = "langfuse.internal.is_app_root"

    # --- Experiment keys ---
    EXPERIMENT_ID = "langfuse.experiment.id"
    EXPERIMENT_NAME = "langfuse.experiment.name"
    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
TRACE_NAME = 'langfuse.trace.name'
TRACE_USER_ID = 'user.id'
TRACE_SESSION_ID = 'session.id'
TRACE_TAGS = 'langfuse.trace.tags'
TRACE_PUBLIC = 'langfuse.trace.public'
TRACE_METADATA = 'langfuse.trace.metadata'
TRACE_INPUT = 'langfuse.trace.input'
TRACE_OUTPUT = 'langfuse.trace.output'
OBSERVATION_TYPE = 'langfuse.observation.type'
OBSERVATION_METADATA = 'langfuse.observation.metadata'
OBSERVATION_LEVEL = 'langfuse.observation.level'
OBSERVATION_STATUS_MESSAGE = 'langfuse.observation.status_message'
OBSERVATION_INPUT = 'langfuse.observation.input'
OBSERVATION_OUTPUT = 'langfuse.observation.output'
OBSERVATION_COMPLETION_START_TIME = 'langfuse.observation.completion_start_time'
OBSERVATION_MODEL = 'langfuse.observation.model.name'
OBSERVATION_MODEL_PARAMETERS = 'langfuse.observation.model.parameters'
OBSERVATION_USAGE_DETAILS = 'langfuse.observation.usage_details'
OBSERVATION_COST_DETAILS = 'langfuse.observation.cost_details'
OBSERVATION_PROMPT_NAME = 'langfuse.observation.prompt.name'
OBSERVATION_PROMPT_VERSION = 'langfuse.observation.prompt.version'
ENVIRONMENT = 'langfuse.environment'
RELEASE = 'langfuse.release'
VERSION = 'langfuse.version'
AS_ROOT = 'langfuse.internal.as_root'
IS_APP_ROOT = 'langfuse.internal.is_app_root'
EXPERIMENT_ID = 'langfuse.experiment.id'
EXPERIMENT_NAME = 'langfuse.experiment.name'
EXPERIMENT_DESCRIPTION = 'langfuse.experiment.description'
EXPERIMENT_METADATA = 'langfuse.experiment.metadata'
EXPERIMENT_DATASET_ID = 'langfuse.experiment.dataset.id'
EXPERIMENT_ITEM_ID = 'langfuse.experiment.item.id'
EXPERIMENT_ITEM_EXPECTED_OUTPUT = 'langfuse.experiment.item.expected_output'
EXPERIMENT_ITEM_METADATA = 'langfuse.experiment.item.metadata'
EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = 'langfuse.experiment.item.root_observation_id'
class LangfuseAgent(langfuse._client.span.LangfuseObservationWrapper):
class LangfuseAgent(LangfuseObservationWrapper):
    """Observation wrapper whose type is fixed to ``"agent"``.

    Agent observations cover reasoning blocks that act on tools using LLM guidance.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Create the span, forcing ``as_type="agent"`` over any caller-supplied value."""
        agent_kwargs = {**kwargs, "as_type": "agent"}
        super().__init__(**agent_kwargs)

Agent observation for reasoning blocks that act on tools using LLM guidance.

LangfuseAgent(**kwargs: Any)
1448    def __init__(self, **kwargs: Any) -> None:
1449        """Initialize a new LangfuseAgent span."""
1450        kwargs["as_type"] = "agent"
1451        super().__init__(**kwargs)

Initialize a new LangfuseAgent span.

class LangfuseTool(langfuse._client.span.LangfuseObservationWrapper):
class LangfuseTool(LangfuseObservationWrapper):
    """Observation wrapper whose type is fixed to ``"tool"``.

    Tool observations represent external tool calls, e.g., calling a weather API.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Create the span, forcing ``as_type="tool"`` over any caller-supplied value."""
        tool_kwargs = {**kwargs, "as_type": "tool"}
        super().__init__(**tool_kwargs)

Tool observation representing external tool calls, e.g., calling a weather API.

LangfuseTool(**kwargs: Any)
1457    def __init__(self, **kwargs: Any) -> None:
1458        """Initialize a new LangfuseTool span."""
1459        kwargs["as_type"] = "tool"
1460        super().__init__(**kwargs)

Initialize a new LangfuseTool span.

class LangfuseChain(langfuse._client.span.LangfuseObservationWrapper):
class LangfuseChain(LangfuseObservationWrapper):
    """Observation wrapper whose type is fixed to ``"chain"``.

    Chain observations connect LLM application steps, e.g. passing context from retriever to LLM.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Create the span, forcing ``as_type="chain"`` over any caller-supplied value."""
        chain_kwargs = {**kwargs, "as_type": "chain"}
        super().__init__(**chain_kwargs)

Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.

LangfuseChain(**kwargs: Any)
1466    def __init__(self, **kwargs: Any) -> None:
1467        """Initialize a new LangfuseChain span."""
1468        kwargs["as_type"] = "chain"
1469        super().__init__(**kwargs)

Initialize a new LangfuseChain span.

class LangfuseEmbedding(langfuse._client.span.LangfuseObservationWrapper):
class LangfuseEmbedding(LangfuseObservationWrapper):
    """Observation wrapper whose type is fixed to ``"embedding"``.

    Embedding observations cover LLM embedding calls, typically used before retrieval.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Create the span, forcing ``as_type="embedding"`` over any caller-supplied value."""
        embedding_kwargs = {**kwargs, "as_type": "embedding"}
        super().__init__(**embedding_kwargs)

Embedding observation for LLM embedding calls, typically used before retrieval.

LangfuseEmbedding(**kwargs: Any)
1484    def __init__(self, **kwargs: Any) -> None:
1485        """Initialize a new LangfuseEmbedding span."""
1486        kwargs["as_type"] = "embedding"
1487        super().__init__(**kwargs)

Initialize a new LangfuseEmbedding span.

class LangfuseEvaluator(langfuse._client.span.LangfuseObservationWrapper):
class LangfuseEvaluator(LangfuseObservationWrapper):
    """Observation wrapper whose type is fixed to ``"evaluator"``.

    Evaluator observations assess relevance, correctness, or helpfulness of LLM outputs.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Create the span, forcing ``as_type="evaluator"`` over any caller-supplied value."""
        evaluator_kwargs = {**kwargs, "as_type": "evaluator"}
        super().__init__(**evaluator_kwargs)

Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.

LangfuseEvaluator(**kwargs: Any)
1493    def __init__(self, **kwargs: Any) -> None:
1494        """Initialize a new LangfuseEvaluator span."""
1495        kwargs["as_type"] = "evaluator"
1496        super().__init__(**kwargs)

Initialize a new LangfuseEvaluator span.

class LangfuseRetriever(langfuse._client.span.LangfuseObservationWrapper):
class LangfuseRetriever(LangfuseObservationWrapper):
    """Observation wrapper whose type is fixed to ``"retriever"``.

    Retriever observations cover data retrieval steps, e.g. vector store or database queries.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Create the span, forcing ``as_type="retriever"`` over any caller-supplied value."""
        retriever_kwargs = {**kwargs, "as_type": "retriever"}
        super().__init__(**retriever_kwargs)

Retriever observation for data retrieval steps, e.g. vector store or database queries.

LangfuseRetriever(**kwargs: Any)
1475    def __init__(self, **kwargs: Any) -> None:
1476        """Initialize a new LangfuseRetriever span."""
1477        kwargs["as_type"] = "retriever"
1478        super().__init__(**kwargs)

Initialize a new LangfuseRetriever span.

class LangfuseGuardrail(langfuse._client.span.LangfuseObservationWrapper):
class LangfuseGuardrail(LangfuseObservationWrapper):
    """Observation wrapper whose type is fixed to ``"guardrail"``.

    Guardrail observations are for protection e.g. against jailbreaks or offensive content.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Create the span, forcing ``as_type="guardrail"`` over any caller-supplied value."""
        guardrail_kwargs = {**kwargs, "as_type": "guardrail"}
        super().__init__(**guardrail_kwargs)

Guardrail observation for protection e.g. against jailbreaks or offensive content.

LangfuseGuardrail(**kwargs: Any)
1502    def __init__(self, **kwargs: Any) -> None:
1503        """Initialize a new LangfuseGuardrail span."""
1504        kwargs["as_type"] = "guardrail"
1505        super().__init__(**kwargs)

Initialize a new LangfuseGuardrail span.

class Evaluation:
class Evaluation:
    """Result of evaluating a single experiment item or a whole experiment run.

    Evaluator functions return instances of this class as a strongly-typed
    result container. The constructor is keyword-only.

    Attributes:
        name: Identifier of the evaluation metric. Keep it descriptive and
            stable across runs (e.g., "accuracy", "bleu_score", "toxicity"),
            since it is used to aggregate and compare experiment runs.
        value: The score or result of the evaluation. One of:
            - Numeric (int/float): quantitative metrics such as accuracy (0.85), BLEU (0.42)
            - String: categorical results such as "positive", "negative", "neutral"
            - Boolean: binary assessments such as "passes_safety_check"
        comment: Optional human-readable explanation of the result, e.g. the
            scoring rationale or special conditions. Displayed in the Langfuse
            UI for interpretability.
        metadata: Optional structured metadata about the evaluation process,
            such as confidence scores, intermediate calculations, model
            versions, or other technical details.
        data_type: Optional score data type. Required if value is not NUMERIC.
            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
        config_id: Optional Langfuse score config ID.

    Examples:
        Basic accuracy evaluation:
        ```python
        from langfuse import Evaluation

        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
            if not expected_output:
                return Evaluation(name="accuracy", value=0, comment="No expected output")

            is_correct = output.strip().lower() == expected_output.strip().lower()
            return Evaluation(
                name="accuracy",
                value=1.0 if is_correct else 0.0,
                comment="Correct answer" if is_correct else "Incorrect answer"
            )
        ```

        Multi-metric evaluator:
        ```python
        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
            return [
                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
                Evaluation(
                    name="quality",
                    value=0.85,
                    comment="High quality response",
                    metadata={"confidence": 0.92, "model": "gpt-4"}
                )
            ]
        ```

        Categorical evaluation:
        ```python
        def sentiment_evaluator(*, input, output, **kwargs):
            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
            return Evaluation(
                name="sentiment",
                value=sentiment,
                comment=f"Response expresses {sentiment} sentiment",
                data_type="CATEGORICAL"
            )
        ```

        Failed evaluation with error handling:
        ```python
        def external_api_evaluator(*, input, output, **kwargs):
            try:
                score = external_api.evaluate(output)
                return Evaluation(name="external_score", value=score)
            except Exception as e:
                return Evaluation(
                    name="external_score",
                    value=0,
                    comment=f"API unavailable: {e}",
                    metadata={"error": str(e), "retry_count": 3}
                )
        ```

    Note:
        Every argument must be passed by keyword. Positional arguments are
        rejected to keep call sites clear and safe against argument reordering.
    """

    def __init__(
        self,
        *,
        name: str,
        value: Union[int, float, str, bool],
        comment: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        data_type: Optional[ExperimentScoreType] = None,
        config_id: Optional[str] = None,
    ):
        """Store the supplied evaluation data on the instance as-is.

        Args:
            name: Identifier of the evaluation metric.
            value: The score or result of the evaluation.
            comment: Optional human-readable explanation of the result.
            metadata: Optional structured metadata about the evaluation process.
            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
            config_id: Optional Langfuse score config ID.

        Note:
            The signature is keyword-only; passing positional arguments raises a TypeError.
        """
        # No validation or conversion happens here; values are stored verbatim.
        self.name, self.value = name, value
        self.comment, self.metadata = comment, metadata
        self.data_type, self.config_id = data_type, config_id

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=0, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=0,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, config_id: Optional[str] = None)
189    def __init__(
190        self,
191        *,
192        name: str,
193        value: Union[int, float, str, bool],
194        comment: Optional[str] = None,
195        metadata: Optional[Dict[str, Any]] = None,
196        data_type: Optional[ExperimentScoreType] = None,
197        config_id: Optional[str] = None,
198    ):
199        """Initialize an Evaluation with the provided data.
200
201        Args:
202            name: Unique identifier for the evaluation metric.
203            value: The evaluation score or result.
204            comment: Optional human-readable explanation of the result.
205            metadata: Optional structured metadata about the evaluation process.
206            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
207            config_id: Optional Langfuse score config ID.
208
209        Note:
210            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
211        """
212        self.name = name
213        self.value = value
214        self.comment = comment
215        self.metadata = metadata
216        self.data_type = data_type
217        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
class EvaluatorInputs:
 38class EvaluatorInputs:
 39    """Input data structure for evaluators, returned by mapper functions.
 40
 41    This class provides a strongly-typed container for transforming API response
 42    objects (traces, observations) into the standardized format expected
 43    by evaluator functions. It ensures consistent access to input, output, expected
 44    output, and metadata regardless of the source entity type.
 45
 46    Attributes:
 47        input: The input data that was provided to generate the output being evaluated.
 48            For traces, this might be the initial prompt or request. For observations,
 49            this could be the span's input. The exact meaning depends on your use case.
 50        output: The actual output that was produced and needs to be evaluated.
 51            For traces, this is typically the final response. For observations,
 52            this might be the generation output or span result.
 53        expected_output: Optional ground truth or expected result for comparison.
 54            Used by evaluators to assess correctness. May be None if no ground truth
 55            is available for the entity being evaluated.
 56        metadata: Optional structured metadata providing additional context for evaluation.
 57            Can include information about the entity, execution context, user attributes,
 58            or any other relevant data that evaluators might use.
 59
 60    Examples:
 61        Simple mapper for traces:
 62        ```python
 63        from langfuse import EvaluatorInputs
 64
 65        def trace_mapper(trace):
 66            return EvaluatorInputs(
 67                input=trace.input,
 68                output=trace.output,
 69                expected_output=None,  # No ground truth available
 70                metadata={"user_id": trace.user_id, "tags": trace.tags}
 71            )
 72        ```
 73
 74        Mapper for observations extracting specific fields:
 75        ```python
 76        def observation_mapper(observation):
 77            # Extract input/output from observation's data
 78            input_data = observation.input if hasattr(observation, 'input') else None
 79            output_data = observation.output if hasattr(observation, 'output') else None
 80
 81            return EvaluatorInputs(
 82                input=input_data,
 83                output=output_data,
 84                expected_output=None,
 85                metadata={
 86                    "observation_type": observation.type,
 87                    "model": observation.model,
 88                    "latency_ms": observation.end_time - observation.start_time
 89                }
 90            )
 91        ```
 92        ```
 93
 94    Note:
 95        All arguments must be passed as keywords when instantiating this class.
 96    """
 97
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Input data structure for evaluators, returned by mapper functions.

This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.

Attributes:
  • input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
  • output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
  • expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
  • metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:

Simple mapper for traces:

from langfuse import EvaluatorInputs

def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,  # No ground truth available
        metadata={"user_id": trace.user_id, "tags": trace.tags}
    )

Mapper for observations extracting specific fields:

def observation_mapper(observation):
    # Extract input/output from observation's data
    input_data = observation.input if hasattr(observation, 'input') else None
    output_data = observation.output if hasattr(observation, 'output') else None

    return EvaluatorInputs(
        input=input_data,
        output=output_data,
        expected_output=None,
        metadata={
            "observation_type": observation.type,
            "model": observation.model,
            "latency_ms": observation.end_time - observation.start_time
        }
    )

Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorInputs( *, input: Any, output: Any, expected_output: Any = None, metadata: Optional[Dict[str, Any]] = None)
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Initialize EvaluatorInputs with the provided data.

Arguments:
  • input: The input data for evaluation.
  • output: The output data to be evaluated.
  • expected_output: Optional ground truth for comparison.
  • metadata: Optional additional context for evaluation.
Note:

All arguments must be provided as keywords.

input
output
expected_output
metadata
class MapperFunction(typing.Protocol):
123class MapperFunction(Protocol):
124    """Protocol defining the interface for mapper functions in batch evaluation.
125
126    Mapper functions transform API response objects (traces or observations)
127    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
128    allows you to define how to extract and structure evaluation data from different
129    entity types.
130
131    Mapper functions:
132    - Must accept the entity (a trace or an observation) as the keyword argument `item`
133    - Must return an EvaluatorInputs instance with input, output, expected_output, metadata
134    - Can be either synchronous or asynchronous
135    - Should handle missing or malformed data gracefully
136    """
137
138    def __call__(
139        self,
140        *,
141        item: Union["TraceWithFullDetails", "ObservationsView"],
142        **kwargs: Dict[str, Any],
143    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
144        """Transform an API response object into evaluator inputs.
145
146        This method defines how to extract evaluation-relevant data from the raw
147        API response object. The implementation should map entity-specific fields
148        to the standardized input/output/expected_output/metadata structure.
149
150        Args:
151            item: The API response object to transform. The type depends on the scope:
152                - TraceWithFullDetails: When evaluating traces
153                - ObservationsView: When evaluating observations
154
155        Returns:
156            EvaluatorInputs: A structured container with:
157                - input: The input data that generated the output
158                - output: The output to be evaluated
159                - expected_output: Optional ground truth for comparison
160                - metadata: Optional additional context
161
162            Can return either a direct EvaluatorInputs instance or an awaitable
163            (for async mappers that need to fetch additional data).
164
165        Examples:
166            Basic trace mapper (the entity is passed as the keyword argument `item`):
167            ```python
168            def map_trace(*, item, **kwargs):
169                return EvaluatorInputs(
170                    input=item.input,
171                    output=item.output,
172                    expected_output=None,
173                    metadata={"trace_id": item.id, "user": item.user_id}
174                )
175            ```
176
177            Observation mapper with conditional logic:
178            ```python
179            def map_observation(*, item, **kwargs):
180                # Extract fields based on observation type
181                if item.type == "GENERATION":
182                    input_data = item.input
183                    output_data = item.output
184                else:
185                    # For other types, fall back to metadata (guarding against it being unset)
186                    input_data = (item.metadata or {}).get("input")
187                    output_data = (item.metadata or {}).get("output")
188
189                return EvaluatorInputs(
190                    input=input_data,
191                    output=output_data,
192                    expected_output=None,
193                    metadata={"obs_id": item.id, "type": item.type}
194                )
195            ```
196
197            Async mapper (if additional processing needed):
198            ```python
199            async def map_trace_async(*, item, **kwargs):
200                # Could do async processing here if needed
201                processed_output = await some_async_transformation(item.output)
202
203                return EvaluatorInputs(
204                    input=item.input,
205                    output=processed_output,
206                    expected_output=None,
207                    metadata={"trace_id": item.id}
208                )
209            ```
210        """
211        ...

Protocol defining the interface for mapper functions in batch evaluation.

Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.

Mapper functions:

  • Must accept the entity (a trace or an observation) as the keyword argument item
  • Must return an EvaluatorInputs instance with input, output, expected_output, metadata
  • Can be either synchronous or asynchronous
  • Should handle missing or malformed data gracefully
MapperFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class CompositeEvaluatorFunction(typing.Protocol):
214class CompositeEvaluatorFunction(Protocol):
215    """Protocol defining the interface for composite evaluator functions.
216
217    Composite evaluators create aggregate scores from multiple item-level evaluations.
218    This is commonly used to compute weighted averages, combined metrics, or other
219    composite assessments based on individual evaluation results.
220
221    Composite evaluators:
222    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
223      plus the list of evaluations
224    - Return either a single Evaluation, a list of Evaluations, or a dict
225    - Can be either synchronous or asynchronous
226    - Have access to both raw item data and evaluation results
227    """
228
229    def __call__(
230        self,
231        *,
232        input: Optional[Any] = None,
233        output: Optional[Any] = None,
234        expected_output: Optional[Any] = None,
235        metadata: Optional[Dict[str, Any]] = None,
236        evaluations: List[Evaluation],
237        **kwargs: Dict[str, Any],
238    ) -> Union[
239        Evaluation,
240        List[Evaluation],
241        Dict[str, Any],
242        Awaitable[Evaluation],
243        Awaitable[List[Evaluation]],
244        Awaitable[Dict[str, Any]],
245    ]:
246        r"""Create a composite evaluation from item-level evaluation results.
247
248        This method combines multiple evaluation scores into a single composite metric.
249        Common use cases include weighted averages, pass/fail decisions based on multiple
250        criteria, or custom scoring logic that considers multiple dimensions.
251
252        Args:
253            input: The input data that was provided to the system being evaluated.
254            output: The output generated by the system being evaluated.
255            expected_output: The expected/reference output for comparison (if available).
256            metadata: Additional metadata about the evaluation context.
257            evaluations: List of evaluation results from item-level evaluators.
258                Each evaluation contains name, value, comment, and metadata.
259
260        Returns:
261            Can return any of:
262            - Evaluation: A single composite evaluation result
263            - List[Evaluation]: Multiple composite evaluations
264            - Dict: A dict that will be converted to an Evaluation
265                - name: Identifier for the composite metric (e.g., "composite_score")
266                - value: The computed composite value
267                - comment: Optional explanation of how the score was computed
268                - metadata: Optional details about the composition logic
269
270            Can return either a direct Evaluation instance or an awaitable
271            (for async composite evaluators).
272
273        Examples:
274            Simple weighted average:
275            ```python
276            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
277                weights = {
278                    "accuracy": 0.5,
279                    "relevance": 0.3,
280                    "safety": 0.2
281                }
282
283                total_score = 0.0
284                total_weight = 0.0
285
286                for evaluation in evaluations:
287                    if evaluation.name in weights and isinstance(evaluation.value, (int, float)):
288                        total_score += evaluation.value * weights[evaluation.name]
289                        total_weight += weights[evaluation.name]
290
291                final_score = total_score / total_weight if total_weight > 0 else 0.0
292
293                return Evaluation(
294                    name="composite_score",
295                    value=final_score,
296                    comment=f"Weighted average of {len(evaluations)} metrics"
297                )
298            ```
299
300            Pass/fail composite based on thresholds:
301            ```python
302            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
303                # Must pass all criteria
304                thresholds = {
305                    "accuracy": 0.7,
306                    "safety": 0.9,
307                    "relevance": 0.6
308                }
309
310                passes = True
311                failing_metrics = []
312
313                for metric, threshold in thresholds.items():
314                    eval_result = next((e for e in evaluations if e.name == metric), None)
315                    if eval_result and isinstance(eval_result.value, (int, float)):
316                        if eval_result.value < threshold:
317                            passes = False
318                            failing_metrics.append(metric)
319
320                return Evaluation(
321                    name="passes_all_checks",
322                    value=passes,
323                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
324                    data_type="BOOLEAN"
325                )
326            ```
327
328            Async composite with external scoring:
329            ```python
330            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
331                # Use LLM to synthesize multiple evaluation results
332                eval_summary = "\n".join(
333                    f"- {e.name}: {e.value}" for e in evaluations
334                )
335
336                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
337                prompt += f"For the output: {output}\n"
338                prompt += "Provide an overall quality score from 0-1."
339
340                response = await openai.chat.completions.create(
341                    model="gpt-4",
342                    messages=[{"role": "user", "content": prompt}]
343                )
344
345                score = float(response.choices[0].message.content.strip())
346
347                return Evaluation(
348                    name="llm_composite_score",
349                    value=score,
350                    comment="LLM-synthesized composite score"
351                )
352            ```
353
354            Context-aware composite:
355            ```python
356            def context_composite(*, input, output, expected_output, metadata, evaluations):
357                # Adjust weighting based on metadata
358                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}
359
360                # If metadata indicates high importance, prioritize accuracy
361                if metadata and metadata.get('importance') == 'high':
362                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
363                else:
364                    weights = base_weights
365
366                total = sum(
367                    e.value * weights.get(e.name, 0)
368                    for e in evaluations
369                    if isinstance(e.value, (int, float))
370                )
371
372                return Evaluation(
373                    name="weighted_composite",
374                    value=total,
375                    comment="Context-aware weighted composite"
376                )
377            ```
378        """
379        ...

Protocol defining the interface for composite evaluator functions.

Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.

Composite evaluators:

  • Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
  • Return either a single Evaluation, a list of Evaluations, or a dict
  • Can be either synchronous or asynchronous
  • Have access to both raw item data and evaluation results
CompositeEvaluatorFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class EvaluatorStats:
382class EvaluatorStats:
383    """Statistics for a single evaluator's performance during batch evaluation.
384
385    This class tracks detailed metrics about how a specific evaluator performed
386    across all items in a batch evaluation run. It helps identify evaluator issues,
387    understand reliability, and optimize evaluation pipelines.
388
389    Attributes:
390        name: The name of the evaluator function (extracted from __name__).
391        total_runs: Total number of times the evaluator was invoked.
392        successful_runs: Number of times the evaluator completed successfully.
393        failed_runs: Number of times the evaluator raised an exception or failed.
394        total_scores_created: Total number of evaluation scores created by this evaluator.
395            Can be higher than successful_runs if the evaluator returns multiple scores.
396
397    Examples:
398        Accessing evaluator stats from batch evaluation result:
399        ```python
400        result = client.run_batched_evaluation(...)
401
402        for stats in result.evaluator_stats:
403            print(f"Evaluator: {stats.name}")
404            print(f"  Success rate: {stats.successful_runs / max(stats.total_runs, 1):.1%}")
405            print(f"  Scores created: {stats.total_scores_created}")
406
407            if stats.failed_runs > 0:
408                print(f"  ⚠️  Failed {stats.failed_runs} times")
409        ```
410
411        Identifying problematic evaluators:
412        ```python
413        result = client.run_batched_evaluation(...)
414
415        # Find evaluators with high failure rates
416        for stats in result.evaluator_stats:
417            failure_rate = stats.failed_runs / stats.total_runs if stats.total_runs else 0.0
418            if failure_rate > 0.1:  # More than 10% failures
419                print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
420                print(f"    Consider debugging or removing this evaluator")
421        ```
422
423    Note:
424        All arguments must be passed as keywords when instantiating this class.
425    """
426
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Statistics for a single evaluator's performance during batch evaluation.

This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.

Attributes:
  • name: The name of the evaluator function (extracted from __name__).
  • total_runs: Total number of times the evaluator was invoked.
  • successful_runs: Number of times the evaluator completed successfully.
  • failed_runs: Number of times the evaluator raised an exception or failed.
  • total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:

Accessing evaluator stats from batch evaluation result:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    print(f"Evaluator: {stats.name}")
    print(f"  Success rate: {stats.successful_runs / max(stats.total_runs, 1):.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️  Failed {stats.failed_runs} times")

Identifying problematic evaluators:

result = client.run_batched_evaluation(...)

# Find evaluators with high failure rates
for stats in result.evaluator_stats:
    failure_rate = stats.failed_runs / stats.total_runs if stats.total_runs else 0.0
    if failure_rate > 0.1:  # More than 10% failures
        print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
        print(f"    Consider debugging or removing this evaluator")
Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorStats( *, name: str, total_runs: int = 0, successful_runs: int = 0, failed_runs: int = 0, total_scores_created: int = 0)
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Initialize EvaluatorStats with the provided metrics.

Arguments:
  • name: The evaluator function name.
  • total_runs: Total number of evaluator invocations.
  • successful_runs: Number of successful completions.
  • failed_runs: Number of failures.
  • total_scores_created: Total scores created by this evaluator.
Note:

All arguments must be provided as keywords.

name
total_runs
successful_runs
failed_runs
total_scores_created
class BatchEvaluationResumeToken:
455class BatchEvaluationResumeToken:
456    """Token for resuming a failed batch evaluation run.
457
458    This class encapsulates all the information needed to resume a batch evaluation
459    that was interrupted or failed partway through. It uses timestamp-based filtering
460    to avoid re-processing items that were already evaluated, even if the underlying
461    dataset changed between runs.
462
463    Attributes:
464        scope: The type of items being evaluated ("traces", "observations").
465        filter: The original JSON filter string used to query items.
466        last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item.
467            Used to construct a filter that only fetches items after this timestamp.
468        last_processed_id: The ID of the last successfully processed item, for reference.
469        items_processed: Count of items successfully processed before interruption.
470
471    Examples:
472        Resuming a failed batch evaluation:
473        ```python
474        import json
475
476        # Initial run that stops partway through
477        result = client.run_batched_evaluation(
478            scope="traces",
479            mapper=my_mapper,
480            evaluators=[evaluator1, evaluator2],
481            filter='{"tags": ["production"]}',
482            max_items=10000
483        )
484
485        # A run that stops early returns completed=False and a resume token
486        if not result.completed and result.resume_token:
487            print(f"Evaluation stopped after {result.resume_token.items_processed} items")
488
489            # Store resume token for later (e.g., in a file or database)
490            with open("resume_token.json", "w") as f:
491                json.dump({
492                    "scope": result.resume_token.scope,
493                    "filter": result.resume_token.filter,
494                    "last_timestamp": result.resume_token.last_processed_timestamp,
495                    "last_id": result.resume_token.last_processed_id,
496                    "items_done": result.resume_token.items_processed
497                }, f)
498
499        # Later, resume from where it left off
500        with open("resume_token.json") as f:
501            token_data = json.load(f)
502
503        resume_token = BatchEvaluationResumeToken(
504            scope=token_data["scope"],
505            filter=token_data["filter"],
506            last_processed_timestamp=token_data["last_timestamp"],
507            last_processed_id=token_data["last_id"],
508            items_processed=token_data["items_done"]
509        )
510
511        # Resume the evaluation
512        result = client.run_batched_evaluation(
513            scope="traces",
514            mapper=my_mapper,
515            evaluators=[evaluator1, evaluator2],
516            resume_from=resume_token
517        )
518
519        print(f"Processed {result.total_items_processed} additional items")
520        ```
521
522        Handling partial completion:
523        ```python
524        result = client.run_batched_evaluation(...)
525
526        if not result.completed:
527            print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
528            print(f"Last item: {result.resume_token.last_processed_id}")
529            print(f"Resume from: {result.resume_token.last_processed_timestamp}")
530
531            # Optionally retry automatically
532            if result.resume_token:
533                print("Retrying...")
534                result = client.run_batched_evaluation(
535                    scope=result.resume_token.scope,
536                    mapper=my_mapper,
537                    evaluators=my_evaluators,
538                    resume_from=result.resume_token
539                )
540        ```
541
542    Note:
543        All arguments must be passed as keywords when instantiating this class.
544        Resuming only fetches items with a timestamp later than last_processed_timestamp,
545        so items added after the initial run with an earlier timestamp are skipped. This
546        is intentional to avoid duplicates and ensure consistent evaluation.
547    """
548
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Token for resuming a failed batch evaluation run.

This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.

Attributes:
  • scope: The type of items being evaluated ("traces", "observations").
  • filter: The original JSON filter string used to query items.
  • last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
  • last_processed_id: The ID of the last successfully processed item, for reference.
  • items_processed: Count of items successfully processed before interruption.
Examples:

Resuming a failed batch evaluation:

import json

# Initial run that stops partway through
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    filter='{"tags": ["production"]}',
    max_items=10000
)

# A run that stops early returns completed=False and a resume token
if not result.completed and result.resume_token:
    print(f"Evaluation stopped after {result.resume_token.items_processed} items")

    # Store resume token for later (e.g., in a file or database)
    with open("resume_token.json", "w") as f:
        json.dump({
            "scope": result.resume_token.scope,
            "filter": result.resume_token.filter,
            "last_timestamp": result.resume_token.last_processed_timestamp,
            "last_id": result.resume_token.last_processed_id,
            "items_done": result.resume_token.items_processed
        }, f)

# Later, resume from where it left off
with open("resume_token.json") as f:
    token_data = json.load(f)

resume_token = BatchEvaluationResumeToken(
    scope=token_data["scope"],
    filter=token_data["filter"],
    last_processed_timestamp=token_data["last_timestamp"],
    last_processed_id=token_data["last_id"],
    items_processed=token_data["items_done"]
)

# Resume the evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    resume_from=resume_token
)

print(f"Processed {result.total_items_processed} additional items")

Handling partial completion:

result = client.run_batched_evaluation(...)

if not result.completed:
    print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
    print(f"Last item: {result.resume_token.last_processed_id}")
    print(f"Resume from: {result.resume_token.last_processed_timestamp}")

    # Optionally retry automatically
    if result.resume_token:
        print("Retrying...")
        result = client.run_batched_evaluation(
            scope=result.resume_token.scope,
            mapper=my_mapper,
            evaluators=my_evaluators,
            resume_from=result.resume_token
        )
Note:

All arguments must be passed as keywords when instantiating this class. Resuming only fetches items with a timestamp later than last_processed_timestamp, so items added after the initial run with an earlier timestamp are skipped. This is intentional to avoid duplicates and ensure consistent evaluation.

BatchEvaluationResumeToken( *, scope: str, filter: Optional[str], last_processed_timestamp: str, last_processed_id: str, items_processed: int)
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Initialize BatchEvaluationResumeToken with the provided state.

Arguments:
  • scope: The scope type ("traces", "observations").
  • filter: The original JSON filter string.
  • last_processed_timestamp: ISO 8601 timestamp of last processed item.
  • last_processed_id: ID of last processed item.
  • items_processed: Count of items processed before interruption.
Note:

All arguments must be provided as keywords.

scope
filter
last_processed_timestamp
last_processed_id
items_processed
class BatchEvaluationResult:
577class BatchEvaluationResult:
578    r"""Complete result structure for batch evaluation execution.
579
580    This class encapsulates comprehensive statistics and metadata about a batch
581    evaluation run, including counts, evaluator-specific metrics, timing information,
582    error details, and resume capability.
583
584    Attributes:
585        total_items_fetched: Total number of items fetched from the API.
586        total_items_processed: Number of items successfully evaluated.
587        total_items_failed: Number of items that failed during evaluation.
588        total_scores_created: Total scores created by all item-level evaluators.
589        total_composite_scores_created: Scores created by the composite evaluator.
590        total_evaluations_failed: Number of individual evaluator failures across all items.
591        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
592        resume_token: Token for resuming if evaluation was interrupted (None if completed).
593        completed: True if all items were processed, False if stopped early or failed.
594        duration_seconds: Total time taken to execute the batch evaluation.
595        failed_item_ids: List of IDs for items that failed evaluation.
596        error_summary: Dictionary mapping error types to occurrence counts.
597        has_more_items: True if max_items limit was reached but more items exist.
598        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
599
600    Examples:
601        Basic result inspection:
602        ```python
603        result = client.run_batched_evaluation(...)
604
605        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
606        print(f"Scores created: {result.total_scores_created}")
607        print(f"Duration: {result.duration_seconds:.2f}s")
608        print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
609        ```
610
611        Detailed analysis with evaluator stats:
612        ```python
613        result = client.run_batched_evaluation(...)
614
615        print(f"\n📊 Batch Evaluation Results")
616        print(f"{'='*50}")
617        print(f"Items processed: {result.total_items_processed}")
618        print(f"Items failed: {result.total_items_failed}")
619        print(f"Scores created: {result.total_scores_created}")
620
621        if result.total_composite_scores_created > 0:
622            print(f"Composite scores: {result.total_composite_scores_created}")
623
624        print(f"\n📈 Evaluator Performance:")
625        for stats in result.evaluator_stats:
626            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
627            print(f"\n  {stats.name}:")
628            print(f"    Success rate: {success_rate:.1%}")
629            print(f"    Scores created: {stats.total_scores_created}")
630            if stats.failed_runs > 0:
631                print(f"    ⚠️  Failures: {stats.failed_runs}")
632
633        if result.error_summary:
634            print(f"\n⚠️  Errors encountered:")
635            for error_type, count in result.error_summary.items():
636                print(f"    {error_type}: {count}")
637        ```
638
639        Handling incomplete runs:
640        ```python
641        result = client.run_batched_evaluation(...)
642
643        if not result.completed:
644            print("⚠️  Evaluation incomplete!")
645
646            if result.resume_token:
647                print(f"Processed {result.resume_token.items_processed} items before failure")
648                print(f"Use resume_from parameter to continue from:")
649                print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
650                print(f"  Last ID: {result.resume_token.last_processed_id}")
651
652        if result.has_more_items:
653            print(f"ℹ️  More items available beyond max_items limit")
654        ```
655
656        Performance monitoring:
657        ```python
658        result = client.run_batched_evaluation(...)
659
660        items_per_second = result.total_items_processed / result.duration_seconds
661        avg_scores_per_item = result.total_scores_created / result.total_items_processed
662
663        print(f"Performance metrics:")
664        print(f"  Throughput: {items_per_second:.2f} items/second")
665        print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
666        print(f"  Total duration: {result.duration_seconds:.2f}s")
667
668        if result.total_evaluations_failed > 0:
669            failure_rate = result.total_evaluations_failed / (
670                result.total_items_processed * len(result.evaluator_stats)
671            )
672            print(f"  Evaluation failure rate: {failure_rate:.1%}")
673        ```
674
675    Note:
676        All arguments must be passed as keywords when instantiating this class.
677    """
678
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations
732
733    def __str__(self) -> str:
734        """Return a formatted string representation of the batch evaluation results.
735
736        Returns:
737            A multi-line string with a summary of the evaluation results.
738        """
739        lines = []
740        lines.append("=" * 60)
741        lines.append("Batch Evaluation Results")
742        lines.append("=" * 60)
743
744        # Summary statistics
745        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
746        lines.append(f"Duration: {self.duration_seconds:.2f}s")
747        lines.append(f"\nItems fetched: {self.total_items_fetched}")
748        lines.append(f"Items processed: {self.total_items_processed}")
749
750        if self.total_items_failed > 0:
751            lines.append(f"Items failed: {self.total_items_failed}")
752
753        # Success rate
754        if self.total_items_fetched > 0:
755            success_rate = self.total_items_processed / self.total_items_fetched * 100
756            lines.append(f"Success rate: {success_rate:.1f}%")
757
758        # Scores created
759        lines.append(f"\nScores created: {self.total_scores_created}")
760        if self.total_composite_scores_created > 0:
761            lines.append(f"Composite scores: {self.total_composite_scores_created}")
762
763        total_scores = self.total_scores_created + self.total_composite_scores_created
764        lines.append(f"Total scores: {total_scores}")
765
766        # Evaluator statistics
767        if self.evaluator_stats:
768            lines.append("\nEvaluator Performance:")
769            for stats in self.evaluator_stats:
770                lines.append(f"  {stats.name}:")
771                if stats.total_runs > 0:
772                    success_rate = (
773                        stats.successful_runs / stats.total_runs * 100
774                        if stats.total_runs > 0
775                        else 0
776                    )
777                    lines.append(
778                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
779                        f"({success_rate:.1f}% success)"
780                    )
781                    lines.append(f"    Scores created: {stats.total_scores_created}")
782                    if stats.failed_runs > 0:
783                        lines.append(f"    Failed runs: {stats.failed_runs}")
784
785        # Performance metrics
786        if self.total_items_processed > 0 and self.duration_seconds > 0:
787            items_per_sec = self.total_items_processed / self.duration_seconds
788            lines.append("\nPerformance:")
789            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
790            if self.total_scores_created > 0:
791                avg_scores = self.total_scores_created / self.total_items_processed
792                lines.append(f"  Avg scores per item: {avg_scores:.2f}")
793
794        # Errors and warnings
795        if self.error_summary:
796            lines.append("\nErrors encountered:")
797            for error_type, count in self.error_summary.items():
798                lines.append(f"  {error_type}: {count}")
799
800        # Incomplete run information
801        if not self.completed:
802            lines.append("\nWarning: Evaluation incomplete")
803            if self.resume_token:
804                lines.append(
805                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
806                )
807                lines.append(f"  Items processed: {self.resume_token.items_processed}")
808                lines.append("  Use resume_from parameter to continue")
809
810        if self.has_more_items:
811            lines.append("\nNote: More items available beyond max_items limit")
812
813        lines.append("=" * 60)
814        return "\n".join(lines)

Complete result structure for batch evaluation execution.

This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.

Attributes:
  • total_items_fetched: Total number of items fetched from the API.
  • total_items_processed: Number of items successfully evaluated.
  • total_items_failed: Number of items that failed during evaluation.
  • total_scores_created: Total scores created by all item-level evaluators.
  • total_composite_scores_created: Scores created by the composite evaluator.
  • total_evaluations_failed: Number of individual evaluator failures across all items.
  • evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
  • resume_token: Token for resuming if evaluation was interrupted (None if completed).
  • completed: True if all items were processed, False if stopped early or failed.
  • duration_seconds: Total time taken to execute the batch evaluation.
  • failed_item_ids: List of IDs for items that failed evaluation.
  • error_summary: Dictionary mapping error types to occurrence counts.
  • has_more_items: True if max_items limit was reached but more items exist.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:

Basic result inspection:

result = client.run_batched_evaluation(...)

print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
print(f"Scores created: {result.total_scores_created}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")

Detailed analysis with evaluator stats:

result = client.run_batched_evaluation(...)

print(f"\n📊 Batch Evaluation Results")
print(f"{'='*50}")
print(f"Items processed: {result.total_items_processed}")
print(f"Items failed: {result.total_items_failed}")
print(f"Scores created: {result.total_scores_created}")

if result.total_composite_scores_created > 0:
    print(f"Composite scores: {result.total_composite_scores_created}")

print(f"\n📈 Evaluator Performance:")
for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
    print(f"\n  {stats.name}:")
    print(f"    Success rate: {success_rate:.1%}")
    print(f"    Scores created: {stats.total_scores_created}")
    if stats.failed_runs > 0:
        print(f"    ⚠️  Failures: {stats.failed_runs}")

if result.error_summary:
    print(f"\n⚠️  Errors encountered:")
    for error_type, count in result.error_summary.items():
        print(f"    {error_type}: {count}")

Handling incomplete runs:

result = client.run_batched_evaluation(...)

if not result.completed:
    print("⚠️  Evaluation incomplete!")

    if result.resume_token:
        print(f"Processed {result.resume_token.items_processed} items before failure")
        print(f"Use resume_from parameter to continue from:")
        print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
        print(f"  Last ID: {result.resume_token.last_processed_id}")

if result.has_more_items:
    print(f"ℹ️  More items available beyond max_items limit")

Performance monitoring:

result = client.run_batched_evaluation(...)

items_per_second = result.total_items_processed / result.duration_seconds
avg_scores_per_item = result.total_scores_created / result.total_items_processed

print(f"Performance metrics:")
print(f"  Throughput: {items_per_second:.2f} items/second")
print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
print(f"  Total duration: {result.duration_seconds:.2f}s")

if result.total_evaluations_failed > 0:
    failure_rate = result.total_evaluations_failed / (
        result.total_items_processed * len(result.evaluator_stats)
    )
    print(f"  Evaluation failure rate: {failure_rate:.1%}")
Note:

All arguments must be passed as keywords when instantiating this class.

BatchEvaluationResult( *, total_items_fetched: int, total_items_processed: int, total_items_failed: int, total_scores_created: int, total_composite_scores_created: int, total_evaluations_failed: int, evaluator_stats: List[EvaluatorStats], resume_token: Optional[BatchEvaluationResumeToken], completed: bool, duration_seconds: float, failed_item_ids: List[str], error_summary: Dict[str, int], has_more_items: bool, item_evaluations: Dict[str, List[Evaluation]])
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations

Initialize BatchEvaluationResult with comprehensive statistics.

Arguments:
  • total_items_fetched: Total items fetched from API.
  • total_items_processed: Items successfully evaluated.
  • total_items_failed: Items that failed evaluation.
  • total_scores_created: Scores from item-level evaluators.
  • total_composite_scores_created: Scores from composite evaluator.
  • total_evaluations_failed: Individual evaluator failures.
  • evaluator_stats: Per-evaluator statistics.
  • resume_token: Token for resuming (None if completed).
  • completed: Whether all items were processed.
  • duration_seconds: Total execution time.
  • failed_item_ids: IDs of failed items.
  • error_summary: Error types and counts.
  • has_more_items: Whether more items exist beyond max_items.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:

All arguments must be provided as keywords.

total_items_fetched
total_items_processed
total_items_failed
total_scores_created
total_composite_scores_created
total_evaluations_failed
evaluator_stats
resume_token
completed
duration_seconds
failed_item_ids
error_summary
has_more_items
item_evaluations
class RunnerContext:
1062class RunnerContext:
1063    """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults.
1064
1065    Intended for use with the ``langfuse/experiment-action`` GitHub Action
1066    (https://github.com/langfuse/experiment-action). The action builds a
1067    ``RunnerContext`` before invoking the user's ``experiment(context)``
1068    function. Defaults set here (dataset, metadata tags) are applied when
1069    the user omits them on the :meth:`run_experiment` call; users can
1070    override any default by passing the corresponding argument explicitly.
1071    """
1072
1073    def __init__(
1074        self,
1075        *,
1076        client: "Langfuse",
1077        data: Optional[ExperimentData] = None,
1078        dataset_version: Optional[datetime] = None,
1079        metadata: Optional[Dict[str, str]] = None,
1080    ):
1081        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.
1082
1083        Typically called by the ``langfuse/experiment-action`` GitHub Action,
1084        not by end users directly. Every field except ``client`` is optional:
1085        fields left as ``None`` simply mean the corresponding argument must be
1086        supplied on the :meth:`run_experiment` call.
1087
1088        Args:
1089            client: Initialized Langfuse SDK client used to execute the
1090                experiment. The action creates this from the
1091                ``langfuse_public_key`` / ``langfuse_secret_key`` /
1092                ``langfuse_base_url`` inputs.
1093            data: Default dataset items to run the experiment on. Accepts
1094                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
1095                Injected by the action when ``dataset_name`` is configured.
1096                If ``None``, the user must pass ``data=`` to
1097                :meth:`run_experiment`.
1098            dataset_version: Optional pinned dataset version. Injected by the
1099                action when ``dataset_version`` is configured.
1100            metadata: Default metadata attached to every experiment trace and
1101                the dataset run. The action injects GitHub-sourced tags (SHA,
1102                PR link, workflow run link, branch, GH user, etc.). Merged
1103                with any ``metadata`` passed to :meth:`run_experiment`, with
1104                user-supplied keys winning on collision.
1105        """
1106        self.client = client
1107        self.data = data
1108        self.dataset_version = dataset_version
1109        self.metadata = metadata
1110
1111    def run_experiment(
1112        self,
1113        *,
1114        name: str,
1115        run_name: Optional[str] = None,
1116        description: Optional[str] = None,
1117        data: Optional[ExperimentData] = None,
1118        task: TaskFunction,
1119        evaluators: List[EvaluatorFunction] = [],
1120        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
1121        run_evaluators: List[RunEvaluatorFunction] = [],
1122        max_concurrency: int = 50,
1123        metadata: Optional[Dict[str, str]] = None,
1124        _dataset_version: Optional[datetime] = None,
1125    ) -> ExperimentResult:
1126        resolved_data = data if data is not None else self.data
1127        if resolved_data is None:
1128            raise ValueError(
1129                "`data` must be provided either on the RunnerContext or the run_experiment call"
1130            )
1131
1132        resolved_dataset_version = (
1133            _dataset_version if _dataset_version is not None else self.dataset_version
1134        )
1135
1136        merged_metadata: Optional[Dict[str, str]]
1137        if self.metadata is None and metadata is None:
1138            merged_metadata = None
1139        else:
1140            merged_metadata = {**(self.metadata or {}), **(metadata or {})}
1141
1142        return self.client.run_experiment(
1143            name=name,
1144            run_name=run_name,
1145            description=description,
1146            data=resolved_data,
1147            task=task,
1148            evaluators=evaluators,
1149            composite_evaluator=composite_evaluator,
1150            run_evaluators=run_evaluators,
1151            max_concurrency=max_concurrency,
1152            metadata=merged_metadata,
1153            _dataset_version=resolved_dataset_version,
1154        )

Wraps Langfuse.run_experiment() with CI-injected defaults.

Intended for use with the langfuse/experiment-action GitHub Action (https://github.com/langfuse/experiment-action). The action builds a RunnerContext before invoking the user's experiment(context) function. Defaults set here (dataset items, dataset version, metadata tags) are applied when the user omits them on the run_experiment() call. Users can override the data and dataset version by passing the corresponding argument explicitly; metadata passed explicitly is merged with the default metadata, with user-supplied keys winning on collision.

RunnerContext( *, client: Langfuse, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem], NoneType] = None, dataset_version: Optional[datetime.datetime] = None, metadata: Optional[Dict[str, str]] = None)
1073    def __init__(
1074        self,
1075        *,
1076        client: "Langfuse",
1077        data: Optional[ExperimentData] = None,
1078        dataset_version: Optional[datetime] = None,
1079        metadata: Optional[Dict[str, str]] = None,
1080    ):
1081        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.
1082
1083        Typically called by the ``langfuse/experiment-action`` GitHub Action,
1084        not by end users directly. Every field except ``client`` is optional:
1085        fields left as ``None`` simply mean the corresponding argument must be
1086        supplied on the :meth:`run_experiment` call.
1087
1088        Args:
1089            client: Initialized Langfuse SDK client used to execute the
1090                experiment. The action creates this from the
1091                ``langfuse_public_key`` / ``langfuse_secret_key`` /
1092                ``langfuse_base_url`` inputs.
1093            data: Default dataset items to run the experiment on. Accepts
1094                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
1095                Injected by the action when ``dataset_name`` is configured.
1096                If ``None``, the user must pass ``data=`` to
1097                :meth:`run_experiment`.
1098            dataset_version: Optional pinned dataset version. Injected by the
1099                action when ``dataset_version`` is configured.
1100            metadata: Default metadata attached to every experiment trace and
1101                the dataset run. The action injects GitHub-sourced tags (SHA,
1102                PR link, workflow run link, branch, GH user, etc.). Merged
1103                with any ``metadata`` passed to :meth:`run_experiment`, with
1104                user-supplied keys winning on collision.
1105        """
1106        self.client = client
1107        self.data = data
1108        self.dataset_version = dataset_version
1109        self.metadata = metadata

Build a RunnerContext populated with defaults for run_experiment.

Typically called by the langfuse/experiment-action GitHub Action, not by end users directly. Every field except client is optional. If data is left as None, it must be supplied on the run_experiment() call (a ValueError is raised otherwise); dataset_version and metadata left as None simply provide no default.

Arguments:
  • client: Initialized Langfuse SDK client used to execute the experiment. The action creates this from the langfuse_public_key / langfuse_secret_key / langfuse_base_url inputs.
  • data: Default dataset items to run the experiment on. Accepts either List[LocalExperimentItem] or List[DatasetItem]. Injected by the action when dataset_name is configured. If None, the user must pass data= to run_experiment().
  • dataset_version: Optional pinned dataset version. Injected by the action when dataset_version is configured.
  • metadata: Default metadata attached to every experiment trace and the dataset run. The action injects GitHub-sourced tags (SHA, PR link, workflow run link, branch, GH user, etc.). Merged with any metadata passed to run_experiment(), with user-supplied keys winning on collision.
client
data
dataset_version
metadata
def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse.api.DatasetItem], NoneType] = None, task: langfuse.experiment.TaskFunction, evaluators: List[langfuse.experiment.EvaluatorFunction] = [], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, run_evaluators: List[langfuse.experiment.RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None, _dataset_version: Optional[datetime.datetime] = None) -> langfuse.experiment.ExperimentResult:
1111    def run_experiment(
1112        self,
1113        *,
1114        name: str,
1115        run_name: Optional[str] = None,
1116        description: Optional[str] = None,
1117        data: Optional[ExperimentData] = None,
1118        task: TaskFunction,
1119        evaluators: List[EvaluatorFunction] = [],
1120        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
1121        run_evaluators: List[RunEvaluatorFunction] = [],
1122        max_concurrency: int = 50,
1123        metadata: Optional[Dict[str, str]] = None,
1124        _dataset_version: Optional[datetime] = None,
1125    ) -> ExperimentResult:
1126        resolved_data = data if data is not None else self.data
1127        if resolved_data is None:
1128            raise ValueError(
1129                "`data` must be provided either on the RunnerContext or the run_experiment call"
1130            )
1131
1132        resolved_dataset_version = (
1133            _dataset_version if _dataset_version is not None else self.dataset_version
1134        )
1135
1136        merged_metadata: Optional[Dict[str, str]]
1137        if self.metadata is None and metadata is None:
1138            merged_metadata = None
1139        else:
1140            merged_metadata = {**(self.metadata or {}), **(metadata or {})}
1141
1142        return self.client.run_experiment(
1143            name=name,
1144            run_name=run_name,
1145            description=description,
1146            data=resolved_data,
1147            task=task,
1148            evaluators=evaluators,
1149            composite_evaluator=composite_evaluator,
1150            run_evaluators=run_evaluators,
1151            max_concurrency=max_concurrency,
1152            metadata=merged_metadata,
1153            _dataset_version=resolved_dataset_version,
1154        )
class RegressionError(builtins.Exception):
class RegressionError(Exception):
    """Raised by a user's ``experiment`` function to signal a CI gate failure.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action catches this
    exception and, when ``should_fail_on_error`` is enabled, fails the
    workflow run and renders a callout in the PR comment using
    ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.

    Callers choose one of three forms:

    - ``RegressionError(result=r)`` — minimal, generic message.
    - ``RegressionError(result=r, message="...")`` — free-form message.
    - ``RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)`` —
      structured; ``metric`` and ``value`` must be provided together so the
      action can render a targeted callout without ``None`` placeholders.

    The constructor arguments are stored as the ``result``, ``metric``,
    ``value`` and ``threshold`` attributes; the formatted message is
    available through ``str(exc)``.
    """

    @overload
    def __init__(self, *, result: ExperimentResult) -> None: ...
    @overload
    def __init__(self, *, result: ExperimentResult, message: str) -> None: ...
    @overload
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: str,
        value: float,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ) -> None: ...
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: Optional[str] = None,
        value: Optional[float] = None,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ) -> None:
        """Store the regression details and build the exception message.

        Message precedence: an explicit ``message`` wins; otherwise a
        structured message is built when both ``metric`` and ``value`` are
        given; otherwise a generic message is used.

        Args:
            result: The experiment result the regression was detected on.
            metric: Name of the regressed metric.
            value: Observed value of ``metric``.
            threshold: Threshold the value was compared against, if any.
            message: Free-form message overriding the generated one.
        """
        self.result = result
        self.metric = metric
        self.value = value
        self.threshold = threshold
        if message is not None:
            formatted = message
        elif metric is not None and value is not None:
            formatted = f"Regression on `{metric}`: {value}"
            # ``threshold`` is optional in the structured form; leave the
            # clause out rather than rendering a literal "(threshold None)".
            if threshold is not None:
                formatted += f" (threshold {threshold})"
        else:
            formatted = "Experiment regression detected"
        super().__init__(formatted)

Raised by a user's experiment function to signal a CI gate failure.

Intended for use with the langfuse/experiment-action GitHub Action (https://github.com/langfuse/experiment-action). The action catches this exception and, when should_fail_on_error is enabled, fails the workflow run and renders a callout in the PR comment using metric/value/threshold if supplied, otherwise str(exc).

Callers choose one of three forms:

  • RegressionError(result=r) — minimal, generic message.
  • RegressionError(result=r, message="...") — free-form message.
  • RegressionError(result=r, metric="acc", value=0.7, threshold=0.9) — structured; metric and value must be provided together so the action can render a targeted callout without None placeholders.
RegressionError( *, result: langfuse.experiment.ExperimentResult, metric: Optional[str] = None, value: Optional[float] = None, threshold: Optional[float] = None, message: Optional[str] = None)
1189    def __init__(
1190        self,
1191        *,
1192        result: ExperimentResult,
1193        metric: Optional[str] = None,
1194        value: Optional[float] = None,
1195        threshold: Optional[float] = None,
1196        message: Optional[str] = None,
1197    ):
1198        self.result = result
1199        self.metric = metric
1200        self.value = value
1201        self.threshold = threshold
1202        if message is not None:
1203            formatted = message
1204        elif metric is not None and value is not None:
1205            formatted = f"Regression on `{metric}`: {value} (threshold {threshold})"
1206        else:
1207            formatted = "Experiment regression detected"
1208        super().__init__(formatted)
result
metric
value
threshold
# Package version string, re-exported as ``langfuse.__version__``.
__version__ = '4.6.1'
def is_default_export_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
 98def is_default_export_span(span: ReadableSpan) -> bool:
 99    """Return whether a span should be exported by default."""
100    return (
101        is_langfuse_span(span) or is_genai_span(span) or is_known_llm_instrumentor(span)
102    )

Return whether a span should be exported by default.

def is_langfuse_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
def is_langfuse_span(span: ReadableSpan) -> bool:
    """Return whether the span was created by the Langfuse SDK tracer.

    The check compares the span's instrumentation scope name with
    ``LANGFUSE_TRACER_NAME``; a span without a scope never matches.
    """
    if span.instrumentation_scope is None:
        return False
    return span.instrumentation_scope.name == LANGFUSE_TRACER_NAME

Return whether the span was created by the Langfuse SDK tracer.

def is_genai_span(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
def is_genai_span(span: ReadableSpan) -> bool:
    """Return whether the span has any ``gen_ai.*`` semantic convention attribute.

    Only string attribute keys are considered, and a key matches when it
    starts with ``"gen_ai"``. A span whose ``attributes`` is ``None`` never
    matches.
    """
    attributes = span.attributes
    if attributes is None:
        return False

    for attribute_key in attributes.keys():
        if isinstance(attribute_key, str) and attribute_key.startswith("gen_ai"):
            return True
    return False

Return whether the span has any gen_ai.* semantic convention attribute.

def is_known_llm_instrumentor(span: opentelemetry.sdk.trace.ReadableSpan) -> bool:
def is_known_llm_instrumentor(span: ReadableSpan) -> bool:
    """Return whether the span comes from a known LLM instrumentation scope.

    The span's instrumentation scope name is tested against every entry of
    ``KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES`` via ``_matches_scope_prefix``;
    a span without a scope never matches.
    """
    scope = span.instrumentation_scope
    if scope is None:
        return False

    scope_name = scope.name
    for known_prefix in KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES:
        if _matches_scope_prefix(scope_name, known_prefix):
            return True
    return False

Return whether the span comes from a known LLM instrumentation scope.

# Instrumentation scope name prefixes consulted by ``is_known_llm_instrumentor``.
# Entries are listed alphabetically; order carries no meaning in a frozenset.
KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES = frozenset(
    (
        "agent_framework",
        "ai",
        "autogen-core",
        "haystack",
        "langfuse-sdk",
        "langsmith",
        "litellm",
        "openinference",
        "opentelemetry.instrumentation.agno",
        "opentelemetry.instrumentation.alephalpha",
        "opentelemetry.instrumentation.anthropic",
        "opentelemetry.instrumentation.bedrock",
        "opentelemetry.instrumentation.cohere",
        "opentelemetry.instrumentation.crewai",
        "opentelemetry.instrumentation.google_generativeai",
        "opentelemetry.instrumentation.groq",
        "opentelemetry.instrumentation.haystack",
        "opentelemetry.instrumentation.langchain",
        "opentelemetry.instrumentation.llamaindex",
        "opentelemetry.instrumentation.mistralai",
        "opentelemetry.instrumentation.ollama",
        "opentelemetry.instrumentation.openai",
        "opentelemetry.instrumentation.openai_agents",
        "opentelemetry.instrumentation.openai_v2",
        "opentelemetry.instrumentation.replicate",
        "opentelemetry.instrumentation.sagemaker",
        "opentelemetry.instrumentation.together",
        "opentelemetry.instrumentation.transformers",
        "opentelemetry.instrumentation.vertexai",
        "opentelemetry.instrumentation.voyageai",
        "opentelemetry.instrumentation.watsonx",
        "opentelemetry.instrumentation.writer",
        "pydantic-ai",
        "strands-agents",
        "vllm",
    )
)