langfuse

Langfuse Python SDK

Installation

Important: The SDK was rewritten in v3 and released in June 2025. Refer to the v3 migration guide for instructions on updating your code.

pip install langfuse

Docs

Please see our docs for detailed information on this SDK.
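As a quick orientation, here is a minimal tracing sketch (assuming the v3 client and that your API keys are available, either as constructor arguments or via the `LANGFUSE_PUBLIC_KEY` / `LANGFUSE_SECRET_KEY` environment variables):

```python
from langfuse import Langfuse

langfuse = Langfuse()  # reads LANGFUSE_* environment variables if no arguments are given

# Trace a unit of work
with langfuse.start_as_current_span(name="quickstart") as span:
    span.update(output="done")

langfuse.flush()  # send any queued spans before the process exits
```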

 1""".. include:: ../README.md"""
 2
 3from langfuse.batch_evaluation import (
 4    BatchEvaluationResult,
 5    BatchEvaluationResumeToken,
 6    CompositeEvaluatorFunction,
 7    EvaluatorInputs,
 8    EvaluatorStats,
 9    MapperFunction,
10)
11from langfuse.experiment import Evaluation
12
13from ._client import client as _client_module
14from ._client.attributes import LangfuseOtelSpanAttributes
15from ._client.constants import ObservationTypeLiteral
16from ._client.get_client import get_client
17from ._client.observe import observe
18from ._client.propagation import propagate_attributes
19from ._client.span import (
20    LangfuseAgent,
21    LangfuseChain,
22    LangfuseEmbedding,
23    LangfuseEvaluator,
24    LangfuseEvent,
25    LangfuseGeneration,
26    LangfuseGuardrail,
27    LangfuseRetriever,
28    LangfuseSpan,
29    LangfuseTool,
30)
31
32Langfuse = _client_module.Langfuse
33
34__all__ = [
35    "Langfuse",
36    "get_client",
37    "observe",
38    "propagate_attributes",
39    "ObservationTypeLiteral",
40    "LangfuseSpan",
41    "LangfuseGeneration",
42    "LangfuseEvent",
43    "LangfuseOtelSpanAttributes",
44    "LangfuseAgent",
45    "LangfuseTool",
46    "LangfuseChain",
47    "LangfuseEmbedding",
48    "LangfuseEvaluator",
49    "LangfuseRetriever",
50    "LangfuseGuardrail",
51    "Evaluation",
52    "EvaluatorInputs",
53    "MapperFunction",
54    "CompositeEvaluatorFunction",
55    "EvaluatorStats",
56    "BatchEvaluationResumeToken",
57    "BatchEvaluationResult",
58    "experiment",
59    "api",
60]
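For illustration, a minimal sketch of the package-level exports listed above in use (assuming `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are set in the environment):

```python
from langfuse import get_client, observe

@observe()  # wraps each call of this function in a Langfuse span
def handle_request(query: str) -> str:
    return f"processed: {query}"

handle_request("hello")

# get_client() returns the singleton client configured from the environment
get_client().flush()  # flush queued observations before exiting
```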
class Langfuse:
 128class Langfuse:
 129    """Main client for Langfuse tracing and platform features.
 130
 131    This class provides an interface for creating and managing traces, spans,
 132    and generations in Langfuse as well as interacting with the Langfuse API.
 133
 134    The client features a thread-safe singleton pattern for each unique public API key,
 135    ensuring consistent trace context propagation across your application. It implements
 136    efficient batching of spans with configurable flush settings and includes background
 137    thread management for media uploads and score ingestion.
 138
 139    Configuration is flexible through either direct parameters or environment variables,
 140    with graceful fallbacks and runtime configuration updates.
 141
 142    Attributes:
 143        api: Synchronous API client for Langfuse backend communication
 144        async_api: Asynchronous API client for Langfuse backend communication
 145        _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
 146
 147    Parameters:
 148        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
 149        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
 150        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
 151        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
 152        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
 153        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
 154        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
 155        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
 156        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
 157        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
 158        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
 159        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
 160        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
 161        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
 162        mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
 163        blocked_instrumentation_scopes (Optional[List[str]]): List of instrumentation scope names to block from being exported to Langfuse. Spans from these scopes will be filtered out before being sent to the API. Useful for filtering out spans from specific libraries or frameworks. For exported spans, you can see the instrumentation scope name in the span metadata in Langfuse (`metadata.scope.name`).
 164        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well.
 165        tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. Setting a separate provider is useful for keeping Langfuse tracing disconnected from other OpenTelemetry-span-emitting libraries. Note: to track active spans, the context is still shared between TracerProviders, which may lead to broken trace trees.
 166
 167    Example:
 168        ```python
 169        from langfuse import Langfuse
 170
 171        # Initialize the client (reads from env vars if not provided)
 172        langfuse = Langfuse(
 173            public_key="your-public-key",
 174            secret_key="your-secret-key",
 175            base_url="https://cloud.langfuse.com",  # Optional, default shown
 176        )
 177
 178        # Create a trace span
 179        with langfuse.start_as_current_span(name="process-query") as span:
 180            # Your application code here
 181
 182            # Create a nested generation span for an LLM call
 183            with span.start_as_current_generation(
 184                name="generate-response",
 185                model="gpt-4",
 186                input={"query": "Tell me about AI"},
 187                model_parameters={"temperature": 0.7, "max_tokens": 500}
 188            ) as generation:
 189                # Generate response here
 190                response = "AI is a field of computer science..."
 191
 192                generation.update(
 193                    output=response,
 194                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
 195                    cost_details={"total_cost": 0.0023}
 196                )
 197
 198                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
 199                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
 200        ```
 201    """
 202
 203    _resources: Optional[LangfuseResourceManager] = None
 204    _mask: Optional[MaskFunction] = None
 205    _otel_tracer: otel_trace_api.Tracer
 206
 207    def __init__(
 208        self,
 209        *,
 210        public_key: Optional[str] = None,
 211        secret_key: Optional[str] = None,
 212        base_url: Optional[str] = None,
 213        host: Optional[str] = None,
 214        timeout: Optional[int] = None,
 215        httpx_client: Optional[httpx.Client] = None,
 216        debug: bool = False,
 217        tracing_enabled: Optional[bool] = True,
 218        flush_at: Optional[int] = None,
 219        flush_interval: Optional[float] = None,
 220        environment: Optional[str] = None,
 221        release: Optional[str] = None,
 222        media_upload_thread_count: Optional[int] = None,
 223        sample_rate: Optional[float] = None,
 224        mask: Optional[MaskFunction] = None,
 225        blocked_instrumentation_scopes: Optional[List[str]] = None,
 226        additional_headers: Optional[Dict[str, str]] = None,
 227        tracer_provider: Optional[TracerProvider] = None,
 228    ):
 229        self._base_url = (
 230            base_url
 231            or os.environ.get(LANGFUSE_BASE_URL)
 232            or host
 233            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
 234        )
 235        self._environment = environment or cast(
 236            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
 237        )
 238        self._project_id: Optional[str] = None
 239        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
 240        if not 0.0 <= sample_rate <= 1.0:
 241            raise ValueError(
 242                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
 243            )
 244
 245        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
 246
 247        self._tracing_enabled = (
 248            tracing_enabled
 249            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
 250        )
 251        if not self._tracing_enabled:
 252            langfuse_logger.info(
 253                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
 254            )
 255
 256        debug = (
 257            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
 258        )
 259        if debug:
 260            logging.basicConfig(
 261                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 262            )
 263            langfuse_logger.setLevel(logging.DEBUG)
 264
 265        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
 266        if public_key is None:
 267            langfuse_logger.warning(
 268                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
 269                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
 270            )
 271            self._otel_tracer = otel_trace_api.NoOpTracer()
 272            return
 273
 274        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
 275        if secret_key is None:
 276            langfuse_logger.warning(
 277                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
 278                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
 279            )
 280            self._otel_tracer = otel_trace_api.NoOpTracer()
 281            return
 282
 283        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
 284            langfuse_logger.warning(
 285                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
 286            )
 287
 288        # Initialize api and tracer if requirements are met
 289        self._resources = LangfuseResourceManager(
 290            public_key=public_key,
 291            secret_key=secret_key,
 292            base_url=self._base_url,
 293            timeout=timeout,
 294            environment=self._environment,
 295            release=release,
 296            flush_at=flush_at,
 297            flush_interval=flush_interval,
 298            httpx_client=httpx_client,
 299            media_upload_thread_count=media_upload_thread_count,
 300            sample_rate=sample_rate,
 301            mask=mask,
 302            tracing_enabled=self._tracing_enabled,
 303            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
 304            additional_headers=additional_headers,
 305            tracer_provider=tracer_provider,
 306        )
 307        self._mask = self._resources.mask
 308
 309        self._otel_tracer = (
 310            self._resources.tracer
 311            if self._tracing_enabled and self._resources.tracer is not None
 312            else otel_trace_api.NoOpTracer()
 313        )
 314        self.api = self._resources.api
 315        self.async_api = self._resources.async_api
 316
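Since every constructor argument above falls back to an environment variable, the client can also be configured entirely through the environment. A hedged sketch using the variables named in the parameter docs (all values are placeholders):

```python
import os

# Equivalent to passing public_key, secret_key, base_url and sample_rate to the constructor
os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-lf-..."
os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-..."
os.environ["LANGFUSE_BASE_URL"] = "https://cloud.langfuse.com"
os.environ["LANGFUSE_SAMPLE_RATE"] = "0.25"  # sample 25% of traces

from langfuse import Langfuse

client = Langfuse()  # picks up the variables set above
```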
 317    def start_span(
 318        self,
 319        *,
 320        trace_context: Optional[TraceContext] = None,
 321        name: str,
 322        input: Optional[Any] = None,
 323        output: Optional[Any] = None,
 324        metadata: Optional[Any] = None,
 325        version: Optional[str] = None,
 326        level: Optional[SpanLevel] = None,
 327        status_message: Optional[str] = None,
 328    ) -> LangfuseSpan:
 329        """Create a new span for tracing a unit of work.
 330
 331        This method creates a new span but does not set it as the current span in the
 332        context. To create and use a span within a context, use start_as_current_span().
 333
 334        The created span will be the child of the current span in the context.
 335
 336        Args:
 337            trace_context: Optional context for connecting to an existing trace
 338            name: Name of the span (e.g., function or operation name)
 339            input: Input data for the operation (can be any JSON-serializable object)
 340            output: Output data from the operation (can be any JSON-serializable object)
 341            metadata: Additional metadata to associate with the span
 342            version: Version identifier for the code or component
 343            level: Importance level of the span (info, warning, error)
 344            status_message: Optional status message for the span
 345
 346        Returns:
 347            A LangfuseSpan object that must be ended with .end() when the operation completes
 348
 349        Example:
 350            ```python
 351            span = langfuse.start_span(name="process-data")
 352            try:
 353                # Do work
 354                span.update(output="result")
 355            finally:
 356                span.end()
 357            ```
 358        """
 359        return self.start_observation(
 360            trace_context=trace_context,
 361            name=name,
 362            as_type="span",
 363            input=input,
 364            output=output,
 365            metadata=metadata,
 366            version=version,
 367            level=level,
 368            status_message=status_message,
 369        )
 370
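For illustration, a sketch of attaching a span to an existing trace via `trace_context`. The `trace_id` / `parent_span_id` keys match what the implementation reads; the id values are assumed to have been propagated from elsewhere, and `do_work` is a hypothetical helper:

```python
span = langfuse.start_span(
    name="downstream-work",
    trace_context={
        "trace_id": incoming_trace_id,        # e.g. received from an upstream service
        "parent_span_id": incoming_span_id,   # optional
    },
)
try:
    result = do_work()  # hypothetical unit of work
    span.update(output=result)
finally:
    span.end()
```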
 371    def start_as_current_span(
 372        self,
 373        *,
 374        trace_context: Optional[TraceContext] = None,
 375        name: str,
 376        input: Optional[Any] = None,
 377        output: Optional[Any] = None,
 378        metadata: Optional[Any] = None,
 379        version: Optional[str] = None,
 380        level: Optional[SpanLevel] = None,
 381        status_message: Optional[str] = None,
 382        end_on_exit: Optional[bool] = None,
 383    ) -> _AgnosticContextManager[LangfuseSpan]:
 384        """Create a new span and set it as the current span in a context manager.
 385
 386        This method creates a new span and sets it as the current span within a context
 387        manager. Use this method with a 'with' statement to automatically handle span
 388        lifecycle within a code block.
 389
 390        The created span will be the child of the current span in the context.
 391
 392        Args:
 393            trace_context: Optional context for connecting to an existing trace
 394            name: Name of the span (e.g., function or operation name)
 395            input: Input data for the operation (can be any JSON-serializable object)
 396            output: Output data from the operation (can be any JSON-serializable object)
 397            metadata: Additional metadata to associate with the span
 398            version: Version identifier for the code or component
 399            level: Importance level of the span (info, warning, error)
 400            status_message: Optional status message for the span
 401            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 402
 403        Returns:
 404            A context manager that yields a LangfuseSpan
 405
 406        Example:
 407            ```python
 408            with langfuse.start_as_current_span(name="process-query") as span:
 409                # Do work
 410                result = process_data()
 411                span.update(output=result)
 412
 413                # Create a child span automatically
 414                with span.start_as_current_span(name="sub-operation") as child_span:
 415                    # Do sub-operation work
 416                    child_span.update(output="sub-result")
 417            ```
 418        """
 419        return self.start_as_current_observation(
 420            trace_context=trace_context,
 421            name=name,
 422            as_type="span",
 423            input=input,
 424            output=output,
 425            metadata=metadata,
 426            version=version,
 427            level=level,
 428            status_message=status_message,
 429            end_on_exit=end_on_exit,
 430        )
 431
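A sketch of the `end_on_exit=False` case documented above, where the span outlives the context manager and must be ended manually:

```python
with langfuse.start_as_current_span(name="async-job", end_on_exit=False) as span:
    pass  # work is scheduled here and finishes later

# ...later, when the deferred work completes:
span.update(output="finished")
span.end()  # required, because end_on_exit=False left the span open
```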
 432    @overload
 433    def start_observation(
 434        self,
 435        *,
 436        trace_context: Optional[TraceContext] = None,
 437        name: str,
 438        as_type: Literal["generation"],
 439        input: Optional[Any] = None,
 440        output: Optional[Any] = None,
 441        metadata: Optional[Any] = None,
 442        version: Optional[str] = None,
 443        level: Optional[SpanLevel] = None,
 444        status_message: Optional[str] = None,
 445        completion_start_time: Optional[datetime] = None,
 446        model: Optional[str] = None,
 447        model_parameters: Optional[Dict[str, MapValue]] = None,
 448        usage_details: Optional[Dict[str, int]] = None,
 449        cost_details: Optional[Dict[str, float]] = None,
 450        prompt: Optional[PromptClient] = None,
 451    ) -> LangfuseGeneration: ...
 452
 453    @overload
 454    def start_observation(
 455        self,
 456        *,
 457        trace_context: Optional[TraceContext] = None,
 458        name: str,
 459        as_type: Literal["span"] = "span",
 460        input: Optional[Any] = None,
 461        output: Optional[Any] = None,
 462        metadata: Optional[Any] = None,
 463        version: Optional[str] = None,
 464        level: Optional[SpanLevel] = None,
 465        status_message: Optional[str] = None,
 466    ) -> LangfuseSpan: ...
 467
 468    @overload
 469    def start_observation(
 470        self,
 471        *,
 472        trace_context: Optional[TraceContext] = None,
 473        name: str,
 474        as_type: Literal["agent"],
 475        input: Optional[Any] = None,
 476        output: Optional[Any] = None,
 477        metadata: Optional[Any] = None,
 478        version: Optional[str] = None,
 479        level: Optional[SpanLevel] = None,
 480        status_message: Optional[str] = None,
 481    ) -> LangfuseAgent: ...
 482
 483    @overload
 484    def start_observation(
 485        self,
 486        *,
 487        trace_context: Optional[TraceContext] = None,
 488        name: str,
 489        as_type: Literal["tool"],
 490        input: Optional[Any] = None,
 491        output: Optional[Any] = None,
 492        metadata: Optional[Any] = None,
 493        version: Optional[str] = None,
 494        level: Optional[SpanLevel] = None,
 495        status_message: Optional[str] = None,
 496    ) -> LangfuseTool: ...
 497
 498    @overload
 499    def start_observation(
 500        self,
 501        *,
 502        trace_context: Optional[TraceContext] = None,
 503        name: str,
 504        as_type: Literal["chain"],
 505        input: Optional[Any] = None,
 506        output: Optional[Any] = None,
 507        metadata: Optional[Any] = None,
 508        version: Optional[str] = None,
 509        level: Optional[SpanLevel] = None,
 510        status_message: Optional[str] = None,
 511    ) -> LangfuseChain: ...
 512
 513    @overload
 514    def start_observation(
 515        self,
 516        *,
 517        trace_context: Optional[TraceContext] = None,
 518        name: str,
 519        as_type: Literal["retriever"],
 520        input: Optional[Any] = None,
 521        output: Optional[Any] = None,
 522        metadata: Optional[Any] = None,
 523        version: Optional[str] = None,
 524        level: Optional[SpanLevel] = None,
 525        status_message: Optional[str] = None,
 526    ) -> LangfuseRetriever: ...
 527
 528    @overload
 529    def start_observation(
 530        self,
 531        *,
 532        trace_context: Optional[TraceContext] = None,
 533        name: str,
 534        as_type: Literal["evaluator"],
 535        input: Optional[Any] = None,
 536        output: Optional[Any] = None,
 537        metadata: Optional[Any] = None,
 538        version: Optional[str] = None,
 539        level: Optional[SpanLevel] = None,
 540        status_message: Optional[str] = None,
 541    ) -> LangfuseEvaluator: ...
 542
 543    @overload
 544    def start_observation(
 545        self,
 546        *,
 547        trace_context: Optional[TraceContext] = None,
 548        name: str,
 549        as_type: Literal["embedding"],
 550        input: Optional[Any] = None,
 551        output: Optional[Any] = None,
 552        metadata: Optional[Any] = None,
 553        version: Optional[str] = None,
 554        level: Optional[SpanLevel] = None,
 555        status_message: Optional[str] = None,
 556        completion_start_time: Optional[datetime] = None,
 557        model: Optional[str] = None,
 558        model_parameters: Optional[Dict[str, MapValue]] = None,
 559        usage_details: Optional[Dict[str, int]] = None,
 560        cost_details: Optional[Dict[str, float]] = None,
 561        prompt: Optional[PromptClient] = None,
 562    ) -> LangfuseEmbedding: ...
 563
 564    @overload
 565    def start_observation(
 566        self,
 567        *,
 568        trace_context: Optional[TraceContext] = None,
 569        name: str,
 570        as_type: Literal["guardrail"],
 571        input: Optional[Any] = None,
 572        output: Optional[Any] = None,
 573        metadata: Optional[Any] = None,
 574        version: Optional[str] = None,
 575        level: Optional[SpanLevel] = None,
 576        status_message: Optional[str] = None,
 577    ) -> LangfuseGuardrail: ...
 578
 579    def start_observation(
 580        self,
 581        *,
 582        trace_context: Optional[TraceContext] = None,
 583        name: str,
 584        as_type: ObservationTypeLiteralNoEvent = "span",
 585        input: Optional[Any] = None,
 586        output: Optional[Any] = None,
 587        metadata: Optional[Any] = None,
 588        version: Optional[str] = None,
 589        level: Optional[SpanLevel] = None,
 590        status_message: Optional[str] = None,
 591        completion_start_time: Optional[datetime] = None,
 592        model: Optional[str] = None,
 593        model_parameters: Optional[Dict[str, MapValue]] = None,
 594        usage_details: Optional[Dict[str, int]] = None,
 595        cost_details: Optional[Dict[str, float]] = None,
 596        prompt: Optional[PromptClient] = None,
 597    ) -> Union[
 598        LangfuseSpan,
 599        LangfuseGeneration,
 600        LangfuseAgent,
 601        LangfuseTool,
 602        LangfuseChain,
 603        LangfuseRetriever,
 604        LangfuseEvaluator,
 605        LangfuseEmbedding,
 606        LangfuseGuardrail,
 607    ]:
 608        """Create a new observation of the specified type.
 609
 610        This method creates a new observation but does not set it as the current span in the
 611        context. To create and use an observation within a context, use start_as_current_observation().
 612
 613        Args:
 614            trace_context: Optional context for connecting to an existing trace
 615            name: Name of the observation
 616            as_type: Type of observation to create (defaults to "span")
 617            input: Input data for the operation
 618            output: Output data from the operation
 619            metadata: Additional metadata to associate with the observation
 620            version: Version identifier for the code or component
 621            level: Importance level of the observation
 622            status_message: Optional status message for the observation
 623            completion_start_time: When the model started generating (for generation types)
 624            model: Name/identifier of the AI model used (for generation types)
 625            model_parameters: Parameters used for the model (for generation types)
 626            usage_details: Token usage information (for generation types)
 627            cost_details: Cost information (for generation types)
 628            prompt: Associated prompt template (for generation types)
 629
 630        Returns:
 631            An observation object of the appropriate type that must be ended with .end()
 632        """
 633        if trace_context:
 634            trace_id = trace_context.get("trace_id", None)
 635            parent_span_id = trace_context.get("parent_span_id", None)
 636
 637            if trace_id:
 638                remote_parent_span = self._create_remote_parent_span(
 639                    trace_id=trace_id, parent_span_id=parent_span_id
 640                )
 641
 642                with otel_trace_api.use_span(
 643                    cast(otel_trace_api.Span, remote_parent_span)
 644                ):
 645                    otel_span = self._otel_tracer.start_span(name=name)
 646                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
 647
 648                    return self._create_observation_from_otel_span(
 649                        otel_span=otel_span,
 650                        as_type=as_type,
 651                        input=input,
 652                        output=output,
 653                        metadata=metadata,
 654                        version=version,
 655                        level=level,
 656                        status_message=status_message,
 657                        completion_start_time=completion_start_time,
 658                        model=model,
 659                        model_parameters=model_parameters,
 660                        usage_details=usage_details,
 661                        cost_details=cost_details,
 662                        prompt=prompt,
 663                    )
 664
 665        otel_span = self._otel_tracer.start_span(name=name)
 666
 667        return self._create_observation_from_otel_span(
 668            otel_span=otel_span,
 669            as_type=as_type,
 670            input=input,
 671            output=output,
 672            metadata=metadata,
 673            version=version,
 674            level=level,
 675            status_message=status_message,
 676            completion_start_time=completion_start_time,
 677            model=model,
 678            model_parameters=model_parameters,
 679            usage_details=usage_details,
 680            cost_details=cost_details,
 681            prompt=prompt,
 682        )
 683
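For illustration, a sketch of creating typed observations without a context manager, using `as_type` values from the overloads above (each object must be ended explicitly):

```python
retrieval = langfuse.start_observation(
    name="fetch-docs",
    as_type="retriever",
    input={"query": "Tell me about AI"},
)
retrieval.update(output=["doc-1", "doc-2"])
retrieval.end()

generation = langfuse.start_observation(name="answer", as_type="generation", model="gpt-4")
generation.update(
    output="AI is a field of computer science...",
    usage_details={"prompt_tokens": 12, "completion_tokens": 40},
)
generation.end()
```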
 684    def _create_observation_from_otel_span(
 685        self,
 686        *,
 687        otel_span: otel_trace_api.Span,
 688        as_type: ObservationTypeLiteralNoEvent,
 689        input: Optional[Any] = None,
 690        output: Optional[Any] = None,
 691        metadata: Optional[Any] = None,
 692        version: Optional[str] = None,
 693        level: Optional[SpanLevel] = None,
 694        status_message: Optional[str] = None,
 695        completion_start_time: Optional[datetime] = None,
 696        model: Optional[str] = None,
 697        model_parameters: Optional[Dict[str, MapValue]] = None,
 698        usage_details: Optional[Dict[str, int]] = None,
 699        cost_details: Optional[Dict[str, float]] = None,
 700        prompt: Optional[PromptClient] = None,
 701    ) -> Union[
 702        LangfuseSpan,
 703        LangfuseGeneration,
 704        LangfuseAgent,
 705        LangfuseTool,
 706        LangfuseChain,
 707        LangfuseRetriever,
 708        LangfuseEvaluator,
 709        LangfuseEmbedding,
 710        LangfuseGuardrail,
 711    ]:
 712        """Create the appropriate observation type from an OTEL span."""
 713        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 714            observation_class = self._get_span_class(as_type)
 715            # Type ignore needed because the internal _get_span_class helper is not overloaded:
 716            # it may also return LangfuseEvent, and the observation classes take different arguments.
 717            return observation_class(  # type: ignore[return-value,call-arg]
 718                otel_span=otel_span,
 719                langfuse_client=self,
 720                environment=self._environment,
 721                input=input,
 722                output=output,
 723                metadata=metadata,
 724                version=version,
 725                level=level,
 726                status_message=status_message,
 727                completion_start_time=completion_start_time,
 728                model=model,
 729                model_parameters=model_parameters,
 730                usage_details=usage_details,
 731                cost_details=cost_details,
 732                prompt=prompt,
 733            )
 734        else:
 735            # For other types (e.g. span, guardrail), create appropriate class without generation properties
 736            observation_class = self._get_span_class(as_type)
 737            # Type ignore needed because the internal _get_span_class helper is not overloaded:
 738            # it may also return LangfuseEvent, and the observation classes take different arguments.
 739            return observation_class(  # type: ignore[return-value,call-arg]
 740                otel_span=otel_span,
 741                langfuse_client=self,
 742                environment=self._environment,
 743                input=input,
 744                output=output,
 745                metadata=metadata,
 746                version=version,
 747                level=level,
 748                status_message=status_message,
 749            )
 753
 754    def start_generation(
 755        self,
 756        *,
 757        trace_context: Optional[TraceContext] = None,
 758        name: str,
 759        input: Optional[Any] = None,
 760        output: Optional[Any] = None,
 761        metadata: Optional[Any] = None,
 762        version: Optional[str] = None,
 763        level: Optional[SpanLevel] = None,
 764        status_message: Optional[str] = None,
 765        completion_start_time: Optional[datetime] = None,
 766        model: Optional[str] = None,
 767        model_parameters: Optional[Dict[str, MapValue]] = None,
 768        usage_details: Optional[Dict[str, int]] = None,
 769        cost_details: Optional[Dict[str, float]] = None,
 770        prompt: Optional[PromptClient] = None,
 771    ) -> LangfuseGeneration:
 772        """Create a new generation span for model generations.
 773
 774        DEPRECATED: This method is deprecated and will be removed in a future version.
 775        Use start_observation(as_type='generation') instead.
 776
 777        This method creates a specialized span for tracking model generations.
 778        It includes additional fields specific to model generations such as model name,
 779        token usage, and cost details.
 780
 781        The created generation span will be the child of the current span in the context.
 782
 783        Args:
 784            trace_context: Optional context for connecting to an existing trace
 785            name: Name of the generation operation
 786            input: Input data for the model (e.g., prompts)
 787            output: Output from the model (e.g., completions)
 788            metadata: Additional metadata to associate with the generation
 789            version: Version identifier for the model or component
 790            level: Importance level of the generation (info, warning, error)
 791            status_message: Optional status message for the generation
 792            completion_start_time: When the model started generating the response
 793            model: Name/identifier of the AI model used (e.g., "gpt-4")
 794            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 795            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 796            cost_details: Cost information for the model call
 797            prompt: Associated prompt template from Langfuse prompt management
 798
 799        Returns:
 800            A LangfuseGeneration object that must be ended with .end() when complete
 801
 802        Example:
 803            ```python
 804            generation = langfuse.start_generation(
 805                name="answer-generation",
 806                model="gpt-4",
 807                input={"prompt": "Explain quantum computing"},
 808                model_parameters={"temperature": 0.7}
 809            )
 810            try:
 811                # Call model API
 812                response = llm.generate(...)
 813
 814                generation.update(
 815                    output=response.text,
 816                    usage_details={
 817                        "prompt_tokens": response.usage.prompt_tokens,
 818                        "completion_tokens": response.usage.completion_tokens
 819                    }
 820                )
 821            finally:
 822                generation.end()
 823            ```
 824        """
 825        warnings.warn(
 826            "start_generation is deprecated and will be removed in a future version. "
 827            "Use start_observation(as_type='generation') instead.",
 828            DeprecationWarning,
 829            stacklevel=2,
 830        )
 831        return self.start_observation(
 832            trace_context=trace_context,
 833            name=name,
 834            as_type="generation",
 835            input=input,
 836            output=output,
 837            metadata=metadata,
 838            version=version,
 839            level=level,
 840            status_message=status_message,
 841            completion_start_time=completion_start_time,
 842            model=model,
 843            model_parameters=model_parameters,
 844            usage_details=usage_details,
 845            cost_details=cost_details,
 846            prompt=prompt,
 847        )
 848
 849    def start_as_current_generation(
 850        self,
 851        *,
 852        trace_context: Optional[TraceContext] = None,
 853        name: str,
 854        input: Optional[Any] = None,
 855        output: Optional[Any] = None,
 856        metadata: Optional[Any] = None,
 857        version: Optional[str] = None,
 858        level: Optional[SpanLevel] = None,
 859        status_message: Optional[str] = None,
 860        completion_start_time: Optional[datetime] = None,
 861        model: Optional[str] = None,
 862        model_parameters: Optional[Dict[str, MapValue]] = None,
 863        usage_details: Optional[Dict[str, int]] = None,
 864        cost_details: Optional[Dict[str, float]] = None,
 865        prompt: Optional[PromptClient] = None,
 866        end_on_exit: Optional[bool] = None,
 867    ) -> _AgnosticContextManager[LangfuseGeneration]:
 868        """Create a new generation span and set it as the current span in a context manager.
 869
 870        DEPRECATED: This method is deprecated and will be removed in a future version.
 871        Use start_as_current_observation(as_type='generation') instead.
 872
 873        This method creates a specialized span for model generations and sets it as the
 874        current span within a context manager. Use this method with a 'with' statement to
 875        automatically handle the generation span lifecycle within a code block.
 876
 877        The created generation span will be the child of the current span in the context.
 878
 879        Args:
 880            trace_context: Optional context for connecting to an existing trace
 881            name: Name of the generation operation
 882            input: Input data for the model (e.g., prompts)
 883            output: Output from the model (e.g., completions)
 884            metadata: Additional metadata to associate with the generation
 885            version: Version identifier for the model or component
 886            level: Importance level of the generation (info, warning, error)
 887            status_message: Optional status message for the generation
 888            completion_start_time: When the model started generating the response
 889            model: Name/identifier of the AI model used (e.g., "gpt-4")
 890            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 891            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 892            cost_details: Cost information for the model call
 893            prompt: Associated prompt template from Langfuse prompt management
 894            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 895
 896        Returns:
 897            A context manager that yields a LangfuseGeneration
 898
 899        Example:
 900            ```python
 901            with langfuse.start_as_current_generation(
 902                name="answer-generation",
 903                model="gpt-4",
 904                input={"prompt": "Explain quantum computing"}
 905            ) as generation:
 906                # Call model API
 907                response = llm.generate(...)
 908
 909                # Update with results
 910                generation.update(
 911                    output=response.text,
 912                    usage_details={
 913                        "prompt_tokens": response.usage.prompt_tokens,
 914                        "completion_tokens": response.usage.completion_tokens
 915                    }
 916                )
 917            ```
 918        """
 919        warnings.warn(
 920            "start_as_current_generation is deprecated and will be removed in a future version. "
 921            "Use start_as_current_observation(as_type='generation') instead.",
 922            DeprecationWarning,
 923            stacklevel=2,
 924        )
 925        return self.start_as_current_observation(
 926            trace_context=trace_context,
 927            name=name,
 928            as_type="generation",
 929            input=input,
 930            output=output,
 931            metadata=metadata,
 932            version=version,
 933            level=level,
 934            status_message=status_message,
 935            completion_start_time=completion_start_time,
 936            model=model,
 937            model_parameters=model_parameters,
 938            usage_details=usage_details,
 939            cost_details=cost_details,
 940            prompt=prompt,
 941            end_on_exit=end_on_exit,
 942        )
 943
 944    @overload
 945    def start_as_current_observation(
 946        self,
 947        *,
 948        trace_context: Optional[TraceContext] = None,
 949        name: str,
 950        as_type: Literal["generation"],
 951        input: Optional[Any] = None,
 952        output: Optional[Any] = None,
 953        metadata: Optional[Any] = None,
 954        version: Optional[str] = None,
 955        level: Optional[SpanLevel] = None,
 956        status_message: Optional[str] = None,
 957        completion_start_time: Optional[datetime] = None,
 958        model: Optional[str] = None,
 959        model_parameters: Optional[Dict[str, MapValue]] = None,
 960        usage_details: Optional[Dict[str, int]] = None,
 961        cost_details: Optional[Dict[str, float]] = None,
 962        prompt: Optional[PromptClient] = None,
 963        end_on_exit: Optional[bool] = None,
 964    ) -> _AgnosticContextManager[LangfuseGeneration]: ...
 965
 966    @overload
 967    def start_as_current_observation(
 968        self,
 969        *,
 970        trace_context: Optional[TraceContext] = None,
 971        name: str,
 972        as_type: Literal["span"] = "span",
 973        input: Optional[Any] = None,
 974        output: Optional[Any] = None,
 975        metadata: Optional[Any] = None,
 976        version: Optional[str] = None,
 977        level: Optional[SpanLevel] = None,
 978        status_message: Optional[str] = None,
 979        end_on_exit: Optional[bool] = None,
 980    ) -> _AgnosticContextManager[LangfuseSpan]: ...
 981
 982    @overload
 983    def start_as_current_observation(
 984        self,
 985        *,
 986        trace_context: Optional[TraceContext] = None,
 987        name: str,
 988        as_type: Literal["agent"],
 989        input: Optional[Any] = None,
 990        output: Optional[Any] = None,
 991        metadata: Optional[Any] = None,
 992        version: Optional[str] = None,
 993        level: Optional[SpanLevel] = None,
 994        status_message: Optional[str] = None,
 995        end_on_exit: Optional[bool] = None,
 996    ) -> _AgnosticContextManager[LangfuseAgent]: ...
 997
 998    @overload
 999    def start_as_current_observation(
1000        self,
1001        *,
1002        trace_context: Optional[TraceContext] = None,
1003        name: str,
1004        as_type: Literal["tool"],
1005        input: Optional[Any] = None,
1006        output: Optional[Any] = None,
1007        metadata: Optional[Any] = None,
1008        version: Optional[str] = None,
1009        level: Optional[SpanLevel] = None,
1010        status_message: Optional[str] = None,
1011        end_on_exit: Optional[bool] = None,
1012    ) -> _AgnosticContextManager[LangfuseTool]: ...
1013
1014    @overload
1015    def start_as_current_observation(
1016        self,
1017        *,
1018        trace_context: Optional[TraceContext] = None,
1019        name: str,
1020        as_type: Literal["chain"],
1021        input: Optional[Any] = None,
1022        output: Optional[Any] = None,
1023        metadata: Optional[Any] = None,
1024        version: Optional[str] = None,
1025        level: Optional[SpanLevel] = None,
1026        status_message: Optional[str] = None,
1027        end_on_exit: Optional[bool] = None,
1028    ) -> _AgnosticContextManager[LangfuseChain]: ...
1029
1030    @overload
1031    def start_as_current_observation(
1032        self,
1033        *,
1034        trace_context: Optional[TraceContext] = None,
1035        name: str,
1036        as_type: Literal["retriever"],
1037        input: Optional[Any] = None,
1038        output: Optional[Any] = None,
1039        metadata: Optional[Any] = None,
1040        version: Optional[str] = None,
1041        level: Optional[SpanLevel] = None,
1042        status_message: Optional[str] = None,
1043        end_on_exit: Optional[bool] = None,
1044    ) -> _AgnosticContextManager[LangfuseRetriever]: ...
1045
1046    @overload
1047    def start_as_current_observation(
1048        self,
1049        *,
1050        trace_context: Optional[TraceContext] = None,
1051        name: str,
1052        as_type: Literal["evaluator"],
1053        input: Optional[Any] = None,
1054        output: Optional[Any] = None,
1055        metadata: Optional[Any] = None,
1056        version: Optional[str] = None,
1057        level: Optional[SpanLevel] = None,
1058        status_message: Optional[str] = None,
1059        end_on_exit: Optional[bool] = None,
1060    ) -> _AgnosticContextManager[LangfuseEvaluator]: ...
1061
1062    @overload
1063    def start_as_current_observation(
1064        self,
1065        *,
1066        trace_context: Optional[TraceContext] = None,
1067        name: str,
1068        as_type: Literal["embedding"],
1069        input: Optional[Any] = None,
1070        output: Optional[Any] = None,
1071        metadata: Optional[Any] = None,
1072        version: Optional[str] = None,
1073        level: Optional[SpanLevel] = None,
1074        status_message: Optional[str] = None,
1075        completion_start_time: Optional[datetime] = None,
1076        model: Optional[str] = None,
1077        model_parameters: Optional[Dict[str, MapValue]] = None,
1078        usage_details: Optional[Dict[str, int]] = None,
1079        cost_details: Optional[Dict[str, float]] = None,
1080        prompt: Optional[PromptClient] = None,
1081        end_on_exit: Optional[bool] = None,
1082    ) -> _AgnosticContextManager[LangfuseEmbedding]: ...
1083
1084    @overload
1085    def start_as_current_observation(
1086        self,
1087        *,
1088        trace_context: Optional[TraceContext] = None,
1089        name: str,
1090        as_type: Literal["guardrail"],
1091        input: Optional[Any] = None,
1092        output: Optional[Any] = None,
1093        metadata: Optional[Any] = None,
1094        version: Optional[str] = None,
1095        level: Optional[SpanLevel] = None,
1096        status_message: Optional[str] = None,
1097        end_on_exit: Optional[bool] = None,
1098    ) -> _AgnosticContextManager[LangfuseGuardrail]: ...
1099
1100    def start_as_current_observation(
1101        self,
1102        *,
1103        trace_context: Optional[TraceContext] = None,
1104        name: str,
1105        as_type: ObservationTypeLiteralNoEvent = "span",
1106        input: Optional[Any] = None,
1107        output: Optional[Any] = None,
1108        metadata: Optional[Any] = None,
1109        version: Optional[str] = None,
1110        level: Optional[SpanLevel] = None,
1111        status_message: Optional[str] = None,
1112        completion_start_time: Optional[datetime] = None,
1113        model: Optional[str] = None,
1114        model_parameters: Optional[Dict[str, MapValue]] = None,
1115        usage_details: Optional[Dict[str, int]] = None,
1116        cost_details: Optional[Dict[str, float]] = None,
1117        prompt: Optional[PromptClient] = None,
1118        end_on_exit: Optional[bool] = None,
1119    ) -> Union[
1120        _AgnosticContextManager[LangfuseGeneration],
1121        _AgnosticContextManager[LangfuseSpan],
1122        _AgnosticContextManager[LangfuseAgent],
1123        _AgnosticContextManager[LangfuseTool],
1124        _AgnosticContextManager[LangfuseChain],
1125        _AgnosticContextManager[LangfuseRetriever],
1126        _AgnosticContextManager[LangfuseEvaluator],
1127        _AgnosticContextManager[LangfuseEmbedding],
1128        _AgnosticContextManager[LangfuseGuardrail],
1129    ]:
1130        """Create a new observation and set it as the current span in a context manager.
1131
1132        This method creates a new observation of the specified type and sets it as the
1133        current span within a context manager. Use this method with a 'with' statement to
1134        automatically handle the observation lifecycle within a code block.
1135
1136        The created observation will be the child of the current span in the context.
1137
1138        Args:
1139            trace_context: Optional context for connecting to an existing trace
1140            name: Name of the observation (e.g., function or operation name)
1141            as_type: Type of observation to create (defaults to "span")
1142            input: Input data for the operation (can be any JSON-serializable object)
1143            output: Output data from the operation (can be any JSON-serializable object)
1144            metadata: Additional metadata to associate with the observation
1145            version: Version identifier for the code or component
1146            level: Importance level of the observation (info, warning, error)
1147            status_message: Optional status message for the observation
1148            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
1149
1150            The following parameters are available when as_type is "generation" or "embedding":
1151            completion_start_time: When the model started generating the response
1152            model: Name/identifier of the AI model used (e.g., "gpt-4")
1153            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1154            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1155            cost_details: Cost information for the model call
1156            prompt: Associated prompt template from Langfuse prompt management
1157
1158        Returns:
1159            A context manager that yields the appropriate observation type based on as_type
1160
1161        Example:
1162            ```python
1163            # Create a span
1164            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
1165                # Do work
1166                result = process_data()
1167                span.update(output=result)
1168
1169                # Create a child span automatically
1170                with span.start_as_current_span(name="sub-operation") as child_span:
1171                    # Do sub-operation work
1172                    child_span.update(output="sub-result")
1173
1174            # Create a tool observation
1175            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
1176                # Do tool work
1177                results = search_web(query)
1178                tool.update(output=results)
1179
1180            # Create a generation observation
1181            with langfuse.start_as_current_observation(
1182                name="answer-generation",
1183                as_type="generation",
1184                model="gpt-4"
1185            ) as generation:
1186                # Generate answer
1187                response = llm.generate(...)
1188                generation.update(output=response)
1189            ```
1190        """
1191        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
1192            if trace_context:
1193                trace_id = trace_context.get("trace_id", None)
1194                parent_span_id = trace_context.get("parent_span_id", None)
1195
1196                if trace_id:
1197                    remote_parent_span = self._create_remote_parent_span(
1198                        trace_id=trace_id, parent_span_id=parent_span_id
1199                    )
1200
1201                    return cast(
1202                        Union[
1203                            _AgnosticContextManager[LangfuseGeneration],
1204                            _AgnosticContextManager[LangfuseEmbedding],
1205                        ],
1206                        self._create_span_with_parent_context(
1207                            as_type=as_type,
1208                            name=name,
1209                            remote_parent_span=remote_parent_span,
1210                            parent=None,
1211                            end_on_exit=end_on_exit,
1212                            input=input,
1213                            output=output,
1214                            metadata=metadata,
1215                            version=version,
1216                            level=level,
1217                            status_message=status_message,
1218                            completion_start_time=completion_start_time,
1219                            model=model,
1220                            model_parameters=model_parameters,
1221                            usage_details=usage_details,
1222                            cost_details=cost_details,
1223                            prompt=prompt,
1224                        ),
1225                    )
1226
1227            return cast(
1228                Union[
1229                    _AgnosticContextManager[LangfuseGeneration],
1230                    _AgnosticContextManager[LangfuseEmbedding],
1231                ],
1232                self._start_as_current_otel_span_with_processed_media(
1233                    as_type=as_type,
1234                    name=name,
1235                    end_on_exit=end_on_exit,
1236                    input=input,
1237                    output=output,
1238                    metadata=metadata,
1239                    version=version,
1240                    level=level,
1241                    status_message=status_message,
1242                    completion_start_time=completion_start_time,
1243                    model=model,
1244                    model_parameters=model_parameters,
1245                    usage_details=usage_details,
1246                    cost_details=cost_details,
1247                    prompt=prompt,
1248                ),
1249            )
1250
1251        if as_type in get_observation_types_list(ObservationTypeSpanLike):
1252            if trace_context:
1253                trace_id = trace_context.get("trace_id", None)
1254                parent_span_id = trace_context.get("parent_span_id", None)
1255
1256                if trace_id:
1257                    remote_parent_span = self._create_remote_parent_span(
1258                        trace_id=trace_id, parent_span_id=parent_span_id
1259                    )
1260
1261                    return cast(
1262                        Union[
1263                            _AgnosticContextManager[LangfuseSpan],
1264                            _AgnosticContextManager[LangfuseAgent],
1265                            _AgnosticContextManager[LangfuseTool],
1266                            _AgnosticContextManager[LangfuseChain],
1267                            _AgnosticContextManager[LangfuseRetriever],
1268                            _AgnosticContextManager[LangfuseEvaluator],
1269                            _AgnosticContextManager[LangfuseGuardrail],
1270                        ],
1271                        self._create_span_with_parent_context(
1272                            as_type=as_type,
1273                            name=name,
1274                            remote_parent_span=remote_parent_span,
1275                            parent=None,
1276                            end_on_exit=end_on_exit,
1277                            input=input,
1278                            output=output,
1279                            metadata=metadata,
1280                            version=version,
1281                            level=level,
1282                            status_message=status_message,
1283                        ),
1284                    )
1285
1286            return cast(
1287                Union[
1288                    _AgnosticContextManager[LangfuseSpan],
1289                    _AgnosticContextManager[LangfuseAgent],
1290                    _AgnosticContextManager[LangfuseTool],
1291                    _AgnosticContextManager[LangfuseChain],
1292                    _AgnosticContextManager[LangfuseRetriever],
1293                    _AgnosticContextManager[LangfuseEvaluator],
1294                    _AgnosticContextManager[LangfuseGuardrail],
1295                ],
1296                self._start_as_current_otel_span_with_processed_media(
1297                    as_type=as_type,
1298                    name=name,
1299                    end_on_exit=end_on_exit,
1300                    input=input,
1301                    output=output,
1302                    metadata=metadata,
1303                    version=version,
1304                    level=level,
1305                    status_message=status_message,
1306                ),
1307            )
1308
1309        # This should never be reached since all valid types are handled above
1310        langfuse_logger.warning(
1311            f"Unknown observation type: {as_type}, falling back to span"
1312        )
1313        return self._start_as_current_otel_span_with_processed_media(
1314            as_type="span",
1315            name=name,
1316            end_on_exit=end_on_exit,
1317            input=input,
1318            output=output,
1319            metadata=metadata,
1320            version=version,
1321            level=level,
1322            status_message=status_message,
1323        )
1324
1325    def _get_span_class(
1326        self,
1327        as_type: ObservationTypeLiteral,
1328    ) -> Union[
1329        Type[LangfuseAgent],
1330        Type[LangfuseTool],
1331        Type[LangfuseChain],
1332        Type[LangfuseRetriever],
1333        Type[LangfuseEvaluator],
1334        Type[LangfuseEmbedding],
1335        Type[LangfuseGuardrail],
1336        Type[LangfuseGeneration],
1337        Type[LangfuseEvent],
1338        Type[LangfuseSpan],
1339    ]:
1340        """Get the appropriate span class based on as_type."""
1341        normalized_type = as_type.lower()
1342
1343        if normalized_type == "agent":
1344            return LangfuseAgent
1345        elif normalized_type == "tool":
1346            return LangfuseTool
1347        elif normalized_type == "chain":
1348            return LangfuseChain
1349        elif normalized_type == "retriever":
1350            return LangfuseRetriever
1351        elif normalized_type == "evaluator":
1352            return LangfuseEvaluator
1353        elif normalized_type == "embedding":
1354            return LangfuseEmbedding
1355        elif normalized_type == "guardrail":
1356            return LangfuseGuardrail
1357        elif normalized_type == "generation":
1358            return LangfuseGeneration
1359        elif normalized_type == "event":
1360            return LangfuseEvent
1361        elif normalized_type == "span":
1362            return LangfuseSpan
1363        else:
1364            return LangfuseSpan
1365
1366    @_agnosticcontextmanager
1367    def _create_span_with_parent_context(
1368        self,
1369        *,
1370        name: str,
1371        parent: Optional[otel_trace_api.Span] = None,
1372        remote_parent_span: Optional[otel_trace_api.Span] = None,
1373        as_type: ObservationTypeLiteralNoEvent,
1374        end_on_exit: Optional[bool] = None,
1375        input: Optional[Any] = None,
1376        output: Optional[Any] = None,
1377        metadata: Optional[Any] = None,
1378        version: Optional[str] = None,
1379        level: Optional[SpanLevel] = None,
1380        status_message: Optional[str] = None,
1381        completion_start_time: Optional[datetime] = None,
1382        model: Optional[str] = None,
1383        model_parameters: Optional[Dict[str, MapValue]] = None,
1384        usage_details: Optional[Dict[str, int]] = None,
1385        cost_details: Optional[Dict[str, float]] = None,
1386        prompt: Optional[PromptClient] = None,
1387    ) -> Any:
1388        parent_span = parent or cast(otel_trace_api.Span, remote_parent_span)
1389
1390        with otel_trace_api.use_span(parent_span):
1391            with self._start_as_current_otel_span_with_processed_media(
1392                name=name,
1393                as_type=as_type,
1394                end_on_exit=end_on_exit,
1395                input=input,
1396                output=output,
1397                metadata=metadata,
1398                version=version,
1399                level=level,
1400                status_message=status_message,
1401                completion_start_time=completion_start_time,
1402                model=model,
1403                model_parameters=model_parameters,
1404                usage_details=usage_details,
1405                cost_details=cost_details,
1406                prompt=prompt,
1407            ) as langfuse_span:
1408                if remote_parent_span is not None:
1409                    langfuse_span._otel_span.set_attribute(
1410                        LangfuseOtelSpanAttributes.AS_ROOT, True
1411                    )
1412
1413                yield langfuse_span
1414
1415    @_agnosticcontextmanager
1416    def _start_as_current_otel_span_with_processed_media(
1417        self,
1418        *,
1419        name: str,
1420        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
1421        end_on_exit: Optional[bool] = None,
1422        input: Optional[Any] = None,
1423        output: Optional[Any] = None,
1424        metadata: Optional[Any] = None,
1425        version: Optional[str] = None,
1426        level: Optional[SpanLevel] = None,
1427        status_message: Optional[str] = None,
1428        completion_start_time: Optional[datetime] = None,
1429        model: Optional[str] = None,
1430        model_parameters: Optional[Dict[str, MapValue]] = None,
1431        usage_details: Optional[Dict[str, int]] = None,
1432        cost_details: Optional[Dict[str, float]] = None,
1433        prompt: Optional[PromptClient] = None,
1434    ) -> Any:
1435        with self._otel_tracer.start_as_current_span(
1436            name=name,
1437            end_on_exit=end_on_exit if end_on_exit is not None else True,
1438        ) as otel_span:
1439            span_class = self._get_span_class(
1440                as_type or "generation"
1441            )  # fall back to "generation" when as_type is not provided
1442            common_args = {
1443                "otel_span": otel_span,
1444                "langfuse_client": self,
1445                "environment": self._environment,
1446                "input": input,
1447                "output": output,
1448                "metadata": metadata,
1449                "version": version,
1450                "level": level,
1451                "status_message": status_message,
1452            }
1453
1454            if span_class in [
1455                LangfuseGeneration,
1456                LangfuseEmbedding,
1457            ]:
1458                common_args.update(
1459                    {
1460                        "completion_start_time": completion_start_time,
1461                        "model": model,
1462                        "model_parameters": model_parameters,
1463                        "usage_details": usage_details,
1464                        "cost_details": cost_details,
1465                        "prompt": prompt,
1466                    }
1467                )
1468            # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed
1469
1470            yield span_class(**common_args)  # type: ignore[arg-type]
1471
1472    def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]:
1473        current_span = otel_trace_api.get_current_span()
1474
1475        if current_span is otel_trace_api.INVALID_SPAN:
1476            langfuse_logger.warning(
1477                "Context error: No active span in current context. Operations that depend on an active span will be skipped. "
1478                "Ensure spans are created with start_as_current_span() or that you're operating within an active span context."
1479            )
1480            return None
1481
1482        return current_span
1483
1484    def update_current_generation(
1485        self,
1486        *,
1487        name: Optional[str] = None,
1488        input: Optional[Any] = None,
1489        output: Optional[Any] = None,
1490        metadata: Optional[Any] = None,
1491        version: Optional[str] = None,
1492        level: Optional[SpanLevel] = None,
1493        status_message: Optional[str] = None,
1494        completion_start_time: Optional[datetime] = None,
1495        model: Optional[str] = None,
1496        model_parameters: Optional[Dict[str, MapValue]] = None,
1497        usage_details: Optional[Dict[str, int]] = None,
1498        cost_details: Optional[Dict[str, float]] = None,
1499        prompt: Optional[PromptClient] = None,
1500    ) -> None:
1501        """Update the current active generation span with new information.
1502
1503        This method updates the current generation span in the active context with
1504        additional information. It's useful for adding output, usage stats, or other
1505        details that become available during or after model generation.
1506
1507        Args:
1508            name: The generation name
1509            input: Updated input data for the model
1510            output: Output from the model (e.g., completions)
1511            metadata: Additional metadata to associate with the generation
1512            version: Version identifier for the model or component
1513            level: Importance level of the generation (info, warning, error)
1514            status_message: Optional status message for the generation
1515            completion_start_time: When the model started generating the response
1516            model: Name/identifier of the AI model used (e.g., "gpt-4")
1517            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1518            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1519            cost_details: Cost information for the model call
1520            prompt: Associated prompt template from Langfuse prompt management
1521
1522        Example:
1523            ```python
1524            with langfuse.start_as_current_generation(name="answer-query") as generation:
1525                # Initial setup and API call
1526                response = llm.generate(...)
1527
1528                # Update with results that weren't available at creation time
1529                langfuse.update_current_generation(
1530                    output=response.text,
1531                    usage_details={
1532                        "prompt_tokens": response.usage.prompt_tokens,
1533                        "completion_tokens": response.usage.completion_tokens
1534                    }
1535                )
1536            ```
1537        """
1538        if not self._tracing_enabled:
1539            langfuse_logger.debug(
1540                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1541            )
1542            return
1543
1544        current_otel_span = self._get_current_otel_span()
1545
1546        if current_otel_span is not None:
1547            generation = LangfuseGeneration(
1548                otel_span=current_otel_span, langfuse_client=self
1549            )
1550
1551            if name:
1552                current_otel_span.update_name(name)
1553
1554            generation.update(
1555                input=input,
1556                output=output,
1557                metadata=metadata,
1558                version=version,
1559                level=level,
1560                status_message=status_message,
1561                completion_start_time=completion_start_time,
1562                model=model,
1563                model_parameters=model_parameters,
1564                usage_details=usage_details,
1565                cost_details=cost_details,
1566                prompt=prompt,
1567            )
1568
1569    def update_current_span(
1570        self,
1571        *,
1572        name: Optional[str] = None,
1573        input: Optional[Any] = None,
1574        output: Optional[Any] = None,
1575        metadata: Optional[Any] = None,
1576        version: Optional[str] = None,
1577        level: Optional[SpanLevel] = None,
1578        status_message: Optional[str] = None,
1579    ) -> None:
1580        """Update the current active span with new information.
1581
1582        This method updates the current span in the active context with
1583        additional information. It's useful for adding outputs or metadata
1584        that become available during execution.
1585
1586        Args:
1587            name: The span name
1588            input: Updated input data for the operation
1589            output: Output data from the operation
1590            metadata: Additional metadata to associate with the span
1591            version: Version identifier for the code or component
1592            level: Importance level of the span (info, warning, error)
1593            status_message: Optional status message for the span
1594
1595        Example:
1596            ```python
1597            with langfuse.start_as_current_span(name="process-data") as span:
1598                # Initial processing
1599                result = process_first_part()
1600
1601                # Update with intermediate results
1602                langfuse.update_current_span(metadata={"intermediate_result": result})
1603
1604                # Continue processing
1605                final_result = process_second_part(result)
1606
1607                # Final update
1608                langfuse.update_current_span(output=final_result)
1609            ```
1610        """
1611        if not self._tracing_enabled:
1612            langfuse_logger.debug(
1613                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1614            )
1615            return
1616
1617        current_otel_span = self._get_current_otel_span()
1618
1619        if current_otel_span is not None:
1620            span = LangfuseSpan(
1621                otel_span=current_otel_span,
1622                langfuse_client=self,
1623                environment=self._environment,
1624            )
1625
1626            if name:
1627                current_otel_span.update_name(name)
1628
1629            span.update(
1630                input=input,
1631                output=output,
1632                metadata=metadata,
1633                version=version,
1634                level=level,
1635                status_message=status_message,
1636            )
1637
1638    def update_current_trace(
1639        self,
1640        *,
1641        name: Optional[str] = None,
1642        user_id: Optional[str] = None,
1643        session_id: Optional[str] = None,
1644        version: Optional[str] = None,
1645        input: Optional[Any] = None,
1646        output: Optional[Any] = None,
1647        metadata: Optional[Any] = None,
1648        tags: Optional[List[str]] = None,
1649        public: Optional[bool] = None,
1650    ) -> None:
1651        """Update the current trace with additional information.
1652
1653        Args:
1654            name: Updated name for the Langfuse trace
1655            user_id: ID of the user who initiated the Langfuse trace
1656            session_id: Session identifier for grouping related Langfuse traces
1657            version: Version identifier for the application or service
1658            input: Input data for the overall Langfuse trace
1659            output: Output data from the overall Langfuse trace
1660            metadata: Additional metadata to associate with the Langfuse trace
1661            tags: List of tags to categorize the Langfuse trace
1662            public: Whether the Langfuse trace should be publicly accessible
1663
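            Example:
                A minimal sketch; the span name, user ID, session ID, and tag below
                are placeholder values:

                ```python
                with langfuse.start_as_current_span(name="handle-request"):
                    langfuse.update_current_trace(
                        name="user-request",
                        user_id="user-123",
                        session_id="session-abc",
                        tags=["production"],
                    )
                ```
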
1664        See Also:
1665            :func:`langfuse.propagate_attributes`: Recommended replacement
1666        """
1667        if not self._tracing_enabled:
1668            langfuse_logger.debug(
1669                "Operation skipped: update_current_trace - Tracing is disabled or client is in no-op mode."
1670            )
1671            return
1672
1673        current_otel_span = self._get_current_otel_span()
1674
1675        if current_otel_span is not None and current_otel_span.is_recording():
1676            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1677                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1678            )
1679            # We need to preserve the class to keep the correct observation type
1680            span_class = self._get_span_class(existing_observation_type)
1681            span = span_class(
1682                otel_span=current_otel_span,
1683                langfuse_client=self,
1684                environment=self._environment,
1685            )
1686
1687            span.update_trace(
1688                name=name,
1689                user_id=user_id,
1690                session_id=session_id,
1691                version=version,
1692                input=input,
1693                output=output,
1694                metadata=metadata,
1695                tags=tags,
1696                public=public,
1697            )
1698
1699    def create_event(
1700        self,
1701        *,
1702        trace_context: Optional[TraceContext] = None,
1703        name: str,
1704        input: Optional[Any] = None,
1705        output: Optional[Any] = None,
1706        metadata: Optional[Any] = None,
1707        version: Optional[str] = None,
1708        level: Optional[SpanLevel] = None,
1709        status_message: Optional[str] = None,
1710    ) -> LangfuseEvent:
1711        """Create a new Langfuse observation of type 'EVENT'.
1712
1713        The created Langfuse Event observation will be the child of the current span in the context.
1714
1715        Args:
1716            trace_context: Optional context for connecting to an existing trace
1717            name: Name of the event (e.g., function or operation name)
1718            input: Input data for the operation (can be any JSON-serializable object)
1719            output: Output data from the operation (can be any JSON-serializable object)
1720            metadata: Additional metadata to associate with the event
1721            version: Version identifier for the code or component
1722            level: Importance level of the event (info, warning, error)
1723            status_message: Optional status message for the event
1724
1725        Returns:
1726            The Langfuse Event object
1727
1728        Example:
1729            ```python
1730            event = langfuse.create_event(name="process-event")
1731            ```
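
                A fuller sketch with illustrative values (the event name and payloads
                are placeholders):

                ```python
                event = langfuse.create_event(
                    name="cache-lookup",
                    input={"key": "user-42"},
                    output={"hit": False},
                    metadata={"store": "redis"},
                )
                ```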
1732        """
1733        timestamp = time_ns()
1734
1735        if trace_context:
1736            trace_id = trace_context.get("trace_id", None)
1737            parent_span_id = trace_context.get("parent_span_id", None)
1738
1739            if trace_id:
1740                remote_parent_span = self._create_remote_parent_span(
1741                    trace_id=trace_id, parent_span_id=parent_span_id
1742                )
1743
1744                with otel_trace_api.use_span(
1745                    cast(otel_trace_api.Span, remote_parent_span)
1746                ):
1747                    otel_span = self._otel_tracer.start_span(
1748                        name=name, start_time=timestamp
1749                    )
1750                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1751
1752                    return cast(
1753                        LangfuseEvent,
1754                        LangfuseEvent(
1755                            otel_span=otel_span,
1756                            langfuse_client=self,
1757                            environment=self._environment,
1758                            input=input,
1759                            output=output,
1760                            metadata=metadata,
1761                            version=version,
1762                            level=level,
1763                            status_message=status_message,
1764                        ).end(end_time=timestamp),
1765                    )
1766
1767        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1768
1769        return cast(
1770            LangfuseEvent,
1771            LangfuseEvent(
1772                otel_span=otel_span,
1773                langfuse_client=self,
1774                environment=self._environment,
1775                input=input,
1776                output=output,
1777                metadata=metadata,
1778                version=version,
1779                level=level,
1780                status_message=status_message,
1781            ).end(end_time=timestamp),
1782        )
1783
1784    def _create_remote_parent_span(
1785        self, *, trace_id: str, parent_span_id: Optional[str]
1786    ) -> Any:
1787        if not self._is_valid_trace_id(trace_id):
1788            langfuse_logger.warning(
1789                f"Passed trace ID '{trace_id}' is not a valid 32-character lowercase hex Langfuse trace ID. Ignoring trace ID."
1790            )
1791
1792        if parent_span_id and not self._is_valid_span_id(parent_span_id):
1793            langfuse_logger.warning(
1794                f"Passed span ID '{parent_span_id}' is not a valid 16-character lowercase hex Langfuse span ID. Ignoring parent span ID."
1795            )
1796
1797        int_trace_id = int(trace_id, 16)
1798        int_parent_span_id = (
1799            int(parent_span_id, 16)
1800            if parent_span_id
1801            else RandomIdGenerator().generate_span_id()
1802        )
1803
1804        span_context = otel_trace_api.SpanContext(
1805            trace_id=int_trace_id,
1806            span_id=int_parent_span_id,
1807            trace_flags=otel_trace_api.TraceFlags(0x01),  # mark span as sampled
1808            is_remote=False,
1809        )
1810
1811        return otel_trace_api.NonRecordingSpan(span_context)
1812
1813    def _is_valid_trace_id(self, trace_id: str) -> bool:
1814        pattern = r"^[0-9a-f]{32}$"
1815
1816        return bool(re.match(pattern, trace_id))
1817
1818    def _is_valid_span_id(self, span_id: str) -> bool:
1819        pattern = r"^[0-9a-f]{16}$"
1820
1821        return bool(re.match(pattern, span_id))
1822
1823    def _create_observation_id(self, *, seed: Optional[str] = None) -> str:
1824        """Create a unique observation ID for use with Langfuse.
1825
1826        This method generates a unique observation ID (span ID in OpenTelemetry terms)
1827        for use with various Langfuse APIs. It can either generate a random ID or
1828        create a deterministic ID based on a seed string.
1829
1830        Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes.
1831        This method ensures the generated ID meets this requirement. If you need to
1832        correlate an external ID with a Langfuse observation ID, use the external ID as
1833        the seed to get a valid, deterministic observation ID.
1834
1835        Args:
1836            seed: Optional string to use as a seed for deterministic ID generation.
1837                 If provided, the same seed will always produce the same ID.
1838                 If not provided, a random ID will be generated.
1839
1840        Returns:
1841            A 16-character lowercase hexadecimal string representing the observation ID.
1842
1843        Example:
1844            ```python
1845            # Generate a random observation ID
1846            obs_id = langfuse.create_observation_id()
1847
1848            # Generate a deterministic ID based on a seed
1849            user_obs_id = langfuse.create_observation_id(seed="user-123-feedback")
1850
1851            # Correlate an external item ID with a Langfuse observation ID
1852            item_id = "item-789012"
1853            correlated_obs_id = langfuse.create_observation_id(seed=item_id)
1854
1855            # Use the ID with Langfuse APIs
1856            langfuse.create_score(
1857                name="relevance",
1858                value=0.95,
1859                trace_id=trace_id,
1860                observation_id=obs_id
1861            )
1862            ```
1863        """
1864        if not seed:
1865            span_id_int = RandomIdGenerator().generate_span_id()
1866
1867            return self._format_otel_span_id(span_id_int)
1868
1869        return sha256(seed.encode("utf-8")).digest()[:8].hex()
1870
1871    @staticmethod
1872    def create_trace_id(*, seed: Optional[str] = None) -> str:
1873        """Create a unique trace ID for use with Langfuse.
1874
1875        This method generates a unique trace ID for use with various Langfuse APIs.
1876        It can either generate a random ID or create a deterministic ID based on
1877        a seed string.
1878
1879        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1880        This method ensures the generated ID meets this requirement. If you need to
1881        correlate an external ID with a Langfuse trace ID, use the external ID as the
1882        seed to get a valid, deterministic Langfuse trace ID.
1883
1884        Args:
1885            seed: Optional string to use as a seed for deterministic ID generation.
1886                 If provided, the same seed will always produce the same ID.
1887                 If not provided, a random ID will be generated.
1888
1889        Returns:
1890            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1891
1892        Example:
1893            ```python
1894            # Generate a random trace ID
1895            trace_id = langfuse.create_trace_id()
1896
1897            # Generate a deterministic ID based on a seed
1898            session_trace_id = langfuse.create_trace_id(seed="session-456")
1899
1900            # Correlate an external ID with a Langfuse trace ID
1901            external_id = "external-system-123456"
1902            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1903
1904            # Use the ID with trace context
1905            with langfuse.start_as_current_span(
1906                name="process-request",
1907                trace_context={"trace_id": trace_id}
1908            ) as span:
1909                # Operation will be part of the specific trace
1910                pass
1911            ```
1912        """
1913        if not seed:
1914            trace_id_int = RandomIdGenerator().generate_trace_id()
1915
1916            return Langfuse._format_otel_trace_id(trace_id_int)
1917
1918        return sha256(seed.encode("utf-8")).digest()[:16].hex()
1919
1920    def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str:
1921        span_context = otel_span.get_span_context()
1922
1923        return self._format_otel_trace_id(span_context.trace_id)
1924
1925    def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str:
1926        span_context = otel_span.get_span_context()
1927
1928        return self._format_otel_span_id(span_context.span_id)
1929
1930    @staticmethod
1931    def _format_otel_span_id(span_id_int: int) -> str:
1932        """Format an integer span ID to a 16-character lowercase hex string.
1933
1934        Internal method to convert an OpenTelemetry integer span ID to the standard
1935        W3C Trace Context format (16-character lowercase hex string).
1936
1937        Args:
1938            span_id_int: 64-bit integer representing a span ID
1939
1940        Returns:
1941            A 16-character lowercase hexadecimal string
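
            Example:
                A worked value (12345 == 0x3039, zero-padded to 16 hex characters):

                ```python
                Langfuse._format_otel_span_id(12345)  # "0000000000003039"
                ```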
1942        """
1943        return format(span_id_int, "016x")
1944
1945    @staticmethod
1946    def _format_otel_trace_id(trace_id_int: int) -> str:
1947        """Format an integer trace ID to a 32-character lowercase hex string.
1948
1949        Internal method to convert an OpenTelemetry integer trace ID to the standard
1950        W3C Trace Context format (32-character lowercase hex string).
1951
1952        Args:
1953            trace_id_int: 128-bit integer representing a trace ID
1954
1955        Returns:
1956            A 32-character lowercase hexadecimal string
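
            Example:
                A worked value (12345 == 0x3039, zero-padded to 32 hex characters):

                ```python
                Langfuse._format_otel_trace_id(12345)  # "00000000000000000000000000003039"
                ```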
1957        """
1958        return format(trace_id_int, "032x")
1959
1960    @overload
1961    def create_score(
1962        self,
1963        *,
1964        name: str,
1965        value: float,
1966        session_id: Optional[str] = None,
1967        dataset_run_id: Optional[str] = None,
1968        trace_id: Optional[str] = None,
1969        observation_id: Optional[str] = None,
1970        score_id: Optional[str] = None,
1971        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1972        comment: Optional[str] = None,
1973        config_id: Optional[str] = None,
1974        metadata: Optional[Any] = None,
1975    ) -> None: ...
1976
1977    @overload
1978    def create_score(
1979        self,
1980        *,
1981        name: str,
1982        value: str,
1983        session_id: Optional[str] = None,
1984        dataset_run_id: Optional[str] = None,
1985        trace_id: Optional[str] = None,
1986        score_id: Optional[str] = None,
1987        observation_id: Optional[str] = None,
1988        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
1989        comment: Optional[str] = None,
1990        config_id: Optional[str] = None,
1991        metadata: Optional[Any] = None,
1992    ) -> None: ...
1993
1994    def create_score(
1995        self,
1996        *,
1997        name: str,
1998        value: Union[float, str],
1999        session_id: Optional[str] = None,
2000        dataset_run_id: Optional[str] = None,
2001        trace_id: Optional[str] = None,
2002        observation_id: Optional[str] = None,
2003        score_id: Optional[str] = None,
2004        data_type: Optional[ScoreDataType] = None,
2005        comment: Optional[str] = None,
2006        config_id: Optional[str] = None,
2007        metadata: Optional[Any] = None,
2008    ) -> None:
2009        """Create a score for a specific trace or observation.
2010
2011        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
2012        used to track quality metrics, user feedback, or automated evaluations.
2013
2014        Args:
2015            name: Name of the score (e.g., "relevance", "accuracy")
2016            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2017            session_id: ID of the Langfuse session to associate the score with
2018            dataset_run_id: ID of the Langfuse dataset run to associate the score with
2019            trace_id: ID of the Langfuse trace to associate the score with
2020            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
2021            score_id: Optional custom ID for the score (auto-generated if not provided)
2022            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2023            comment: Optional comment or explanation for the score
2024            config_id: Optional ID of a score config defined in Langfuse
2025            metadata: Optional metadata to be attached to the score
2026
2027        Example:
2028            ```python
2029            # Create a numeric score for accuracy
2030            langfuse.create_score(
2031                name="accuracy",
2032                value=0.92,
2033                trace_id="abcdef1234567890abcdef1234567890",
2034                data_type="NUMERIC",
2035                comment="High accuracy with minor irrelevant details"
2036            )
2037
2038            # Create a categorical score for sentiment
2039            langfuse.create_score(
2040                name="sentiment",
2041                value="positive",
2042                trace_id="abcdef1234567890abcdef1234567890",
2043                observation_id="abcdef1234567890",
2044                data_type="CATEGORICAL"
2045            )
2046            ```
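
                A session-level score is also possible (sketch; the score name and
                session ID are placeholders):

                ```python
                langfuse.create_score(
                    name="user_feedback",
                    value=1.0,
                    session_id="session-456",
                    data_type="BOOLEAN",
                    comment="User confirmed the answer was helpful"
                )
                ```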
2047        """
2048        if not self._tracing_enabled:
2049            return
2050
2051        score_id = score_id or self._create_observation_id()
2052
2053        try:
2054            new_body = ScoreBody(
2055                id=score_id,
2056                sessionId=session_id,
2057                datasetRunId=dataset_run_id,
2058                traceId=trace_id,
2059                observationId=observation_id,
2060                name=name,
2061                value=value,
2062                dataType=data_type,  # type: ignore
2063                comment=comment,
2064                configId=config_id,
2065                environment=self._environment,
2066                metadata=metadata,
2067            )
2068
2069            event = {
2070                "id": self.create_trace_id(),
2071                "type": "score-create",
2072                "timestamp": _get_timestamp(),
2073                "body": new_body,
2074            }
2075
2076            if self._resources is not None:
2077                # Force the score to be sampled if it targets a legacy trace ID, i.e. one that is not 32 hex chars
2078                force_sample = (
2079                    not self._is_valid_trace_id(trace_id) if trace_id else True
2080                )
2081
2082                self._resources.add_score_task(
2083                    event,
2084                    force_sample=force_sample,
2085                )
2086
2087        except Exception as e:
2088            langfuse_logger.exception(
2089                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
2090            )
2091
2092    @overload
2093    def score_current_span(
2094        self,
2095        *,
2096        name: str,
2097        value: float,
2098        score_id: Optional[str] = None,
2099        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
2100        comment: Optional[str] = None,
2101        config_id: Optional[str] = None,
2102    ) -> None: ...
2103
2104    @overload
2105    def score_current_span(
2106        self,
2107        *,
2108        name: str,
2109        value: str,
2110        score_id: Optional[str] = None,
2111        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
2112        comment: Optional[str] = None,
2113        config_id: Optional[str] = None,
2114    ) -> None: ...
2115
2116    def score_current_span(
2117        self,
2118        *,
2119        name: str,
2120        value: Union[float, str],
2121        score_id: Optional[str] = None,
2122        data_type: Optional[ScoreDataType] = None,
2123        comment: Optional[str] = None,
2124        config_id: Optional[str] = None,
2125    ) -> None:
2126        """Create a score for the current active span.
2127
2128        This method scores the currently active span in the context. It's a convenient
2129        way to score the current operation without needing to know its trace and span IDs.
2130
2131        Args:
2132            name: Name of the score (e.g., "relevance", "accuracy")
2133            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2134            score_id: Optional custom ID for the score (auto-generated if not provided)
2135            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2136            comment: Optional comment or explanation for the score
2137            config_id: Optional ID of a score config defined in Langfuse
2138
2139        Example:
2140            ```python
2141            with langfuse.start_as_current_generation(name="answer-query") as generation:
2142                # Generate answer
2143                response = generate_answer(...)
2144                generation.update(output=response)
2145
2146                # Score the generation
2147                langfuse.score_current_span(
2148                    name="relevance",
2149                    value=0.85,
2150                    data_type="NUMERIC",
2151                    comment="Mostly relevant but contains some tangential information"
2152                )
2153            ```
2154        """
2155        current_span = self._get_current_otel_span()
2156
2157        if current_span is not None:
2158            trace_id = self._get_otel_trace_id(current_span)
2159            observation_id = self._get_otel_span_id(current_span)
2160
2161            langfuse_logger.info(
2162                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
2163            )
2164
2165            self.create_score(
2166                trace_id=trace_id,
2167                observation_id=observation_id,
2168                name=name,
2169                value=cast(str, value),
2170                score_id=score_id,
2171                data_type=cast(Literal["CATEGORICAL"], data_type),
2172                comment=comment,
2173                config_id=config_id,
2174            )
2175
2176    @overload
2177    def score_current_trace(
2178        self,
2179        *,
2180        name: str,
2181        value: float,
2182        score_id: Optional[str] = None,
2183        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
2184        comment: Optional[str] = None,
2185        config_id: Optional[str] = None,
2186    ) -> None: ...
2187
2188    @overload
2189    def score_current_trace(
2190        self,
2191        *,
2192        name: str,
2193        value: str,
2194        score_id: Optional[str] = None,
2195        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
2196        comment: Optional[str] = None,
2197        config_id: Optional[str] = None,
2198    ) -> None: ...
2199
2200    def score_current_trace(
2201        self,
2202        *,
2203        name: str,
2204        value: Union[float, str],
2205        score_id: Optional[str] = None,
2206        data_type: Optional[ScoreDataType] = None,
2207        comment: Optional[str] = None,
2208        config_id: Optional[str] = None,
2209    ) -> None:
2210        """Create a score for the current trace.
2211
2212        This method scores the trace of the currently active span. Unlike score_current_span,
2213        this method associates the score with the entire trace rather than a specific span.
2214        It's useful for scoring overall performance or quality of the entire operation.
2215
2216        Args:
2217            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2218            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2219            score_id: Optional custom ID for the score (auto-generated if not provided)
2220            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2221            comment: Optional comment or explanation for the score
2222            config_id: Optional ID of a score config defined in Langfuse
2223
2224        Example:
2225            ```python
2226            with langfuse.start_as_current_span(name="process-user-request") as span:
2227                # Process request
2228                result = process_complete_request()
2229                span.update(output=result)
2230
2231                # Score the overall trace
2232                langfuse.score_current_trace(
2233                    name="overall_quality",
2234                    value=0.95,
2235                    data_type="NUMERIC",
2236                    comment="High quality end-to-end response"
2237                )
2238            ```
2239        """
2240        current_span = self._get_current_otel_span()
2241
2242        if current_span is not None:
2243            trace_id = self._get_otel_trace_id(current_span)
2244
2245            langfuse_logger.info(
2246                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2247            )
2248
2249            self.create_score(
2250                trace_id=trace_id,
2251                name=name,
2252                value=cast(str, value),
2253                score_id=score_id,
2254                data_type=cast(Literal["CATEGORICAL"], data_type),
2255                comment=comment,
2256                config_id=config_id,
2257            )
2258
2259    def flush(self) -> None:
2260        """Force flush all pending spans and events to the Langfuse API.
2261
2262        This method manually flushes any pending spans, scores, and other events to the
2263        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2264        before proceeding, without waiting for the automatic flush interval.
2265
2266        Example:
2267            ```python
2268            # Record some spans and scores
2269            with langfuse.start_as_current_span(name="operation") as span:
2270                # Do work...
2271                pass
2272
2273            # Ensure all data is sent to Langfuse before proceeding
2274            langfuse.flush()
2275
2276            # Continue with other work
2277            ```
2278        """
2279        if self._resources is not None:
2280            self._resources.flush()
2281
2282    def shutdown(self) -> None:
2283        """Shut down the Langfuse client and flush all pending data.
2284
2285        This method cleanly shuts down the Langfuse client, ensuring all pending data
2286        is flushed to the API and all background threads are properly terminated.
2287
2288        It's important to call this method when your application is shutting down to
2289        prevent data loss and resource leaks. For most applications, using the client
2290        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2291
2292        Example:
2293            ```python
2294            # Initialize Langfuse
2295            langfuse = Langfuse(public_key="...", secret_key="...")
2296
2297            # Use Langfuse throughout your application
2298            # ...
2299
2300            # When application is shutting down
2301            langfuse.shutdown()
2302            ```
2303        """
2304        if self._resources is not None:
2305            self._resources.shutdown()
2306
2307    def get_current_trace_id(self) -> Optional[str]:
2308        """Get the trace ID of the current active span.
2309
2310        This method retrieves the trace ID from the currently active span in the context.
2311        It can be used to get the trace ID for referencing in logs, external systems,
2312        or for creating related operations.
2313
2314        Returns:
2315            The current trace ID as a 32-character lowercase hexadecimal string,
2316            or None if there is no active span.
2317
2318        Example:
2319            ```python
2320            with langfuse.start_as_current_span(name="process-request") as span:
2321                # Get the current trace ID for reference
2322                trace_id = langfuse.get_current_trace_id()
2323
2324                # Use it for external correlation
2325                log.info(f"Processing request with trace_id: {trace_id}")
2326
2327                # Or pass to another system
2328                external_system.process(data, trace_id=trace_id)
2329            ```
2330        """
2331        if not self._tracing_enabled:
2332            langfuse_logger.debug(
2333                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2334            )
2335            return None
2336
2337        current_otel_span = self._get_current_otel_span()
2338
2339        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None
2340
2341    def get_current_observation_id(self) -> Optional[str]:
2342        """Get the observation ID (span ID) of the current active span.
2343
2344        This method retrieves the observation ID from the currently active span in the context.
2345        It can be used to get the observation ID for referencing in logs, external systems,
2346        or for creating scores or other related operations.
2347
2348        Returns:
2349            The current observation ID as a 16-character lowercase hexadecimal string,
2350            or None if there is no active span.
2351
2352        Example:
2353            ```python
2354            with langfuse.start_as_current_span(name="process-user-query") as span:
2355                # Get the current observation ID
2356                observation_id = langfuse.get_current_observation_id()
2357
2358                # Store it for later reference
2359                cache.set(f"query_{query_id}_observation", observation_id)
2360
2361                # Process the query...
2362            ```
2363        """
2364        if not self._tracing_enabled:
2365            langfuse_logger.debug(
2366                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2367            )
2368            return None
2369
2370        current_otel_span = self._get_current_otel_span()
2371
2372        return self._get_otel_span_id(current_otel_span) if current_otel_span else None
2373
2374    def _get_project_id(self) -> Optional[str]:
2375        """Fetch and return the current project ID. The value is cached after the first lookup. Returns None if no project ID is found for the configured API keys."""
2376        if not self._project_id:
2377            proj = self.api.projects.get()
2378            if not proj.data or not proj.data[0].id:
2379                return None
2380
2381            self._project_id = proj.data[0].id
2382
2383        return self._project_id
2384
2385    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2386        """Get the URL to view a trace in the Langfuse UI.
2387
2388        This method generates a URL that links directly to a trace in the Langfuse UI.
2389        It's useful for providing links in logs, notifications, or debugging tools.
2390
2391        Args:
2392            trace_id: Optional trace ID to generate a URL for. If not provided,
2393                     the trace ID of the current active span will be used.
2394
2395        Returns:
2396            A URL string pointing to the trace in the Langfuse UI,
2397            or None if the project ID couldn't be retrieved or no trace ID is available.
2398
2399        Example:
2400            ```python
2401            # Get URL for the current trace
2402            with langfuse.start_as_current_span(name="process-request") as span:
2403                trace_url = langfuse.get_trace_url()
2404                log.info(f"Processing trace: {trace_url}")
2405
2406            # Get URL for a specific trace
2407            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2408            send_notification(f"Review needed for trace: {specific_trace_url}")
2409            ```
2410        """
2411        project_id = self._get_project_id()
2412        final_trace_id = trace_id or self.get_current_trace_id()
2413
2414        return (
2415            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2416            if project_id and final_trace_id
2417            else None
2418        )
2419
2420    def get_dataset(
2421        self, name: str, *, fetch_items_page_size: Optional[int] = 50
2422    ) -> "DatasetClient":
2423        """Fetch a dataset by its name.
2424
2425        Args:
2426            name (str): The name of the dataset to fetch.
2427            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2428
2429        Returns:
2430            DatasetClient: The dataset with the given name.
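
            Example:
                A minimal sketch; the dataset name is a placeholder and must already
                exist in your Langfuse project:

                ```python
                dataset = langfuse.get_dataset("my-eval-dataset")

                for item in dataset.items:
                    print(item.input, item.expected_output)
                ```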
2431        """
2432        try:
2433            langfuse_logger.debug(f"Getting dataset {name}")
2434            dataset = self.api.datasets.get(dataset_name=name)
2435
2436            dataset_items = []
2437            page = 1
2438
2439            while True:
2440                new_items = self.api.dataset_items.list(
2441                    dataset_name=self._url_encode(name, is_url_param=True),
2442                    page=page,
2443                    limit=fetch_items_page_size,
2444                )
2445                dataset_items.extend(new_items.data)
2446
2447                if new_items.meta.total_pages <= page:
2448                    break
2449
2450                page += 1
2451
2452            items = [DatasetItemClient(i, langfuse=self) for i in dataset_items]
2453
2454            return DatasetClient(dataset, items=items)
2455
2456        except Error as e:
2457            handle_fern_exception(e)
2458            raise e
2459
2460    def run_experiment(
2461        self,
2462        *,
2463        name: str,
2464        run_name: Optional[str] = None,
2465        description: Optional[str] = None,
2466        data: ExperimentData,
2467        task: TaskFunction,
2468        evaluators: List[EvaluatorFunction] = [],
2469        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2470        run_evaluators: List[RunEvaluatorFunction] = [],
2471        max_concurrency: int = 50,
2472        metadata: Optional[Dict[str, str]] = None,
2473    ) -> ExperimentResult:
2474        """Run an experiment on a dataset with automatic tracing and evaluation.
2475
2476        This method executes a task function on each item in the provided dataset,
2477        automatically traces all executions with Langfuse for observability, runs
2478        item-level and run-level evaluators on the outputs, and returns comprehensive
2479        results with evaluation metrics.
2480
2481        The experiment system provides:
2482        - Automatic tracing of all task executions
2483        - Concurrent processing with configurable limits
2484        - Comprehensive error handling that isolates failures
2485        - Integration with Langfuse datasets for experiment tracking
2486        - Flexible evaluation framework supporting both sync and async evaluators
2487
2488        Args:
2489            name: Human-readable name for the experiment. Used for identification
2490                in the Langfuse UI.
2491            run_name: Optional exact name for the experiment run. If provided, this will be
2492                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2493                If not provided, this will default to the experiment name with an ISO timestamp appended.
2494            description: Optional description explaining the experiment's purpose,
2495                methodology, or expected outcomes.
2496            data: Array of data items to process. Can be either:
2497                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2498                - List of Langfuse DatasetItem objects from dataset.items
2499            task: Function that processes each data item and returns output.
2500                Must accept 'item' as keyword argument and can return sync or async results.
2501                The task function signature should be: task(*, item, **kwargs) -> Any
2502            evaluators: List of functions to evaluate each item's output individually.
2503                Each evaluator receives input, output, expected_output, and metadata.
2504                Can return a single Evaluation dict or a list of Evaluation dicts.
2505            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2506                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2507                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2508                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2509            run_evaluators: List of functions to evaluate the entire experiment run.
2510                Each run evaluator receives all item_results and can compute aggregate metrics.
2511                Useful for calculating averages, distributions, or cross-item comparisons.
2512            max_concurrency: Maximum number of concurrent task executions (default: 50).
2513                Controls the number of items processed simultaneously. Adjust based on
2514                API rate limits and system resources.
2515            metadata: Optional metadata dictionary to attach to all experiment traces.
2516                This metadata will be included in every trace created during the experiment.
2517                If `data` consists of Langfuse dataset items, the metadata is also attached to the dataset run.
2518
2519        Returns:
2520            ExperimentResult containing:
2521            - run_name: The experiment run name. This is equal to the dataset run name if the experiment ran on a Langfuse dataset.
2522            - item_results: List of results for each processed item with outputs and evaluations
2523            - run_evaluations: List of aggregate evaluation results for the entire run
2524            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2525            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2526
2527        Raises:
2528            ValueError: If required parameters are missing or invalid
2529            Exception: If experiment setup fails (individual item failures are handled gracefully)
2530
2531        Examples:
2532            Basic experiment with local data:
2533            ```python
2534            def summarize_text(*, item, **kwargs):
2535                return f"Summary: {item['input'][:50]}..."
2536
2537            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2538                return {
2539                    "name": "output_length",
2540                    "value": len(output),
2541                    "comment": f"Output contains {len(output)} characters"
2542                }
2543
2544            result = langfuse.run_experiment(
2545                name="Text Summarization Test",
2546                description="Evaluate summarization quality and length",
2547                data=[
2548                    {"input": "Long article text...", "expected_output": "Expected summary"},
2549                    {"input": "Another article...", "expected_output": "Another summary"}
2550                ],
2551                task=summarize_text,
2552                evaluators=[length_evaluator]
2553            )
2554
2555            print(f"Processed {len(result.item_results)} items")
2556            for item_result in result.item_results:
2557                print(f"Input: {item_result.item['input']}")
2558                print(f"Output: {item_result.output}")
2559                print(f"Evaluations: {item_result.evaluations}")
2560            ```
2561
2562            Advanced experiment with async task and multiple evaluators:
2563            ```python
2564            async def llm_task(*, item, **kwargs):
2565                # Simulate async LLM call
2566                response = await openai_client.chat.completions.create(
2567                    model="gpt-4",
2568                    messages=[{"role": "user", "content": item["input"]}]
2569                )
2570                return response.choices[0].message.content
2571
2572            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2573                if expected_output and expected_output.lower() in output.lower():
2574                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2575                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2576
2577            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2578                # Simulate toxicity check
2579                toxicity_score = check_toxicity(output)  # Your toxicity checker
2580                return {
2581                    "name": "toxicity",
2582                    "value": toxicity_score,
2583                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2584                }
2585
2586            def average_accuracy(*, item_results, **kwargs):
2587                accuracies = [
2588                    evaluation.value for result in item_results
2589                    for evaluation in result.evaluations
2590                    if evaluation.name == "accuracy"
2591                ]
2592                return {
2593                    "name": "average_accuracy",
2594                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2595                    "comment": f"Average accuracy across {len(accuracies)} items"
2596                }
2597
2598            result = langfuse.run_experiment(
2599                name="LLM Safety and Accuracy Test",
2600                description="Evaluate model accuracy and safety across diverse prompts",
2601                data=test_dataset,  # Your dataset items
2602                task=llm_task,
2603                evaluators=[accuracy_evaluator, toxicity_evaluator],
2604                run_evaluators=[average_accuracy],
2605                max_concurrency=5,  # Limit concurrent API calls
2606                metadata={"model": "gpt-4", "temperature": 0.7}
2607            )
2608            ```
2609
2610            Using with Langfuse datasets:
2611            ```python
2612            # Get dataset from Langfuse
2613            dataset = langfuse.get_dataset("my-eval-dataset")
2614
2615            result = dataset.run_experiment(
2616                name="Production Model Evaluation",
2617                description="Monthly evaluation of production model performance",
2618                task=my_production_task,
2619                evaluators=[accuracy_evaluator, latency_evaluator]
2620            )
2621
2622            # Results automatically linked to dataset in Langfuse UI
2623            print(f"View results: {result.dataset_run_url}")
2624            ```
2625
2626        Note:
2627            - Task and evaluator functions can be either synchronous or asynchronous
2628            - Individual item failures are logged but don't stop the experiment
2629            - All executions are automatically traced and visible in Langfuse UI
2630            - When using Langfuse datasets, results are automatically linked for easy comparison
2631            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2632            - Async execution is handled automatically with smart event loop detection
2633        """
2634        return cast(
2635            ExperimentResult,
2636            run_async_safely(
2637                self._run_experiment_async(
2638                    name=name,
2639                    run_name=self._create_experiment_run_name(
2640                        name=name, run_name=run_name
2641                    ),
2642                    description=description,
2643                    data=data,
2644                    task=task,
2645                    evaluators=evaluators or [],
2646                    composite_evaluator=composite_evaluator,
2647                    run_evaluators=run_evaluators or [],
2648                    max_concurrency=max_concurrency,
2649                    metadata=metadata,
2650                ),
2651            ),
2652        )
2653
2654    async def _run_experiment_async(
2655        self,
2656        *,
2657        name: str,
2658        run_name: str,
2659        description: Optional[str],
2660        data: ExperimentData,
2661        task: TaskFunction,
2662        evaluators: List[EvaluatorFunction],
2663        composite_evaluator: Optional[CompositeEvaluatorFunction],
2664        run_evaluators: List[RunEvaluatorFunction],
2665        max_concurrency: int,
2666        metadata: Optional[Dict[str, Any]] = None,
2667    ) -> ExperimentResult:
2668        langfuse_logger.debug(
2669            f"Starting experiment '{name}' run '{run_name}' with {len(data)} items"
2670        )
2671
2672        # Set up concurrency control
2673        semaphore = asyncio.Semaphore(max_concurrency)
2674
2675        # Process all items
2676        async def process_item(item: ExperimentItem) -> ExperimentItemResult:
2677            async with semaphore:
2678                return await self._process_experiment_item(
2679                    item,
2680                    task,
2681                    evaluators,
2682                    composite_evaluator,
2683                    name,
2684                    run_name,
2685                    description,
2686                    metadata,
2687                )
2688
2689        # Run all items concurrently
2690        tasks = [process_item(item) for item in data]
2691        item_results = await asyncio.gather(*tasks, return_exceptions=True)
2692
2693        # Filter out any exceptions and log errors
2694        valid_results: List[ExperimentItemResult] = []
2695        for i, result in enumerate(item_results):
2696            if isinstance(result, Exception):
2697                langfuse_logger.error(f"Item {i} failed: {result}")
2698            elif isinstance(result, ExperimentItemResult):
2699                valid_results.append(result)  # type: ignore
2700
2701        # Run experiment-level evaluators
2702        run_evaluations: List[Evaluation] = []
2703        for run_evaluator in run_evaluators:
2704            try:
2705                evaluations = await _run_evaluator(
2706                    run_evaluator, item_results=valid_results
2707                )
2708                run_evaluations.extend(evaluations)
2709            except Exception as e:
2710                langfuse_logger.error(f"Run evaluator failed: {e}")
2711
2712        # Generate dataset run URL if applicable
2713        dataset_run_id = valid_results[0].dataset_run_id if valid_results else None
2714        dataset_run_url = None
2715        if dataset_run_id and data:
2716            try:
2717                # Check if the first item has dataset_id (for DatasetItem objects)
2718                first_item = data[0]
2719                dataset_id = None
2720
2721                if hasattr(first_item, "dataset_id"):
2722                    dataset_id = getattr(first_item, "dataset_id", None)
2723
2724                if dataset_id:
2725                    project_id = self._get_project_id()
2726
2727                    if project_id:
2728                        dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}"
2729
2730            except Exception:
2731                pass  # URL generation is optional
2732
2733        # Store run-level evaluations as scores
2734        for evaluation in run_evaluations:
2735            try:
2736                if dataset_run_id:
2737                    self.create_score(
2738                        dataset_run_id=dataset_run_id,
2739                        name=evaluation.name or "<unknown>",
2740                        value=evaluation.value,  # type: ignore
2741                        comment=evaluation.comment,
2742                        metadata=evaluation.metadata,
2743                        data_type=evaluation.data_type,  # type: ignore
2744                        config_id=evaluation.config_id,
2745                    )
2746
2747            except Exception as e:
2748                langfuse_logger.error(f"Failed to store run evaluation: {e}")
2749
2750        # Flush scores and traces
2751        self.flush()
2752
2753        return ExperimentResult(
2754            name=name,
2755            run_name=run_name,
2756            description=description,
2757            item_results=valid_results,
2758            run_evaluations=run_evaluations,
2759            dataset_run_id=dataset_run_id,
2760            dataset_run_url=dataset_run_url,
2761        )
2762
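`_run_experiment_async` bounds parallelism with a semaphore and collects per-item failures instead of aborting the whole run. A minimal, self-contained sketch of that pattern, using a hypothetical `process` coroutine and item list that are not part of the SDK:

```python
import asyncio
from typing import List

async def run_bounded(items: List[str], max_concurrency: int = 5) -> List[str]:
    semaphore = asyncio.Semaphore(max_concurrency)

    async def process(item: str) -> str:
        async with semaphore:  # at most max_concurrency tasks execute the body at once
            await asyncio.sleep(0.01)  # stand-in for the real task/evaluator work
            return item.upper()

    # return_exceptions=True keeps one failing item from cancelling the rest
    results = await asyncio.gather(*(process(i) for i in items), return_exceptions=True)
    return [r for r in results if not isinstance(r, Exception)]

# asyncio.run(run_bounded(["a", "b", "c"]))
```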
2763    async def _process_experiment_item(
2764        self,
2765        item: ExperimentItem,
2766        task: Callable,
2767        evaluators: List[Callable],
2768        composite_evaluator: Optional[CompositeEvaluatorFunction],
2769        experiment_name: str,
2770        experiment_run_name: str,
2771        experiment_description: Optional[str],
2772        experiment_metadata: Optional[Dict[str, Any]] = None,
2773    ) -> ExperimentItemResult:
2774        span_name = "experiment-item-run"
2775
2776        with self.start_as_current_span(name=span_name) as span:
2777            try:
2778                input_data = (
2779                    item.get("input")
2780                    if isinstance(item, dict)
2781                    else getattr(item, "input", None)
2782                )
2783
2784                if input_data is None:
2785                    raise ValueError("Experiment Item is missing input. Skipping item.")
2786
2787                expected_output = (
2788                    item.get("expected_output")
2789                    if isinstance(item, dict)
2790                    else getattr(item, "expected_output", None)
2791                )
2792
2793                item_metadata = (
2794                    item.get("metadata")
2795                    if isinstance(item, dict)
2796                    else getattr(item, "metadata", None)
2797                )
2798
2799                final_observation_metadata = {
2800                    "experiment_name": experiment_name,
2801                    "experiment_run_name": experiment_run_name,
2802                    **(experiment_metadata or {}),
2803                }
2804
2805                trace_id = span.trace_id
2806                dataset_id = None
2807                dataset_item_id = None
2808                dataset_run_id = None
2809
2810                # Link to dataset run if this is a dataset item
2811                if hasattr(item, "id") and hasattr(item, "dataset_id"):
2812                    try:
2813                        # Use sync API to avoid event loop issues when run_async_safely
2814                        # creates multiple event loops across different threads
2815                        dataset_run_item = await asyncio.to_thread(
2816                            self.api.dataset_run_items.create,
2817                            request=CreateDatasetRunItemRequest(
2818                                runName=experiment_run_name,
2819                                runDescription=experiment_description,
2820                                metadata=experiment_metadata,
2821                                datasetItemId=item.id,  # type: ignore
2822                                traceId=trace_id,
2823                                observationId=span.id,
2824                            ),
2825                        )
2826
2827                        dataset_run_id = dataset_run_item.dataset_run_id
2828
2829                    except Exception as e:
2830                        langfuse_logger.error(f"Failed to create dataset run item: {e}")
2831
2832                if (
2833                    not isinstance(item, dict)
2834                    and hasattr(item, "dataset_id")
2835                    and hasattr(item, "id")
2836                ):
2837                    dataset_id = item.dataset_id
2838                    dataset_item_id = item.id
2839
2840                    final_observation_metadata.update(
2841                        {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id}
2842                    )
2843
2844                if isinstance(item_metadata, dict):
2845                    final_observation_metadata.update(item_metadata)
2846
2847                experiment_id = dataset_run_id or self._create_observation_id()
2848                experiment_item_id = (
2849                    dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16]
2850                )
2851                span._otel_span.set_attributes(
2852                    {
2853                        k: v
2854                        for k, v in {
2855                            LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT,
2856                            LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description,
2857                            LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize(
2858                                expected_output
2859                            ),
2860                        }.items()
2861                        if v is not None
2862                    }
2863                )
2864
2865                with _propagate_attributes(
2866                    experiment=PropagatedExperimentAttributes(
2867                        experiment_id=experiment_id,
2868                        experiment_name=experiment_run_name,
2869                        experiment_metadata=_serialize(experiment_metadata),
2870                        experiment_dataset_id=dataset_id,
2871                        experiment_item_id=experiment_item_id,
2872                        experiment_item_metadata=_serialize(item_metadata),
2873                        experiment_item_root_observation_id=span.id,
2874                    )
2875                ):
2876                    output = await _run_task(task, item)
2877
2878                span.update(
2879                    input=input_data,
2880                    output=output,
2881                    metadata=final_observation_metadata,
2882                )
2883
2884            except Exception as e:
2885                span.update(
2886                    output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
2887                )
2888                raise e
2889
2890        # Run evaluators
2891        evaluations = []
2892
2893        for evaluator in evaluators:
2894            try:
2895                eval_metadata: Optional[Dict[str, Any]] = None
2896
2897                if isinstance(item, dict):
2898                    eval_metadata = item.get("metadata")
2899                elif hasattr(item, "metadata"):
2900                    eval_metadata = item.metadata
2901
2902                eval_results = await _run_evaluator(
2903                    evaluator,
2904                    input=input_data,
2905                    output=output,
2906                    expected_output=expected_output,
2907                    metadata=eval_metadata,
2908                )
2909                evaluations.extend(eval_results)
2910
2911                # Store evaluations as scores
2912                for evaluation in eval_results:
2913                    self.create_score(
2914                        trace_id=trace_id,
2915                        observation_id=span.id,
2916                        name=evaluation.name,
2917                        value=evaluation.value,  # type: ignore
2918                        comment=evaluation.comment,
2919                        metadata=evaluation.metadata,
2920                        config_id=evaluation.config_id,
2921                        data_type=evaluation.data_type,  # type: ignore
2922                    )
2923
2924            except Exception as e:
2925                langfuse_logger.error(f"Evaluator failed: {e}")
2926
2927        # Run composite evaluator if provided and we have evaluations
2928        if composite_evaluator and evaluations:
2929            try:
2930                composite_eval_metadata: Optional[Dict[str, Any]] = None
2931                if isinstance(item, dict):
2932                    composite_eval_metadata = item.get("metadata")
2933                elif hasattr(item, "metadata"):
2934                    composite_eval_metadata = item.metadata
2935
2936                result = composite_evaluator(
2937                    input=input_data,
2938                    output=output,
2939                    expected_output=expected_output,
2940                    metadata=composite_eval_metadata,
2941                    evaluations=evaluations,
2942                )
2943
2944                # Handle async composite evaluators
2945                if asyncio.iscoroutine(result):
2946                    result = await result
2947
2948                # Normalize to list
2949                composite_evals: List[Evaluation] = []
2950                if isinstance(result, (dict, Evaluation)):
2951                    composite_evals = [result]  # type: ignore
2952                elif isinstance(result, list):
2953                    composite_evals = result  # type: ignore
2954
2955                # Store composite evaluations as scores and add to evaluations list
2956                for composite_evaluation in composite_evals:
2957                    self.create_score(
2958                        trace_id=trace_id,
2959                        observation_id=span.id,
2960                        name=composite_evaluation.name,
2961                        value=composite_evaluation.value,  # type: ignore
2962                        comment=composite_evaluation.comment,
2963                        metadata=composite_evaluation.metadata,
2964                        config_id=composite_evaluation.config_id,
2965                        data_type=composite_evaluation.data_type,  # type: ignore
2966                    )
2967                    evaluations.append(composite_evaluation)
2968
2969            except Exception as e:
2970                langfuse_logger.error(f"Composite evaluator failed: {e}")
2971
2972        return ExperimentItemResult(
2973            item=item,
2974            output=output,
2975            evaluations=evaluations,
2976            trace_id=trace_id,
2977            dataset_run_id=dataset_run_id,
2978        )
2979
2980    def _create_experiment_run_name(
2981        self, *, name: Optional[str] = None, run_name: Optional[str] = None
2982    ) -> str:
2983        if run_name:
2984            return run_name
2985
2986        iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z")
2987
2988        return f"{name} - {iso_timestamp}"
2989
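When no explicit `run_name` is passed, the helper above derives one from the experiment name plus the current UTC timestamp. Illustratively, using `datetime` as a stand-in for the SDK's internal timestamp helper:

```python
from datetime import datetime, timezone

name = "Capital Cities Eval"
iso_timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
print(f"{name} - {iso_timestamp}")  # e.g. "Capital Cities Eval - 2025-06-01T12:34:56.789012Z"
```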
2990    def run_batched_evaluation(
2991        self,
2992        *,
2993        scope: Literal["traces", "observations"],
2994        mapper: MapperFunction,
2995        filter: Optional[str] = None,
2996        fetch_batch_size: int = 50,
2997        max_items: Optional[int] = None,
2998        max_retries: int = 3,
2999        evaluators: List[EvaluatorFunction],
3000        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
3001        max_concurrency: int = 50,
3002        metadata: Optional[Dict[str, Any]] = None,
3003        resume_from: Optional[BatchEvaluationResumeToken] = None,
3004        verbose: bool = False,
3005    ) -> BatchEvaluationResult:
3006        """Fetch traces or observations and run evaluations on each item.
3007
3008        This method provides a powerful way to evaluate existing data in Langfuse at scale.
3009        It fetches items based on filters, transforms them using a mapper function, runs
3010        evaluators on each item, and creates scores that are linked back to the original
3011        entities. This is ideal for:
3012
3013        - Running evaluations on production traces after deployment
3014        - Backtesting new evaluation metrics on historical data
3015        - Batch scoring of observations for quality monitoring
3016        - Periodic evaluation runs on recent data
3017
3018        The method uses a streaming/pipeline approach to process items in batches, making
3019        it memory-efficient for large datasets. It includes comprehensive error handling,
3020        retry logic, and resume capability for long-running evaluations.
3021
3022        Args:
3023            scope: The type of items to evaluate. Must be one of:
3024                - "traces": Evaluate complete traces with all their observations
3025                - "observations": Evaluate individual observations (spans, generations, events)
3026            mapper: Function that transforms API response objects into evaluator inputs.
3027                Receives a trace/observation object and returns an EvaluatorInputs
3028                instance with input, output, expected_output, and metadata fields.
3029                Can be sync or async.
3030            evaluators: List of evaluation functions to run on each item. Each evaluator
3031                receives the mapped inputs and returns Evaluation object(s). Evaluator
3032                failures are logged but don't stop the batch evaluation.
3033            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
3034                - '{"tags": ["production"]}'
3035                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
3036                Default: None (fetches all items).
3037            fetch_batch_size: Number of items to fetch per API call and hold in memory.
3038                Larger values may be faster but use more memory. Default: 50.
3039            max_items: Maximum total number of items to process. If None, processes all
3040                items matching the filter. Useful for testing or limiting evaluation runs.
3041                Default: None (process all).
3042            max_concurrency: Maximum number of items to evaluate concurrently. Controls
3043                parallelism and resource usage. Default: 50.
3044            composite_evaluator: Optional function that creates a composite score from
3045                item-level evaluations. Receives the original item and its evaluations,
3046                returns a single Evaluation. Useful for weighted averages or combined metrics.
3047                Default: None.
3048            metadata: Optional metadata dict to add to all created scores. Useful for
3049                tracking evaluation runs, versions, or other context. Default: None.
3050            max_retries: Maximum number of retry attempts for failed batch fetches.
3051                Uses exponential backoff (1s, 2s, 4s). Default: 3.
3052            verbose: If True, logs progress information to console. Useful for monitoring
3053                long-running evaluations. Default: False.
3054            resume_from: Optional resume token from a previous incomplete run. Allows
3055                continuing evaluation after interruption or failure. Default: None.
3056
3057
3058        Returns:
3059            BatchEvaluationResult containing:
3060                - total_items_fetched: Number of items fetched from API
3061                - total_items_processed: Number of items successfully evaluated
3062                - total_items_failed: Number of items that failed evaluation
3063                - total_scores_created: Scores created by item-level evaluators
3064                - total_composite_scores_created: Scores created by composite evaluator
3065                - total_evaluations_failed: Individual evaluator failures
3066                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3067                - resume_token: Token for resuming if incomplete (None if completed)
3068                - completed: True if all items processed
3069                - duration_seconds: Total execution time
3070                - failed_item_ids: IDs of items that failed
3071                - error_summary: Error types and counts
3072                - has_more_items: True if max_items reached but more exist
3073
3074        Raises:
3075            ValueError: If invalid scope is provided.
3076
3077        Examples:
3078            Basic trace evaluation:
3079            ```python
3080            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3081
3082            client = Langfuse()
3083
3084            # Define mapper to extract fields from traces
3085            def trace_mapper(trace):
3086                return EvaluatorInputs(
3087                    input=trace.input,
3088                    output=trace.output,
3089                    expected_output=None,
3090                    metadata={"trace_id": trace.id}
3091                )
3092
3093            # Define evaluator
3094            def length_evaluator(*, input, output, expected_output, metadata):
3095                return Evaluation(
3096                    name="output_length",
3097                    value=len(output) if output else 0
3098                )
3099
3100            # Run batch evaluation
3101            result = client.run_batched_evaluation(
3102                scope="traces",
3103                mapper=trace_mapper,
3104                evaluators=[length_evaluator],
3105                filter='{"tags": ["production"]}',
3106                max_items=1000,
3107                verbose=True
3108            )
3109
3110            print(f"Processed {result.total_items_processed} traces")
3111            print(f"Created {result.total_scores_created} scores")
3112            ```
3113
3114            Evaluation with composite scorer:
3115            ```python
3116            def accuracy_evaluator(*, input, output, expected_output, metadata):
3117                # ... evaluation logic
3118                return Evaluation(name="accuracy", value=0.85)
3119
3120            def relevance_evaluator(*, input, output, expected_output, metadata):
3121                # ... evaluation logic
3122                return Evaluation(name="relevance", value=0.92)
3123
3124            def composite_evaluator(*, item, evaluations):
3125                # Weighted average of evaluations
3126                weights = {"accuracy": 0.6, "relevance": 0.4}
3127                total = sum(
3128                    e.value * weights.get(e.name, 0)
3129                    for e in evaluations
3130                    if isinstance(e.value, (int, float))
3131                )
3132                return Evaluation(
3133                    name="composite_score",
3134                    value=total,
3135                    comment=f"Weighted average of {len(evaluations)} metrics"
3136                )
3137
3138            result = client.run_batched_evaluation(
3139                scope="traces",
3140                mapper=trace_mapper,
3141                evaluators=[accuracy_evaluator, relevance_evaluator],
3142                composite_evaluator=composite_evaluator,
3143                filter='{"user_id": "important_user"}',
3144                verbose=True
3145            )
3146            ```
3147
3148            Handling incomplete runs with resume:
3149            ```python
3150            # Initial run that may fail or timeout
3151            result = client.run_batched_evaluation(
3152                scope="observations",
3153                mapper=obs_mapper,
3154                evaluators=[my_evaluator],
3155                max_items=10000,
3156                verbose=True
3157            )
3158
3159            # Check if incomplete
3160            if not result.completed and result.resume_token:
3161                print(f"Processed {result.resume_token.items_processed} items before interruption")
3162
3163                # Resume from where it left off
3164                result = client.run_batched_evaluation(
3165                    scope="observations",
3166                    mapper=obs_mapper,
3167                    evaluators=[my_evaluator],
3168                    resume_from=result.resume_token,
3169                    verbose=True
3170                )
3171
3172            print(f"Total items processed: {result.total_items_processed}")
3173            ```
3174
3175            Monitoring evaluator performance:
3176            ```python
3177            result = client.run_batched_evaluation(...)
3178
3179            for stats in result.evaluator_stats:
3180                success_rate = stats.successful_runs / stats.total_runs
3181                print(f"{stats.name}:")
3182                print(f"  Success rate: {success_rate:.1%}")
3183                print(f"  Scores created: {stats.total_scores_created}")
3184
3185                if stats.failed_runs > 0:
3186                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3187            ```
3188
3189        Note:
3190            - Evaluator failures are logged but don't stop the batch evaluation
3191            - Individual item failures are tracked but don't stop processing
3192            - Fetch failures are retried with exponential backoff
3193            - All scores are automatically flushed to Langfuse at the end
3194            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3195        """
3196        runner = BatchEvaluationRunner(self)
3197
3198        return cast(
3199            BatchEvaluationResult,
3200            run_async_safely(
3201                runner.run_async(
3202                    scope=scope,
3203                    mapper=mapper,
3204                    evaluators=evaluators,
3205                    filter=filter,
3206                    fetch_batch_size=fetch_batch_size,
3207                    max_items=max_items,
3208                    max_concurrency=max_concurrency,
3209                    composite_evaluator=composite_evaluator,
3210                    metadata=metadata,
3211                    max_retries=max_retries,
3212                    verbose=verbose,
3213                    resume_from=resume_from,
3214                )
3215            ),
3216        )
3217
3218    def auth_check(self) -> bool:
3219        """Check if the provided credentials (public and secret key) are valid.
3220
3221        Raises:
3222            Exception: If no projects were found for the provided credentials.
3223
3224        Note:
3225            This method is blocking. Using it in production code is discouraged.
3226        """
3227        try:
3228            projects = self.api.projects.get()
3229            langfuse_logger.debug(
3230                f"Auth check successful, found {len(projects.data)} projects"
3231            )
3232            if len(projects.data) == 0:
3233                raise Exception(
3234                    "Auth check failed, no project found for the keys provided."
3235                )
3236            return True
3237
3238        except AttributeError as e:
3239            langfuse_logger.warning(
3240                f"Auth check failed: Client not properly initialized. Error: {e}"
3241            )
3242            return False
3243
3244        except Error as e:
3245            handle_fern_exception(e)
3246            raise e
3247
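Because `auth_check` is blocking, it is typically called once at startup rather than on a request path. A minimal sketch, assuming credentials are provided via environment variables:

```python
from langfuse import Langfuse

langfuse = Langfuse()  # reads LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY from the environment

if not langfuse.auth_check():
    raise RuntimeError("Langfuse credentials are invalid or the client failed to initialize")
```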
3248    def create_dataset(
3249        self,
3250        *,
3251        name: str,
3252        description: Optional[str] = None,
3253        metadata: Optional[Any] = None,
3254    ) -> Dataset:
3255        """Create a dataset with the given name on Langfuse.
3256
3257        Args:
3258            name: Name of the dataset to create.
3259            description: Description of the dataset. Defaults to None.
3260            metadata: Additional metadata. Defaults to None.
3261
3262        Returns:
3263            Dataset: The created dataset as returned by the Langfuse API.
3264        """
3265        try:
3266            body = CreateDatasetRequest(
3267                name=name, description=description, metadata=metadata
3268            )
3269            langfuse_logger.debug(f"Creating dataset {body}")
3270
3271            return self.api.datasets.create(request=body)
3272
3273        except Error as e:
3274            handle_fern_exception(e)
3275            raise e
3276
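A short usage sketch for `create_dataset`; the dataset name and metadata below are illustrative, and the client is assumed to be configured via environment variables:

```python
from langfuse import Langfuse

langfuse = Langfuse()

dataset = langfuse.create_dataset(
    name="capital_cities",
    description="Country-capital pairs for regression testing",
    metadata={"owner": "eval-team"},  # arbitrary, optional metadata
)
print(dataset.name)
```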
3277    def create_dataset_item(
3278        self,
3279        *,
3280        dataset_name: str,
3281        input: Optional[Any] = None,
3282        expected_output: Optional[Any] = None,
3283        metadata: Optional[Any] = None,
3284        source_trace_id: Optional[str] = None,
3285        source_observation_id: Optional[str] = None,
3286        status: Optional[DatasetStatus] = None,
3287        id: Optional[str] = None,
3288    ) -> DatasetItem:
3289        """Create a dataset item.
3290
3291        Upserts if an item with id already exists.
3292
3293        Args:
3294            dataset_name: Name of the dataset in which the dataset item should be created.
3295            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3296            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3297            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3298            source_trace_id: Id of the source trace. Defaults to None.
3299            source_observation_id: Id of the source observation. Defaults to None.
3300            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3301            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3302
3303        Returns:
3304            DatasetItem: The created dataset item as returned by the Langfuse API.
3305
3306        Example:
3307            ```python
3308            from langfuse import Langfuse
3309
3310            langfuse = Langfuse()
3311
3312            # Uploading items to the Langfuse dataset named "capital_cities"
3313            langfuse.create_dataset_item(
3314                dataset_name="capital_cities",
3315                input={"input": {"country": "Italy"}},
3316                expected_output={"expected_output": "Rome"},
3317                metadata={"foo": "bar"}
3318            )
3319            ```
3320        """
3321        try:
3322            body = CreateDatasetItemRequest(
3323                datasetName=dataset_name,
3324                input=input,
3325                expectedOutput=expected_output,
3326                metadata=metadata,
3327                sourceTraceId=source_trace_id,
3328                sourceObservationId=source_observation_id,
3329                status=status,
3330                id=id,
3331            )
3332            langfuse_logger.debug(f"Creating dataset item {body}")
3333            return self.api.dataset_items.create(request=body)
3334        except Error as e:
3335            handle_fern_exception(e)
3336            raise e
3337
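Beyond the plain upload shown in the docstring above, items can reference the production trace they were derived from via `source_trace_id` / `source_observation_id`. A sketch with placeholder IDs, reusing the `langfuse` client from the previous example:

```python
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"country": "France"},
    expected_output="Paris",
    source_trace_id="trace-id-from-production",            # placeholder id
    source_observation_id="observation-id-within-trace",   # placeholder id
    id="capital-france",  # providing an id makes repeated calls upsert instead of duplicate
)
```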
3338    def resolve_media_references(
3339        self,
3340        *,
3341        obj: Any,
3342        resolve_with: Literal["base64_data_uri"],
3343        max_depth: int = 10,
3344        content_fetch_timeout_seconds: int = 5,
3345    ) -> Any:
3346        """Replace media reference strings in an object with base64 data URIs.
3347
3348        This method recursively traverses an object (up to max_depth) looking for media reference strings
3349        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3350        the provided Langfuse client and replaces the reference string with a base64 data URI.
3351
3352        If fetching media content fails for a reference string, a warning is logged and the reference
3353        string is left unchanged.
3354
3355        Args:
3356            obj: The object to process. Can be a primitive value, array, or nested object.
3357                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3358            resolve_with: The representation of the media content to replace the media reference string with.
3359                Currently only "base64_data_uri" is supported.
3360            max_depth: int: The maximum depth to traverse the object. Default is 10.
3361            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3362
3363        Returns:
3364            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3365            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3366
3367        Example:
3368            obj = {
3369                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3370                "nested": {
3371                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3372                }
3373            }
3374
3375            result = langfuse.resolve_media_references(obj=obj, resolve_with="base64_data_uri")
3376
3377            # Result:
3378            # {
3379            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3380            #     "nested": {
3381            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3382            #     }
3383            # }
3384        """
3385        return LangfuseMedia.resolve_media_references(
3386            langfuse_client=self,
3387            obj=obj,
3388            resolve_with=resolve_with,
3389            max_depth=max_depth,
3390            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3391        )
3392
3393    @overload
3394    def get_prompt(
3395        self,
3396        name: str,
3397        *,
3398        version: Optional[int] = None,
3399        label: Optional[str] = None,
3400        type: Literal["chat"],
3401        cache_ttl_seconds: Optional[int] = None,
3402        fallback: Optional[List[ChatMessageDict]] = None,
3403        max_retries: Optional[int] = None,
3404        fetch_timeout_seconds: Optional[int] = None,
3405    ) -> ChatPromptClient: ...
3406
3407    @overload
3408    def get_prompt(
3409        self,
3410        name: str,
3411        *,
3412        version: Optional[int] = None,
3413        label: Optional[str] = None,
3414        type: Literal["text"] = "text",
3415        cache_ttl_seconds: Optional[int] = None,
3416        fallback: Optional[str] = None,
3417        max_retries: Optional[int] = None,
3418        fetch_timeout_seconds: Optional[int] = None,
3419    ) -> TextPromptClient: ...
3420
3421    def get_prompt(
3422        self,
3423        name: str,
3424        *,
3425        version: Optional[int] = None,
3426        label: Optional[str] = None,
3427        type: Literal["chat", "text"] = "text",
3428        cache_ttl_seconds: Optional[int] = None,
3429        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3430        max_retries: Optional[int] = None,
3431        fetch_timeout_seconds: Optional[int] = None,
3432    ) -> PromptClient:
3433        """Get a prompt.
3434
3435        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3436        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3437        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3438        return the expired prompt as a fallback.
3439
3440        Args:
3441            name (str): The name of the prompt to retrieve.
3442
3443        Keyword Args:
3444            version (Optional[int]): The version of the prompt to retrieve. If neither version nor label is specified, the prompt with the `production` label is returned. Specify either version or label, not both.
3445            label (Optional[str]): The label of the prompt to retrieve. If neither version nor label is specified, the prompt with the `production` label is returned. Specify either version or label, not both.
3446            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3447            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3448            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3449            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3450            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3451            fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the timeout set on the SDK, which is 5 seconds by default.
3452
3453        Returns:
3454            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
3455            - TextPromptClient, if type argument is 'text'.
3456            - ChatPromptClient, if type argument is 'chat'.
3457
3458        Raises:
3459            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3460            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3461        """
3462        if self._resources is None:
3463            raise Error(
3464                "SDK is not correctly initialized. Check the init logs for more details."
3465            )
3466        if version is not None and label is not None:
3467            raise ValueError("Cannot specify both version and label at the same time.")
3468
3469        if not name:
3470            raise ValueError("Prompt name cannot be empty.")
3471
3472        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3473        bounded_max_retries = self._get_bounded_max_retries(
3474            max_retries, default_max_retries=2, max_retries_upper_bound=4
3475        )
3476
3477        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3478        cached_prompt = self._resources.prompt_cache.get(cache_key)
3479
3480        if cached_prompt is None or cache_ttl_seconds == 0:
3481            langfuse_logger.debug(
3482                f"Prompt '{cache_key}' not found in cache or caching disabled."
3483            )
3484            try:
3485                return self._fetch_prompt_and_update_cache(
3486                    name,
3487                    version=version,
3488                    label=label,
3489                    ttl_seconds=cache_ttl_seconds,
3490                    max_retries=bounded_max_retries,
3491                    fetch_timeout_seconds=fetch_timeout_seconds,
3492                )
3493            except Exception as e:
3494                if fallback:
3495                    langfuse_logger.warning(
3496                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3497                    )
3498
3499                    fallback_client_args: Dict[str, Any] = {
3500                        "name": name,
3501                        "prompt": fallback,
3502                        "type": type,
3503                        "version": version or 0,
3504                        "config": {},
3505                        "labels": [label] if label else [],
3506                        "tags": [],
3507                    }
3508
3509                    if type == "text":
3510                        return TextPromptClient(
3511                            prompt=Prompt_Text(**fallback_client_args),
3512                            is_fallback=True,
3513                        )
3514
3515                    if type == "chat":
3516                        return ChatPromptClient(
3517                            prompt=Prompt_Chat(**fallback_client_args),
3518                            is_fallback=True,
3519                        )
3520
3521                raise e
3522
3523        if cached_prompt.is_expired():
3524            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3525            try:
3526                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3527                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3528
3529                def refresh_task() -> None:
3530                    self._fetch_prompt_and_update_cache(
3531                        name,
3532                        version=version,
3533                        label=label,
3534                        ttl_seconds=cache_ttl_seconds,
3535                        max_retries=bounded_max_retries,
3536                        fetch_timeout_seconds=fetch_timeout_seconds,
3537                    )
3538
3539                self._resources.prompt_cache.add_refresh_prompt_task(
3540                    cache_key,
3541                    refresh_task,
3542                )
3543                langfuse_logger.debug(
3544                    f"Returning stale prompt '{cache_key}' from cache."
3545                )
3546                # return stale prompt
3547                return cached_prompt.value
3548
3549            except Exception as e:
3550                langfuse_logger.warning(
3551                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3552                )
3553                # creation of refresh prompt task failed, return stale prompt
3554                return cached_prompt.value
3555
3556        return cached_prompt.value
3557
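A usage sketch for `get_prompt`: fetch the production text prompt, fall back to an inline template if the fetch fails, and compile it with variables. The prompt name and variable are illustrative; `compile` substitutes the double-curly-brace placeholders per Langfuse prompt formatting:

```python
from langfuse import Langfuse

langfuse = Langfuse()

prompt = langfuse.get_prompt(
    "movie-critic",                                              # illustrative prompt name
    fallback="Critique the movie {{movie}} in one paragraph.",   # used if the fetch fails
    cache_ttl_seconds=300,
)

compiled = prompt.compile(movie="Dune")
print(compiled)
```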
3558    def _fetch_prompt_and_update_cache(
3559        self,
3560        name: str,
3561        *,
3562        version: Optional[int] = None,
3563        label: Optional[str] = None,
3564        ttl_seconds: Optional[int] = None,
3565        max_retries: int,
3566        fetch_timeout_seconds: Optional[int],
3567    ) -> PromptClient:
3568        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3569        langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...")
3570
3571        try:
3572
3573            @backoff.on_exception(
3574                backoff.constant, Exception, max_tries=max_retries + 1, logger=None
3575            )
3576            def fetch_prompts() -> Any:
3577                return self.api.prompts.get(
3578                    self._url_encode(name),
3579                    version=version,
3580                    label=label,
3581                    request_options={
3582                        "timeout_in_seconds": fetch_timeout_seconds,
3583                    }
3584                    if fetch_timeout_seconds is not None
3585                    else None,
3586                )
3587
3588            prompt_response = fetch_prompts()
3589
3590            prompt: PromptClient
3591            if prompt_response.type == "chat":
3592                prompt = ChatPromptClient(prompt_response)
3593            else:
3594                prompt = TextPromptClient(prompt_response)
3595
3596            if self._resources is not None:
3597                self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds)
3598
3599            return prompt
3600
3601        except Exception as e:
3602            langfuse_logger.error(
3603                f"Error while fetching prompt '{cache_key}': {str(e)}"
3604            )
3605            raise e
3606
3607    def _get_bounded_max_retries(
3608        self,
3609        max_retries: Optional[int],
3610        *,
3611        default_max_retries: int = 2,
3612        max_retries_upper_bound: int = 4,
3613    ) -> int:
3614        if max_retries is None:
3615            return default_max_retries
3616
3617        bounded_max_retries = min(
3618            max(max_retries, 0),
3619            max_retries_upper_bound,
3620        )
3621
3622        return bounded_max_retries
3623
3624    @overload
3625    def create_prompt(
3626        self,
3627        *,
3628        name: str,
3629        prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]],
3630        labels: List[str] = [],
3631        tags: Optional[List[str]] = None,
3632        type: Optional[Literal["chat"]],
3633        config: Optional[Any] = None,
3634        commit_message: Optional[str] = None,
3635    ) -> ChatPromptClient: ...
3636
3637    @overload
3638    def create_prompt(
3639        self,
3640        *,
3641        name: str,
3642        prompt: str,
3643        labels: List[str] = [],
3644        tags: Optional[List[str]] = None,
3645        type: Optional[Literal["text"]] = "text",
3646        config: Optional[Any] = None,
3647        commit_message: Optional[str] = None,
3648    ) -> TextPromptClient: ...
3649
3650    def create_prompt(
3651        self,
3652        *,
3653        name: str,
3654        prompt: Union[
3655            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3656        ],
3657        labels: List[str] = [],
3658        tags: Optional[List[str]] = None,
3659        type: Optional[Literal["chat", "text"]] = "text",
3660        config: Optional[Any] = None,
3661        commit_message: Optional[str] = None,
3662    ) -> PromptClient:
3663        """Create a new prompt in Langfuse.
3664
3665        Keyword Args:
3666            name : The name of the prompt to be created.
3667            prompt : The content of the prompt to be created.
3668            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3669            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3670            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3671            config: Additional structured data to be saved with the prompt. Defaults to None.
3672            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3673            commit_message: Optional string describing the change.
3674
3675        Returns:
3676            TextPromptClient: The prompt if type argument is 'text'.
3677            ChatPromptClient: The prompt if type argument is 'chat'.
3678        """
3679        try:
3680            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3681
3682            if type == "chat":
3683                if not isinstance(prompt, list):
3684                    raise ValueError(
3685                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3686                    )
3687                request: Union[CreatePromptRequest_Chat, CreatePromptRequest_Text] = (
3688                    CreatePromptRequest_Chat(
3689                        name=name,
3690                        prompt=cast(Any, prompt),
3691                        labels=labels,
3692                        tags=tags,
3693                        config=config or {},
3694                        commitMessage=commit_message,
3695                        type="chat",
3696                    )
3697                )
3698                server_prompt = self.api.prompts.create(request=request)
3699
3700                if self._resources is not None:
3701                    self._resources.prompt_cache.invalidate(name)
3702
3703                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3704
3705            if not isinstance(prompt, str):
3706                raise ValueError("For 'text' type, 'prompt' must be a string.")
3707
3708            request = CreatePromptRequest_Text(
3709                name=name,
3710                prompt=prompt,
3711                labels=labels,
3712                tags=tags,
3713                config=config or {},
3714                commitMessage=commit_message,
3715                type="text",
3716            )
3717
3718            server_prompt = self.api.prompts.create(request=request)
3719
3720            if self._resources is not None:
3721                self._resources.prompt_cache.invalidate(name)
3722
3723            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3724
3725        except Error as e:
3726            handle_fern_exception(e)
3727            raise e
3728
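Sketches of creating a text prompt and a chat prompt; names, contents, and config values are illustrative. Adding the `production` label makes the new version the default-served one:

```python
# Text prompt
langfuse.create_prompt(
    name="movie-critic",
    type="text",
    prompt="Critique the movie {{movie}} in one paragraph.",
    labels=["production"],
    config={"model": "gpt-4", "temperature": 0.7},  # arbitrary structured config
)

# Chat prompt
langfuse.create_prompt(
    name="movie-critic-chat",
    type="chat",
    prompt=[
        {"role": "system", "content": "You are a film critic."},
        {"role": "user", "content": "Critique {{movie}} in one paragraph."},
    ],
    labels=["production"],
    commit_message="Initial chat version",
)
```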
3729    def update_prompt(
3730        self,
3731        *,
3732        name: str,
3733        version: int,
3734        new_labels: List[str] = [],
3735    ) -> Any:
3736        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
3737
3738        Args:
3739            name (str): The name of the prompt to update.
3740            version (int): The version number of the prompt to update.
3741            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3742
3743        Returns:
3744            Prompt: The updated prompt from the Langfuse API.
3745
3746        """
3747        updated_prompt = self.api.prompt_version.update(
3748            name=self._url_encode(name),
3749            version=version,
3750            new_labels=new_labels,
3751        )
3752
3753        if self._resources is not None:
3754            self._resources.prompt_cache.invalidate(name)
3755
3756        return updated_prompt
3757
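A sketch of promoting an existing prompt version by assigning it new labels; the version number is illustrative, and the returned object is the updated prompt from the API:

```python
updated = langfuse.update_prompt(
    name="movie-critic",
    version=3,                    # illustrative version number
    new_labels=["production"],    # "latest" is reserved and managed by Langfuse
)
```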
3758    def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str:
3759        # httpx ≥ 0.28 does its own WHATWG-compliant quoting (e.g. encodes bare
3760        # “%”, “?”, “#”, “|”, … in query/path parts). Re-quoting here would
3761        # double-encode, so we skip when the value is about to be sent straight
3762        # to httpx (`is_url_param=True`) and the installed version is ≥ 0.28.
3763        if is_url_param and Version(httpx.__version__) >= Version("0.28.0"):
3764            return url
3765
3766        # urllib.parse.quote does not escape slashes "/" by default; we need to
3767        # pass safe="" to force escaping of slashes.
3768        # This is necessary for prompts in prompt folders.
3769        return urllib.parse.quote(url, safe="")
3770
3771    def clear_prompt_cache(self) -> None:
3772        """Clear the entire prompt cache, removing all cached prompts.
3773
3774        This method is useful when you want to force a complete refresh of all
3775        cached prompts, for example after major updates or when you need to
3776        ensure the latest versions are fetched from the server.
3777        """
3778        if self._resources is not None:
3779            self._resources.prompt_cache.clear()

Main client for Langfuse tracing and platform features.

This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.

The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.

Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.

Attributes:
  • api: Synchronous API client for Langfuse backend communication
  • async_api: Asynchronous API client for Langfuse backend communication
  • _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
  • public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
  • secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
  • base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
  • host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
  • timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
  • httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
  • debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
  • tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
  • flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
  • flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
  • environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
  • release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
  • media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
  • sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
  • mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
  • blocked_instrumentation_scopes (Optional[List[str]]): List of instrumentation scope names to block from being exported to Langfuse. Spans from these scopes will be filtered out before being sent to the API. Useful for filtering out spans from specific libraries or frameworks. For exported spans, you can see the instrumentation scope name in the span metadata in Langfuse (metadata.scope.name)
  • additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well.
  • tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. Setting this can be useful to keep Langfuse tracing disconnected from other OpenTelemetry-span-emitting libraries. Note: to track active spans, the context is still shared between TracerProviders, which may lead to broken trace trees.
Example:
from langfuse import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    base_url="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_span(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
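
A complementary sketch, assuming credentials are supplied via the documented environment variables instead of constructor arguments (the key values below are placeholders):

import os
from langfuse import Langfuse

# Placeholder credentials; normally set in the deployment environment
os.environ["LANGFUSE_PUBLIC_KEY"] = "your-public-key"
os.environ["LANGFUSE_SECRET_KEY"] = "your-secret-key"
os.environ["LANGFUSE_BASE_URL"] = "https://cloud.langfuse.com"

# All constructor arguments are optional once the env vars are set;
# sample_rate, flush_at and flush_interval are tuned here purely for illustration
langfuse = Langfuse(sample_rate=0.5, flush_at=100, flush_interval=2.0)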
Langfuse( *, public_key: Optional[str] = None, secret_key: Optional[str] = None, base_url: Optional[str] = None, host: Optional[str] = None, timeout: Optional[int] = None, httpx_client: Optional[httpx.Client] = None, debug: bool = False, tracing_enabled: Optional[bool] = True, flush_at: Optional[int] = None, flush_interval: Optional[float] = None, environment: Optional[str] = None, release: Optional[str] = None, media_upload_thread_count: Optional[int] = None, sample_rate: Optional[float] = None, mask: Optional[langfuse.types.MaskFunction] = None, blocked_instrumentation_scopes: Optional[List[str]] = None, additional_headers: Optional[Dict[str, str]] = None, tracer_provider: Optional[opentelemetry.sdk.trace.TracerProvider] = None)
207    def __init__(
208        self,
209        *,
210        public_key: Optional[str] = None,
211        secret_key: Optional[str] = None,
212        base_url: Optional[str] = None,
213        host: Optional[str] = None,
214        timeout: Optional[int] = None,
215        httpx_client: Optional[httpx.Client] = None,
216        debug: bool = False,
217        tracing_enabled: Optional[bool] = True,
218        flush_at: Optional[int] = None,
219        flush_interval: Optional[float] = None,
220        environment: Optional[str] = None,
221        release: Optional[str] = None,
222        media_upload_thread_count: Optional[int] = None,
223        sample_rate: Optional[float] = None,
224        mask: Optional[MaskFunction] = None,
225        blocked_instrumentation_scopes: Optional[List[str]] = None,
226        additional_headers: Optional[Dict[str, str]] = None,
227        tracer_provider: Optional[TracerProvider] = None,
228    ):
229        self._base_url = (
230            base_url
231            or os.environ.get(LANGFUSE_BASE_URL)
232            or host
233            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
234        )
235        self._environment = environment or cast(
236            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
237        )
238        self._project_id: Optional[str] = None
239        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
240        if not 0.0 <= sample_rate <= 1.0:
241            raise ValueError(
242                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
243            )
244
245        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
246
247        self._tracing_enabled = (
248            tracing_enabled
249            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
250        )
251        if not self._tracing_enabled:
252            langfuse_logger.info(
253                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
254            )
255
256        debug = (
257            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
258        )
259        if debug:
260            logging.basicConfig(
261                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
262            )
263            langfuse_logger.setLevel(logging.DEBUG)
264
265        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
266        if public_key is None:
267            langfuse_logger.warning(
268                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
269                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
270            )
271            self._otel_tracer = otel_trace_api.NoOpTracer()
272            return
273
274        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
275        if secret_key is None:
276            langfuse_logger.warning(
277                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
278                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
279            )
280            self._otel_tracer = otel_trace_api.NoOpTracer()
281            return
282
283        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
284            langfuse_logger.warning(
285                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
286            )
287
288        # Initialize api and tracer if requirements are met
289        self._resources = LangfuseResourceManager(
290            public_key=public_key,
291            secret_key=secret_key,
292            base_url=self._base_url,
293            timeout=timeout,
294            environment=self._environment,
295            release=release,
296            flush_at=flush_at,
297            flush_interval=flush_interval,
298            httpx_client=httpx_client,
299            media_upload_thread_count=media_upload_thread_count,
300            sample_rate=sample_rate,
301            mask=mask,
302            tracing_enabled=self._tracing_enabled,
303            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
304            additional_headers=additional_headers,
305            tracer_provider=tracer_provider,
306        )
307        self._mask = self._resources.mask
308
309        self._otel_tracer = (
310            self._resources.tracer
311            if self._tracing_enabled and self._resources.tracer is not None
312            else otel_trace_api.NoOpTracer()
313        )
314        self.api = self._resources.api
315        self.async_api = self._resources.async_api
api
async_api
def start_span( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseSpan:
317    def start_span(
318        self,
319        *,
320        trace_context: Optional[TraceContext] = None,
321        name: str,
322        input: Optional[Any] = None,
323        output: Optional[Any] = None,
324        metadata: Optional[Any] = None,
325        version: Optional[str] = None,
326        level: Optional[SpanLevel] = None,
327        status_message: Optional[str] = None,
328    ) -> LangfuseSpan:
329        """Create a new span for tracing a unit of work.
330
331        This method creates a new span but does not set it as the current span in the
332        context. To create and use a span within a context, use start_as_current_span().
333
334        The created span will be the child of the current span in the context.
335
336        Args:
337            trace_context: Optional context for connecting to an existing trace
338            name: Name of the span (e.g., function or operation name)
339            input: Input data for the operation (can be any JSON-serializable object)
340            output: Output data from the operation (can be any JSON-serializable object)
341            metadata: Additional metadata to associate with the span
342            version: Version identifier for the code or component
343            level: Importance level of the span (info, warning, error)
344            status_message: Optional status message for the span
345
346        Returns:
347            A LangfuseSpan object that must be ended with .end() when the operation completes
348
349        Example:
350            ```python
351            span = langfuse.start_span(name="process-data")
352            try:
353                # Do work
354                span.update(output="result")
355            finally:
356                span.end()
357            ```
358        """
359        return self.start_observation(
360            trace_context=trace_context,
361            name=name,
362            as_type="span",
363            input=input,
364            output=output,
365            metadata=metadata,
366            version=version,
367            level=level,
368            status_message=status_message,
369        )

Create a new span for tracing a unit of work.

This method creates a new span but does not set it as the current span in the context. To create and use a span within a context, use start_as_current_span().

The created span will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the span
Returns:

A LangfuseSpan object that must be ended with .end() when the operation completes

Example:
span = langfuse.start_span(name="process-data")
try:
    # Do work
    span.update(output="result")
finally:
    span.end()
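
A further sketch, assuming the span should be attached to an existing trace via trace_context (the trace_id and parent_span_id values are placeholders):

span = langfuse.start_span(
    name="process-data",
    trace_context={
        "trace_id": "0123456789abcdef0123456789abcdef",  # placeholder 32-char hex trace id
        "parent_span_id": "0123456789abcdef",            # placeholder 16-char hex span id
    },
    input={"records": 42},
)
try:
    # Do work
    span.update(output="result")
finally:
    span.end()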
def start_as_current_span( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, end_on_exit: Optional[bool] = None) -> opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan]:
371    def start_as_current_span(
372        self,
373        *,
374        trace_context: Optional[TraceContext] = None,
375        name: str,
376        input: Optional[Any] = None,
377        output: Optional[Any] = None,
378        metadata: Optional[Any] = None,
379        version: Optional[str] = None,
380        level: Optional[SpanLevel] = None,
381        status_message: Optional[str] = None,
382        end_on_exit: Optional[bool] = None,
383    ) -> _AgnosticContextManager[LangfuseSpan]:
384        """Create a new span and set it as the current span in a context manager.
385
386        This method creates a new span and sets it as the current span within a context
387        manager. Use this method with a 'with' statement to automatically handle span
388        lifecycle within a code block.
389
390        The created span will be the child of the current span in the context.
391
392        Args:
393            trace_context: Optional context for connecting to an existing trace
394            name: Name of the span (e.g., function or operation name)
395            input: Input data for the operation (can be any JSON-serializable object)
396            output: Output data from the operation (can be any JSON-serializable object)
397            metadata: Additional metadata to associate with the span
398            version: Version identifier for the code or component
399            level: Importance level of the span (info, warning, error)
400            status_message: Optional status message for the span
401            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
402
403        Returns:
404            A context manager that yields a LangfuseSpan
405
406        Example:
407            ```python
408            with langfuse.start_as_current_span(name="process-query") as span:
409                # Do work
410                result = process_data()
411                span.update(output=result)
412
413                # Create a child span automatically
414                with span.start_as_current_span(name="sub-operation") as child_span:
415                    # Do sub-operation work
416                    child_span.update(output="sub-result")
417            ```
418        """
419        return self.start_as_current_observation(
420            trace_context=trace_context,
421            name=name,
422            as_type="span",
423            input=input,
424            output=output,
425            metadata=metadata,
426            version=version,
427            level=level,
428            status_message=status_message,
429            end_on_exit=end_on_exit,
430        )

Create a new span and set it as the current span in a context manager.

This method creates a new span and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle span lifecycle within a code block.

The created span will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the span
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
Returns:

A context manager that yields a LangfuseSpan

Example:
with langfuse.start_as_current_span(name="process-query") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_span(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")
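
When the span should outlive the with block, end_on_exit=False defers ending it; a brief sketch (start_background_work is a hypothetical helper), in which case the span must be ended manually:

with langfuse.start_as_current_span(name="long-running-job", end_on_exit=False) as span:
    start_background_work()  # hypothetical helper that finishes later

# ... once the background work completes
span.update(output="done")
span.end()  # required when end_on_exit=False, see the note above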
def start_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> Union[LangfuseSpan, LangfuseGeneration, LangfuseAgent, LangfuseTool, LangfuseChain, LangfuseRetriever, LangfuseEvaluator, LangfuseEmbedding, LangfuseGuardrail]:
579    def start_observation(
580        self,
581        *,
582        trace_context: Optional[TraceContext] = None,
583        name: str,
584        as_type: ObservationTypeLiteralNoEvent = "span",
585        input: Optional[Any] = None,
586        output: Optional[Any] = None,
587        metadata: Optional[Any] = None,
588        version: Optional[str] = None,
589        level: Optional[SpanLevel] = None,
590        status_message: Optional[str] = None,
591        completion_start_time: Optional[datetime] = None,
592        model: Optional[str] = None,
593        model_parameters: Optional[Dict[str, MapValue]] = None,
594        usage_details: Optional[Dict[str, int]] = None,
595        cost_details: Optional[Dict[str, float]] = None,
596        prompt: Optional[PromptClient] = None,
597    ) -> Union[
598        LangfuseSpan,
599        LangfuseGeneration,
600        LangfuseAgent,
601        LangfuseTool,
602        LangfuseChain,
603        LangfuseRetriever,
604        LangfuseEvaluator,
605        LangfuseEmbedding,
606        LangfuseGuardrail,
607    ]:
608        """Create a new observation of the specified type.
609
610        This method creates a new observation but does not set it as the current span in the
611        context. To create and use an observation within a context, use start_as_current_observation().
612
613        Args:
614            trace_context: Optional context for connecting to an existing trace
615            name: Name of the observation
616            as_type: Type of observation to create (defaults to "span")
617            input: Input data for the operation
618            output: Output data from the operation
619            metadata: Additional metadata to associate with the observation
620            version: Version identifier for the code or component
621            level: Importance level of the observation
622            status_message: Optional status message for the observation
623            completion_start_time: When the model started generating (for generation types)
624            model: Name/identifier of the AI model used (for generation types)
625            model_parameters: Parameters used for the model (for generation types)
626            usage_details: Token usage information (for generation types)
627            cost_details: Cost information (for generation types)
628            prompt: Associated prompt template (for generation types)
629
630        Returns:
631            An observation object of the appropriate type that must be ended with .end()
632        """
633        if trace_context:
634            trace_id = trace_context.get("trace_id", None)
635            parent_span_id = trace_context.get("parent_span_id", None)
636
637            if trace_id:
638                remote_parent_span = self._create_remote_parent_span(
639                    trace_id=trace_id, parent_span_id=parent_span_id
640                )
641
642                with otel_trace_api.use_span(
643                    cast(otel_trace_api.Span, remote_parent_span)
644                ):
645                    otel_span = self._otel_tracer.start_span(name=name)
646                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
647
648                    return self._create_observation_from_otel_span(
649                        otel_span=otel_span,
650                        as_type=as_type,
651                        input=input,
652                        output=output,
653                        metadata=metadata,
654                        version=version,
655                        level=level,
656                        status_message=status_message,
657                        completion_start_time=completion_start_time,
658                        model=model,
659                        model_parameters=model_parameters,
660                        usage_details=usage_details,
661                        cost_details=cost_details,
662                        prompt=prompt,
663                    )
664
665        otel_span = self._otel_tracer.start_span(name=name)
666
667        return self._create_observation_from_otel_span(
668            otel_span=otel_span,
669            as_type=as_type,
670            input=input,
671            output=output,
672            metadata=metadata,
673            version=version,
674            level=level,
675            status_message=status_message,
676            completion_start_time=completion_start_time,
677            model=model,
678            model_parameters=model_parameters,
679            usage_details=usage_details,
680            cost_details=cost_details,
681            prompt=prompt,
682        )

Create a new observation of the specified type.

This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation
  • status_message: Optional status message for the observation
  • completion_start_time: When the model started generating (for generation types)
  • model: Name/identifier of the AI model used (for generation types)
  • model_parameters: Parameters used for the model (for generation types)
  • usage_details: Token usage information (for generation types)
  • cost_details: Cost information (for generation types)
  • prompt: Associated prompt template (for generation types)
Returns:

An observation object of the appropriate type that must be ended with .end()
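
Example (a sketch, assuming a configured client; the tool output is a placeholder):

# Create a tool observation without entering a context manager
tool = langfuse.start_observation(
    name="web-search",
    as_type="tool",
    input={"query": "latest AI news"},
)
try:
    results = {"hits": 3}  # placeholder for the real tool output
    tool.update(output=results)
finally:
    tool.end()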

def start_generation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> LangfuseGeneration:
754    def start_generation(
755        self,
756        *,
757        trace_context: Optional[TraceContext] = None,
758        name: str,
759        input: Optional[Any] = None,
760        output: Optional[Any] = None,
761        metadata: Optional[Any] = None,
762        version: Optional[str] = None,
763        level: Optional[SpanLevel] = None,
764        status_message: Optional[str] = None,
765        completion_start_time: Optional[datetime] = None,
766        model: Optional[str] = None,
767        model_parameters: Optional[Dict[str, MapValue]] = None,
768        usage_details: Optional[Dict[str, int]] = None,
769        cost_details: Optional[Dict[str, float]] = None,
770        prompt: Optional[PromptClient] = None,
771    ) -> LangfuseGeneration:
772        """Create a new generation span for model generations.
773
774        DEPRECATED: This method is deprecated and will be removed in a future version.
775        Use start_observation(as_type='generation') instead.
776
777        This method creates a specialized span for tracking model generations.
778        It includes additional fields specific to model generations such as model name,
779        token usage, and cost details.
780
781        The created generation span will be the child of the current span in the context.
782
783        Args:
784            trace_context: Optional context for connecting to an existing trace
785            name: Name of the generation operation
786            input: Input data for the model (e.g., prompts)
787            output: Output from the model (e.g., completions)
788            metadata: Additional metadata to associate with the generation
789            version: Version identifier for the model or component
790            level: Importance level of the generation (info, warning, error)
791            status_message: Optional status message for the generation
792            completion_start_time: When the model started generating the response
793            model: Name/identifier of the AI model used (e.g., "gpt-4")
794            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
795            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
796            cost_details: Cost information for the model call
797            prompt: Associated prompt template from Langfuse prompt management
798
799        Returns:
800            A LangfuseGeneration object that must be ended with .end() when complete
801
802        Example:
803            ```python
804            generation = langfuse.start_generation(
805                name="answer-generation",
806                model="gpt-4",
807                input={"prompt": "Explain quantum computing"},
808                model_parameters={"temperature": 0.7}
809            )
810            try:
811                # Call model API
812                response = llm.generate(...)
813
814                generation.update(
815                    output=response.text,
816                    usage_details={
817                        "prompt_tokens": response.usage.prompt_tokens,
818                        "completion_tokens": response.usage.completion_tokens
819                    }
820                )
821            finally:
822                generation.end()
823            ```
824        """
825        warnings.warn(
826            "start_generation is deprecated and will be removed in a future version. "
827            "Use start_observation(as_type='generation') instead.",
828            DeprecationWarning,
829            stacklevel=2,
830        )
831        return self.start_observation(
832            trace_context=trace_context,
833            name=name,
834            as_type="generation",
835            input=input,
836            output=output,
837            metadata=metadata,
838            version=version,
839            level=level,
840            status_message=status_message,
841            completion_start_time=completion_start_time,
842            model=model,
843            model_parameters=model_parameters,
844            usage_details=usage_details,
845            cost_details=cost_details,
846            prompt=prompt,
847        )

Create a new generation span for model generations.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_observation(as_type='generation') instead.

This method creates a specialized span for tracking model generations. It includes additional fields specific to model generations such as model name, token usage, and cost details.

The created generation span will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the generation operation
  • input: Input data for the model (e.g., prompts)
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A LangfuseGeneration object that must be ended with .end() when complete

Example:
generation = langfuse.start_generation(
    name="answer-generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"},
    model_parameters={"temperature": 0.7}
)
try:
    # Call model API
    response = llm.generate(...)

    generation.update(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
finally:
    generation.end()
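
As noted above, the non-deprecated equivalent uses start_observation with as_type="generation"; a sketch of the same call:

generation = langfuse.start_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"},
    model_parameters={"temperature": 0.7},
)
try:
    response = llm.generate(...)  # same hypothetical model call as in the example above
    generation.update(output=response.text)
finally:
    generation.end()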
def start_as_current_generation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, end_on_exit: Optional[bool] = None) -> opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration]:
849    def start_as_current_generation(
850        self,
851        *,
852        trace_context: Optional[TraceContext] = None,
853        name: str,
854        input: Optional[Any] = None,
855        output: Optional[Any] = None,
856        metadata: Optional[Any] = None,
857        version: Optional[str] = None,
858        level: Optional[SpanLevel] = None,
859        status_message: Optional[str] = None,
860        completion_start_time: Optional[datetime] = None,
861        model: Optional[str] = None,
862        model_parameters: Optional[Dict[str, MapValue]] = None,
863        usage_details: Optional[Dict[str, int]] = None,
864        cost_details: Optional[Dict[str, float]] = None,
865        prompt: Optional[PromptClient] = None,
866        end_on_exit: Optional[bool] = None,
867    ) -> _AgnosticContextManager[LangfuseGeneration]:
868        """Create a new generation span and set it as the current span in a context manager.
869
870        DEPRECATED: This method is deprecated and will be removed in a future version.
871        Use start_as_current_observation(as_type='generation') instead.
872
873        This method creates a specialized span for model generations and sets it as the
874        current span within a context manager. Use this method with a 'with' statement to
875        automatically handle the generation span lifecycle within a code block.
876
877        The created generation span will be the child of the current span in the context.
878
879        Args:
880            trace_context: Optional context for connecting to an existing trace
881            name: Name of the generation operation
882            input: Input data for the model (e.g., prompts)
883            output: Output from the model (e.g., completions)
884            metadata: Additional metadata to associate with the generation
885            version: Version identifier for the model or component
886            level: Importance level of the generation (info, warning, error)
887            status_message: Optional status message for the generation
888            completion_start_time: When the model started generating the response
889            model: Name/identifier of the AI model used (e.g., "gpt-4")
890            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
891            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
892            cost_details: Cost information for the model call
893            prompt: Associated prompt template from Langfuse prompt management
894            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
895
896        Returns:
897            A context manager that yields a LangfuseGeneration
898
899        Example:
900            ```python
901            with langfuse.start_as_current_generation(
902                name="answer-generation",
903                model="gpt-4",
904                input={"prompt": "Explain quantum computing"}
905            ) as generation:
906                # Call model API
907                response = llm.generate(...)
908
909                # Update with results
910                generation.update(
911                    output=response.text,
912                    usage_details={
913                        "prompt_tokens": response.usage.prompt_tokens,
914                        "completion_tokens": response.usage.completion_tokens
915                    }
916                )
917            ```
918        """
919        warnings.warn(
920            "start_as_current_generation is deprecated and will be removed in a future version. "
921            "Use start_as_current_observation(as_type='generation') instead.",
922            DeprecationWarning,
923            stacklevel=2,
924        )
925        return self.start_as_current_observation(
926            trace_context=trace_context,
927            name=name,
928            as_type="generation",
929            input=input,
930            output=output,
931            metadata=metadata,
932            version=version,
933            level=level,
934            status_message=status_message,
935            completion_start_time=completion_start_time,
936            model=model,
937            model_parameters=model_parameters,
938            usage_details=usage_details,
939            cost_details=cost_details,
940            prompt=prompt,
941            end_on_exit=end_on_exit,
942        )

Create a new generation span and set it as the current span in a context manager.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_as_current_observation(as_type='generation') instead.

This method creates a specialized span for model generations and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the generation span lifecycle within a code block.

The created generation span will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the generation operation
  • input: Input data for the model (e.g., prompts)
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
Returns:

A context manager that yields a LangfuseGeneration

Example:
with langfuse.start_as_current_generation(
    name="answer-generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"}
) as generation:
    # Call model API
    response = llm.generate(...)

    # Update with results
    generation.update(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
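
As noted above, the non-deprecated equivalent is start_as_current_observation with as_type="generation"; a sketch of the same flow:

with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"},
) as generation:
    response = llm.generate(...)  # same hypothetical model call as in the example above
    generation.update(output=response.text)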
def start_as_current_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, end_on_exit: Optional[bool] = None) -> Union[opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration], opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan], opentelemetry.util._decorator._AgnosticContextManager[LangfuseAgent], opentelemetry.util._decorator._AgnosticContextManager[LangfuseTool], opentelemetry.util._decorator._AgnosticContextManager[LangfuseChain], opentelemetry.util._decorator._AgnosticContextManager[LangfuseRetriever], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEvaluator], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEmbedding], opentelemetry.util._decorator._AgnosticContextManager[LangfuseGuardrail]]:
1100    def start_as_current_observation(
1101        self,
1102        *,
1103        trace_context: Optional[TraceContext] = None,
1104        name: str,
1105        as_type: ObservationTypeLiteralNoEvent = "span",
1106        input: Optional[Any] = None,
1107        output: Optional[Any] = None,
1108        metadata: Optional[Any] = None,
1109        version: Optional[str] = None,
1110        level: Optional[SpanLevel] = None,
1111        status_message: Optional[str] = None,
1112        completion_start_time: Optional[datetime] = None,
1113        model: Optional[str] = None,
1114        model_parameters: Optional[Dict[str, MapValue]] = None,
1115        usage_details: Optional[Dict[str, int]] = None,
1116        cost_details: Optional[Dict[str, float]] = None,
1117        prompt: Optional[PromptClient] = None,
1118        end_on_exit: Optional[bool] = None,
1119    ) -> Union[
1120        _AgnosticContextManager[LangfuseGeneration],
1121        _AgnosticContextManager[LangfuseSpan],
1122        _AgnosticContextManager[LangfuseAgent],
1123        _AgnosticContextManager[LangfuseTool],
1124        _AgnosticContextManager[LangfuseChain],
1125        _AgnosticContextManager[LangfuseRetriever],
1126        _AgnosticContextManager[LangfuseEvaluator],
1127        _AgnosticContextManager[LangfuseEmbedding],
1128        _AgnosticContextManager[LangfuseGuardrail],
1129    ]:
1130        """Create a new observation and set it as the current span in a context manager.
1131
1132        This method creates a new observation of the specified type and sets it as the
1133        current span within a context manager. Use this method with a 'with' statement to
1134        automatically handle the observation lifecycle within a code block.
1135
1136        The created observation will be the child of the current span in the context.
1137
1138        Args:
1139            trace_context: Optional context for connecting to an existing trace
1140            name: Name of the observation (e.g., function or operation name)
1141            as_type: Type of observation to create (defaults to "span")
1142            input: Input data for the operation (can be any JSON-serializable object)
1143            output: Output data from the operation (can be any JSON-serializable object)
1144            metadata: Additional metadata to associate with the observation
1145            version: Version identifier for the code or component
1146            level: Importance level of the observation (info, warning, error)
1147            status_message: Optional status message for the observation
1148            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
1149
1150            The following parameters are available when as_type is: "generation" or "embedding".
1151            completion_start_time: When the model started generating the response
1152            model: Name/identifier of the AI model used (e.g., "gpt-4")
1153            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1154            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1155            cost_details: Cost information for the model call
1156            prompt: Associated prompt template from Langfuse prompt management
1157
1158        Returns:
1159            A context manager that yields the appropriate observation type based on as_type
1160
1161        Example:
1162            ```python
1163            # Create a span
1164            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
1165                # Do work
1166                result = process_data()
1167                span.update(output=result)
1168
1169                # Create a child span automatically
1170                with span.start_as_current_span(name="sub-operation") as child_span:
1171                    # Do sub-operation work
1172                    child_span.update(output="sub-result")
1173
1174            # Create a tool observation
1175            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
1176                # Do tool work
1177                results = search_web(query)
1178                tool.update(output=results)
1179
1180            # Create a generation observation
1181            with langfuse.start_as_current_observation(
1182                name="answer-generation",
1183                as_type="generation",
1184                model="gpt-4"
1185            ) as generation:
1186                # Generate answer
1187                response = llm.generate(...)
1188                generation.update(output=response)
1189            ```
1190        """
1191        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
1192            if trace_context:
1193                trace_id = trace_context.get("trace_id", None)
1194                parent_span_id = trace_context.get("parent_span_id", None)
1195
1196                if trace_id:
1197                    remote_parent_span = self._create_remote_parent_span(
1198                        trace_id=trace_id, parent_span_id=parent_span_id
1199                    )
1200
1201                    return cast(
1202                        Union[
1203                            _AgnosticContextManager[LangfuseGeneration],
1204                            _AgnosticContextManager[LangfuseEmbedding],
1205                        ],
1206                        self._create_span_with_parent_context(
1207                            as_type=as_type,
1208                            name=name,
1209                            remote_parent_span=remote_parent_span,
1210                            parent=None,
1211                            end_on_exit=end_on_exit,
1212                            input=input,
1213                            output=output,
1214                            metadata=metadata,
1215                            version=version,
1216                            level=level,
1217                            status_message=status_message,
1218                            completion_start_time=completion_start_time,
1219                            model=model,
1220                            model_parameters=model_parameters,
1221                            usage_details=usage_details,
1222                            cost_details=cost_details,
1223                            prompt=prompt,
1224                        ),
1225                    )
1226
1227            return cast(
1228                Union[
1229                    _AgnosticContextManager[LangfuseGeneration],
1230                    _AgnosticContextManager[LangfuseEmbedding],
1231                ],
1232                self._start_as_current_otel_span_with_processed_media(
1233                    as_type=as_type,
1234                    name=name,
1235                    end_on_exit=end_on_exit,
1236                    input=input,
1237                    output=output,
1238                    metadata=metadata,
1239                    version=version,
1240                    level=level,
1241                    status_message=status_message,
1242                    completion_start_time=completion_start_time,
1243                    model=model,
1244                    model_parameters=model_parameters,
1245                    usage_details=usage_details,
1246                    cost_details=cost_details,
1247                    prompt=prompt,
1248                ),
1249            )
1250
1251        if as_type in get_observation_types_list(ObservationTypeSpanLike):
1252            if trace_context:
1253                trace_id = trace_context.get("trace_id", None)
1254                parent_span_id = trace_context.get("parent_span_id", None)
1255
1256                if trace_id:
1257                    remote_parent_span = self._create_remote_parent_span(
1258                        trace_id=trace_id, parent_span_id=parent_span_id
1259                    )
1260
1261                    return cast(
1262                        Union[
1263                            _AgnosticContextManager[LangfuseSpan],
1264                            _AgnosticContextManager[LangfuseAgent],
1265                            _AgnosticContextManager[LangfuseTool],
1266                            _AgnosticContextManager[LangfuseChain],
1267                            _AgnosticContextManager[LangfuseRetriever],
1268                            _AgnosticContextManager[LangfuseEvaluator],
1269                            _AgnosticContextManager[LangfuseGuardrail],
1270                        ],
1271                        self._create_span_with_parent_context(
1272                            as_type=as_type,
1273                            name=name,
1274                            remote_parent_span=remote_parent_span,
1275                            parent=None,
1276                            end_on_exit=end_on_exit,
1277                            input=input,
1278                            output=output,
1279                            metadata=metadata,
1280                            version=version,
1281                            level=level,
1282                            status_message=status_message,
1283                        ),
1284                    )
1285
1286            return cast(
1287                Union[
1288                    _AgnosticContextManager[LangfuseSpan],
1289                    _AgnosticContextManager[LangfuseAgent],
1290                    _AgnosticContextManager[LangfuseTool],
1291                    _AgnosticContextManager[LangfuseChain],
1292                    _AgnosticContextManager[LangfuseRetriever],
1293                    _AgnosticContextManager[LangfuseEvaluator],
1294                    _AgnosticContextManager[LangfuseGuardrail],
1295                ],
1296                self._start_as_current_otel_span_with_processed_media(
1297                    as_type=as_type,
1298                    name=name,
1299                    end_on_exit=end_on_exit,
1300                    input=input,
1301                    output=output,
1302                    metadata=metadata,
1303                    version=version,
1304                    level=level,
1305                    status_message=status_message,
1306                ),
1307            )
1308
1309        # This should never be reached since all valid types are handled above
1310        langfuse_logger.warning(
1311            f"Unknown observation type: {as_type}, falling back to span"
1312        )
1313        return self._start_as_current_otel_span_with_processed_media(
1314            as_type="span",
1315            name=name,
1316            end_on_exit=end_on_exit,
1317            input=input,
1318            output=output,
1319            metadata=metadata,
1320            version=version,
1321            level=level,
1322            status_message=status_message,
1323        )

Create a new observation and set it as the current span in a context manager.

This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.

The created observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation (e.g., function or operation name)
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the observation
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
  The following parameters apply only when as_type is "generation" or "embedding":
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A context manager that yields the appropriate observation type based on as_type

Example:
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_span(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
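
A further sketch for an "embedding" observation, which accepts the generation-specific parameters listed above (embed_texts and the model name are illustrative placeholders):

with langfuse.start_as_current_observation(
    name="embed-documents",
    as_type="embedding",
    model="text-embedding-3-small",          # placeholder model name
    input={"texts": ["doc one", "doc two"]},
) as embedding:
    vectors = embed_texts(["doc one", "doc two"])  # hypothetical embedding call
    embedding.update(
        output={"dimensions": len(vectors[0])},
        usage_details={"prompt_tokens": 12},
    )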
def update_current_generation( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> None:
1484    def update_current_generation(
1485        self,
1486        *,
1487        name: Optional[str] = None,
1488        input: Optional[Any] = None,
1489        output: Optional[Any] = None,
1490        metadata: Optional[Any] = None,
1491        version: Optional[str] = None,
1492        level: Optional[SpanLevel] = None,
1493        status_message: Optional[str] = None,
1494        completion_start_time: Optional[datetime] = None,
1495        model: Optional[str] = None,
1496        model_parameters: Optional[Dict[str, MapValue]] = None,
1497        usage_details: Optional[Dict[str, int]] = None,
1498        cost_details: Optional[Dict[str, float]] = None,
1499        prompt: Optional[PromptClient] = None,
1500    ) -> None:
1501        """Update the current active generation span with new information.
1502
1503        This method updates the current generation span in the active context with
1504        additional information. It's useful for adding output, usage stats, or other
1505        details that become available during or after model generation.
1506
1507        Args:
1508            name: The generation name
1509            input: Updated input data for the model
1510            output: Output from the model (e.g., completions)
1511            metadata: Additional metadata to associate with the generation
1512            version: Version identifier for the model or component
1513            level: Importance level of the generation (info, warning, error)
1514            status_message: Optional status message for the generation
1515            completion_start_time: When the model started generating the response
1516            model: Name/identifier of the AI model used (e.g., "gpt-4")
1517            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1518            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1519            cost_details: Cost information for the model call
1520            prompt: Associated prompt template from Langfuse prompt management
1521
1522        Example:
1523            ```python
1524            with langfuse.start_as_current_generation(name="answer-query") as generation:
1525                # Initial setup and API call
1526                response = llm.generate(...)
1527
1528                # Update with results that weren't available at creation time
1529                langfuse.update_current_generation(
1530                    output=response.text,
1531                    usage_details={
1532                        "prompt_tokens": response.usage.prompt_tokens,
1533                        "completion_tokens": response.usage.completion_tokens
1534                    }
1535                )
1536            ```
1537        """
1538        if not self._tracing_enabled:
1539            langfuse_logger.debug(
1540                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1541            )
1542            return
1543
1544        current_otel_span = self._get_current_otel_span()
1545
1546        if current_otel_span is not None:
1547            generation = LangfuseGeneration(
1548                otel_span=current_otel_span, langfuse_client=self
1549            )
1550
1551            if name:
1552                current_otel_span.update_name(name)
1553
1554            generation.update(
1555                input=input,
1556                output=output,
1557                metadata=metadata,
1558                version=version,
1559                level=level,
1560                status_message=status_message,
1561                completion_start_time=completion_start_time,
1562                model=model,
1563                model_parameters=model_parameters,
1564                usage_details=usage_details,
1565                cost_details=cost_details,
1566                prompt=prompt,
1567            )

Update the current active generation span with new information.

This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.

Arguments:
  • name: The generation name
  • input: Updated input data for the model
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
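
The same update pattern works from inside functions decorated with `@observe`; a minimal sketch, assuming `@observe(as_type="generation")` opens the enclosing generation and `llm` is a placeholder client:

```python
from langfuse import observe, get_client

langfuse = get_client()

@observe(as_type="generation")
def answer_query(question: str) -> str:
    # The decorator opens a generation observation; update it once results exist.
    response = llm.generate(question)  # placeholder LLM call

    langfuse.update_current_generation(
        model="gpt-4",  # example value
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens,
        },
    )
    return response.text
```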
def update_current_span( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> None:
1569    def update_current_span(
1570        self,
1571        *,
1572        name: Optional[str] = None,
1573        input: Optional[Any] = None,
1574        output: Optional[Any] = None,
1575        metadata: Optional[Any] = None,
1576        version: Optional[str] = None,
1577        level: Optional[SpanLevel] = None,
1578        status_message: Optional[str] = None,
1579    ) -> None:
1580        """Update the current active span with new information.
1581
1582        This method updates the current span in the active context with
1583        additional information. It's useful for adding outputs or metadata
1584        that become available during execution.
1585
1586        Args:
1587            name: The span name
1588            input: Updated input data for the operation
1589            output: Output data from the operation
1590            metadata: Additional metadata to associate with the span
1591            version: Version identifier for the code or component
1592            level: Importance level of the span (info, warning, error)
1593            status_message: Optional status message for the span
1594
1595        Example:
1596            ```python
1597            with langfuse.start_as_current_span(name="process-data") as span:
1598                # Initial processing
1599                result = process_first_part()
1600
1601                # Update with intermediate results
1602                langfuse.update_current_span(metadata={"intermediate_result": result})
1603
1604                # Continue processing
1605                final_result = process_second_part(result)
1606
1607                # Final update
1608                langfuse.update_current_span(output=final_result)
1609            ```
1610        """
1611        if not self._tracing_enabled:
1612            langfuse_logger.debug(
1613                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1614            )
1615            return
1616
1617        current_otel_span = self._get_current_otel_span()
1618
1619        if current_otel_span is not None:
1620            span = LangfuseSpan(
1621                otel_span=current_otel_span,
1622                langfuse_client=self,
1623                environment=self._environment,
1624            )
1625
1626            if name:
1627                current_otel_span.update_name(name)
1628
1629            span.update(
1630                input=input,
1631                output=output,
1632                metadata=metadata,
1633                version=version,
1634                level=level,
1635                status_message=status_message,
1636            )

Update the current active span with new information.

This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.

Arguments:
  • name: The span name
  • input: Updated input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the span
Example:
with langfuse.start_as_current_span(name="process-data") as span:
    # Initial processing
    result = process_first_part()

    # Update with intermediate results
    langfuse.update_current_span(metadata={"intermediate_result": result})

    # Continue processing
    final_result = process_second_part(result)

    # Final update
    langfuse.update_current_span(output=final_result)
def update_current_trace( self, *, name: Optional[str] = None, user_id: Optional[str] = None, session_id: Optional[str] = None, version: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, tags: Optional[List[str]] = None, public: Optional[bool] = None) -> None:
1638    def update_current_trace(
1639        self,
1640        *,
1641        name: Optional[str] = None,
1642        user_id: Optional[str] = None,
1643        session_id: Optional[str] = None,
1644        version: Optional[str] = None,
1645        input: Optional[Any] = None,
1646        output: Optional[Any] = None,
1647        metadata: Optional[Any] = None,
1648        tags: Optional[List[str]] = None,
1649        public: Optional[bool] = None,
1650    ) -> None:
1651        """Update the current trace with additional information.
1652
1653        Args:
1654            name: Updated name for the Langfuse trace
1655            user_id: ID of the user who initiated the Langfuse trace
1656            session_id: Session identifier for grouping related Langfuse traces
1657            version: Version identifier for the application or service
1658            input: Input data for the overall Langfuse trace
1659            output: Output data from the overall Langfuse trace
1660            metadata: Additional metadata to associate with the Langfuse trace
1661            tags: List of tags to categorize the Langfuse trace
1662            public: Whether the Langfuse trace should be publicly accessible
1663
1664        See Also:
1665            :func:`langfuse.propagate_attributes`: Recommended replacement
1666        """
1667        if not self._tracing_enabled:
1668            langfuse_logger.debug(
1669                "Operation skipped: update_current_trace - Tracing is disabled or client is in no-op mode."
1670            )
1671            return
1672
1673        current_otel_span = self._get_current_otel_span()
1674
1675        if current_otel_span is not None and current_otel_span.is_recording():
1676            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1677                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1678            )
1679            # We need to preserve the class to keep the correct observation type
1680            span_class = self._get_span_class(existing_observation_type)
1681            span = span_class(
1682                otel_span=current_otel_span,
1683                langfuse_client=self,
1684                environment=self._environment,
1685            )
1686
1687            span.update_trace(
1688                name=name,
1689                user_id=user_id,
1690                session_id=session_id,
1691                version=version,
1692                input=input,
1693                output=output,
1694                metadata=metadata,
1695                tags=tags,
1696                public=public,
1697            )

Update the current trace with additional information.

Arguments:
  • name: Updated name for the Langfuse trace
  • user_id: ID of the user who initiated the Langfuse trace
  • session_id: Session identifier for grouping related Langfuse traces
  • version: Version identifier for the application or service
  • input: Input data for the overall Langfuse trace
  • output: Output data from the overall Langfuse trace
  • metadata: Additional metadata to associate with the Langfuse trace
  • tags: List of tags to categorize the Langfuse trace
  • public: Whether the Langfuse trace should be publicly accessible
See Also:
  • langfuse.propagate_attributes: Recommended replacement
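
A minimal sketch of updating the current trace from inside an active span (the identifiers are placeholders):

```python
with langfuse.start_as_current_span(name="handle-request") as span:
    # Attach request-level context to the whole trace, not just this span.
    langfuse.update_current_trace(
        user_id="user-123",        # placeholder
        session_id="session-456",  # placeholder
        tags=["production", "chat"],
        metadata={"request_id": "req-789"},
    )
    # ... process the request ...
```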

def create_event( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseEvent:
1699    def create_event(
1700        self,
1701        *,
1702        trace_context: Optional[TraceContext] = None,
1703        name: str,
1704        input: Optional[Any] = None,
1705        output: Optional[Any] = None,
1706        metadata: Optional[Any] = None,
1707        version: Optional[str] = None,
1708        level: Optional[SpanLevel] = None,
1709        status_message: Optional[str] = None,
1710    ) -> LangfuseEvent:
1711        """Create a new Langfuse observation of type 'EVENT'.
1712
1713        The created Langfuse Event observation will be the child of the current span in the context.
1714
1715        Args:
1716            trace_context: Optional context for connecting to an existing trace
1717            name: Name of the span (e.g., function or operation name)
1718            input: Input data for the operation (can be any JSON-serializable object)
1719            output: Output data from the operation (can be any JSON-serializable object)
1720            metadata: Additional metadata to associate with the span
1721            version: Version identifier for the code or component
1722            level: Importance level of the span (info, warning, error)
1723            status_message: Optional status message for the span
1724
1725        Returns:
1726            The Langfuse Event object
1727
1728        Example:
1729            ```python
1730            event = langfuse.create_event(name="process-event")
1731            ```
1732        """
1733        timestamp = time_ns()
1734
1735        if trace_context:
1736            trace_id = trace_context.get("trace_id", None)
1737            parent_span_id = trace_context.get("parent_span_id", None)
1738
1739            if trace_id:
1740                remote_parent_span = self._create_remote_parent_span(
1741                    trace_id=trace_id, parent_span_id=parent_span_id
1742                )
1743
1744                with otel_trace_api.use_span(
1745                    cast(otel_trace_api.Span, remote_parent_span)
1746                ):
1747                    otel_span = self._otel_tracer.start_span(
1748                        name=name, start_time=timestamp
1749                    )
1750                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1751
1752                    return cast(
1753                        LangfuseEvent,
1754                        LangfuseEvent(
1755                            otel_span=otel_span,
1756                            langfuse_client=self,
1757                            environment=self._environment,
1758                            input=input,
1759                            output=output,
1760                            metadata=metadata,
1761                            version=version,
1762                            level=level,
1763                            status_message=status_message,
1764                        ).end(end_time=timestamp),
1765                    )
1766
1767        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1768
1769        return cast(
1770            LangfuseEvent,
1771            LangfuseEvent(
1772                otel_span=otel_span,
1773                langfuse_client=self,
1774                environment=self._environment,
1775                input=input,
1776                output=output,
1777                metadata=metadata,
1778                version=version,
1779                level=level,
1780                status_message=status_message,
1781            ).end(end_time=timestamp),
1782        )

Create a new Langfuse observation of type 'EVENT'.

The created Langfuse Event observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the event (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the event
  • version: Version identifier for the code or component
  • level: Importance level of the event (DEBUG, DEFAULT, WARNING, or ERROR)
  • status_message: Optional status message for the event
Returns:

The Langfuse Event object

Example:
event = langfuse.create_event(name="process-event")
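
A slightly fuller sketch, assuming an enclosing span provides the parent context and the payloads are placeholders:

```python
with langfuse.start_as_current_span(name="checkout") as span:
    # Record a point-in-time event as a child of the current span.
    langfuse.create_event(
        name="cache-miss",
        input={"key": "user-123"},
        metadata={"cache": "redis"},
        level="WARNING",
        status_message="Falling back to database",
    )
```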
@staticmethod
def create_trace_id(*, seed: Optional[str] = None) -> str:
1871    @staticmethod
1872    def create_trace_id(*, seed: Optional[str] = None) -> str:
1873        """Create a unique trace ID for use with Langfuse.
1874
1875        This method generates a unique trace ID for use with various Langfuse APIs.
1876        It can either generate a random ID or create a deterministic ID based on
1877        a seed string.
1878
1879        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1880        This method ensures the generated ID meets this requirement. If you need to
1881        correlate an external ID with a Langfuse trace ID, use the external ID as the
1882        seed to get a valid, deterministic Langfuse trace ID.
1883
1884        Args:
1885            seed: Optional string to use as a seed for deterministic ID generation.
1886                 If provided, the same seed will always produce the same ID.
1887                 If not provided, a random ID will be generated.
1888
1889        Returns:
1890            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1891
1892        Example:
1893            ```python
1894            # Generate a random trace ID
1895            trace_id = langfuse.create_trace_id()
1896
1897            # Generate a deterministic ID based on a seed
1898            session_trace_id = langfuse.create_trace_id(seed="session-456")
1899
1900            # Correlate an external ID with a Langfuse trace ID
1901            external_id = "external-system-123456"
1902            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1903
1904            # Use the ID with trace context
1905            with langfuse.start_as_current_span(
1906                name="process-request",
1907                trace_context={"trace_id": trace_id}
1908            ) as span:
1909                # Operation will be part of the specific trace
1910                pass
1911            ```
1912        """
1913        if not seed:
1914            trace_id_int = RandomIdGenerator().generate_trace_id()
1915
1916            return Langfuse._format_otel_trace_id(trace_id_int)
1917
1918        return sha256(seed.encode("utf-8")).digest()[:16].hex()

Create a unique trace ID for use with Langfuse.

This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.

Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.

Arguments:
  • seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:

A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

Example:
# Generate a random trace ID
trace_id = langfuse.create_trace_id()

# Generate a deterministic ID based on a seed
session_trace_id = langfuse.create_trace_id(seed="session-456")

# Correlate an external ID with a Langfuse trace ID
external_id = "external-system-123456"
correlated_trace_id = langfuse.create_trace_id(seed=external_id)

# Use the ID with trace context
with langfuse.start_as_current_span(
    name="process-request",
    trace_context={"trace_id": trace_id}
) as span:
    # Operation will be part of the specific trace
    pass
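
A common follow-on, sketched here, is to re-derive the same deterministic ID later so external feedback can be attached to the trace via create_score (the external ID is a placeholder):

```python
# Re-derive the trace ID from the external identifier at feedback time.
external_id = "external-system-123456"
trace_id = langfuse.create_trace_id(seed=external_id)

langfuse.create_score(
    name="user_feedback",
    value=1.0,
    trace_id=trace_id,
    data_type="NUMERIC",
)
```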
def create_score( self, *, name: str, value: Union[float, str], session_id: Optional[str] = None, dataset_run_id: Optional[str] = None, trace_id: Optional[str] = None, observation_id: Optional[str] = None, score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
1994    def create_score(
1995        self,
1996        *,
1997        name: str,
1998        value: Union[float, str],
1999        session_id: Optional[str] = None,
2000        dataset_run_id: Optional[str] = None,
2001        trace_id: Optional[str] = None,
2002        observation_id: Optional[str] = None,
2003        score_id: Optional[str] = None,
2004        data_type: Optional[ScoreDataType] = None,
2005        comment: Optional[str] = None,
2006        config_id: Optional[str] = None,
2007        metadata: Optional[Any] = None,
2008    ) -> None:
2009        """Create a score for a specific trace or observation.
2010
2011        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
2012        used to track quality metrics, user feedback, or automated evaluations.
2013
2014        Args:
2015            name: Name of the score (e.g., "relevance", "accuracy")
2016            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2017            session_id: ID of the Langfuse session to associate the score with
2018            dataset_run_id: ID of the Langfuse dataset run to associate the score with
2019            trace_id: ID of the Langfuse trace to associate the score with
2020            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
2021            score_id: Optional custom ID for the score (auto-generated if not provided)
2022            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2023            comment: Optional comment or explanation for the score
2024            config_id: Optional ID of a score config defined in Langfuse
2025            metadata: Optional metadata to be attached to the score
2026
2027        Example:
2028            ```python
2029            # Create a numeric score for accuracy
2030            langfuse.create_score(
2031                name="accuracy",
2032                value=0.92,
2033                trace_id="abcdef1234567890abcdef1234567890",
2034                data_type="NUMERIC",
2035                comment="High accuracy with minor irrelevant details"
2036            )
2037
2038            # Create a categorical score for sentiment
2039            langfuse.create_score(
2040                name="sentiment",
2041                value="positive",
2042                trace_id="abcdef1234567890abcdef1234567890",
2043                observation_id="abcdef1234567890",
2044                data_type="CATEGORICAL"
2045            )
2046            ```
2047        """
2048        if not self._tracing_enabled:
2049            return
2050
2051        score_id = score_id or self._create_observation_id()
2052
2053        try:
2054            new_body = ScoreBody(
2055                id=score_id,
2056                sessionId=session_id,
2057                datasetRunId=dataset_run_id,
2058                traceId=trace_id,
2059                observationId=observation_id,
2060                name=name,
2061                value=value,
2062                dataType=data_type,  # type: ignore
2063                comment=comment,
2064                configId=config_id,
2065                environment=self._environment,
2066                metadata=metadata,
2067            )
2068
2069            event = {
2070                "id": self.create_trace_id(),
2071                "type": "score-create",
2072                "timestamp": _get_timestamp(),
2073                "body": new_body,
2074            }
2075
2076            if self._resources is not None:
2077                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
2078                force_sample = (
2079                    not self._is_valid_trace_id(trace_id) if trace_id else True
2080                )
2081
2082                self._resources.add_score_task(
2083                    event,
2084                    force_sample=force_sample,
2085                )
2086
2087        except Exception as e:
2088            langfuse_logger.exception(
2089                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
2090            )

Create a score for a specific trace or observation.

This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • session_id: ID of the Langfuse session to associate the score with
  • dataset_run_id: ID of the Langfuse dataset run to associate the score with
  • trace_id: ID of the Langfuse trace to associate the score with
  • observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
# Create a numeric score for accuracy
langfuse.create_score(
    name="accuracy",
    value=0.92,
    trace_id="abcdef1234567890abcdef1234567890",
    data_type="NUMERIC",
    comment="High accuracy with minor irrelevant details"
)

# Create a categorical score for sentiment
langfuse.create_score(
    name="sentiment",
    value="positive",
    trace_id="abcdef1234567890abcdef1234567890",
    observation_id="abcdef1234567890",
    data_type="CATEGORICAL"
)
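
Scores can also target a session or dataset run rather than a single trace; a minimal sketch with a placeholder session ID:

```python
# Attach a score to a whole session instead of an individual trace.
langfuse.create_score(
    name="user_satisfaction",
    value=0.9,
    session_id="session-456",
    data_type="NUMERIC",
    comment="Post-chat survey rating",
)
```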
def score_current_span( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None) -> None:
2116    def score_current_span(
2117        self,
2118        *,
2119        name: str,
2120        value: Union[float, str],
2121        score_id: Optional[str] = None,
2122        data_type: Optional[ScoreDataType] = None,
2123        comment: Optional[str] = None,
2124        config_id: Optional[str] = None,
2125    ) -> None:
2126        """Create a score for the current active span.
2127
2128        This method scores the currently active span in the context. It's a convenient
2129        way to score the current operation without needing to know its trace and span IDs.
2130
2131        Args:
2132            name: Name of the score (e.g., "relevance", "accuracy")
2133            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2134            score_id: Optional custom ID for the score (auto-generated if not provided)
2135            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2136            comment: Optional comment or explanation for the score
2137            config_id: Optional ID of a score config defined in Langfuse
2138
2139        Example:
2140            ```python
2141            with langfuse.start_as_current_generation(name="answer-query") as generation:
2142                # Generate answer
2143                response = generate_answer(...)
2144                generation.update(output=response)
2145
2146                # Score the generation
2147                langfuse.score_current_span(
2148                    name="relevance",
2149                    value=0.85,
2150                    data_type="NUMERIC",
2151                    comment="Mostly relevant but contains some tangential information"
2152                )
2153            ```
2154        """
2155        current_span = self._get_current_otel_span()
2156
2157        if current_span is not None:
2158            trace_id = self._get_otel_trace_id(current_span)
2159            observation_id = self._get_otel_span_id(current_span)
2160
2161            langfuse_logger.info(
2162                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
2163            )
2164
2165            self.create_score(
2166                trace_id=trace_id,
2167                observation_id=observation_id,
2168                name=name,
2169                value=cast(str, value),
2170                score_id=score_id,
2171                data_type=cast(Literal["CATEGORICAL"], data_type),
2172                comment=comment,
2173                config_id=config_id,
2174            )

Create a score for the current active span.

This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Generate answer
    response = generate_answer(...)
    generation.update(output=response)

    # Score the generation
    langfuse.score_current_span(
        name="relevance",
        value=0.85,
        data_type="NUMERIC",
        comment="Mostly relevant but contains some tangential information"
    )
def score_current_trace( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None) -> None:
2200    def score_current_trace(
2201        self,
2202        *,
2203        name: str,
2204        value: Union[float, str],
2205        score_id: Optional[str] = None,
2206        data_type: Optional[ScoreDataType] = None,
2207        comment: Optional[str] = None,
2208        config_id: Optional[str] = None,
2209    ) -> None:
2210        """Create a score for the current trace.
2211
2212        This method scores the trace of the currently active span. Unlike score_current_span,
2213        this method associates the score with the entire trace rather than a specific span.
2214        It's useful for scoring overall performance or quality of the entire operation.
2215
2216        Args:
2217            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2218            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2219            score_id: Optional custom ID for the score (auto-generated if not provided)
2220            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2221            comment: Optional comment or explanation for the score
2222            config_id: Optional ID of a score config defined in Langfuse
2223
2224        Example:
2225            ```python
2226            with langfuse.start_as_current_span(name="process-user-request") as span:
2227                # Process request
2228                result = process_complete_request()
2229                span.update(output=result)
2230
2231                # Score the overall trace
2232                langfuse.score_current_trace(
2233                    name="overall_quality",
2234                    value=0.95,
2235                    data_type="NUMERIC",
2236                    comment="High quality end-to-end response"
2237                )
2238            ```
2239        """
2240        current_span = self._get_current_otel_span()
2241
2242        if current_span is not None:
2243            trace_id = self._get_otel_trace_id(current_span)
2244
2245            langfuse_logger.info(
2246                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2247            )
2248
2249            self.create_score(
2250                trace_id=trace_id,
2251                name=name,
2252                value=cast(str, value),
2253                score_id=score_id,
2254                data_type=cast(Literal["CATEGORICAL"], data_type),
2255                comment=comment,
2256                config_id=config_id,
2257            )

Create a score for the current trace.

This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation.

Arguments:
  • name: Name of the score (e.g., "user_satisfaction", "overall_quality")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
Example:
with langfuse.start_as_current_span(name="process-user-request") as span:
    # Process request
    result = process_complete_request()
    span.update(output=result)

    # Score the overall trace
    langfuse.score_current_trace(
        name="overall_quality",
        value=0.95,
        data_type="NUMERIC",
        comment="High quality end-to-end response"
    )
def flush(self) -> None:
2259    def flush(self) -> None:
2260        """Force flush all pending spans and events to the Langfuse API.
2261
2262        This method manually flushes any pending spans, scores, and other events to the
2263        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2264        before proceeding, without waiting for the automatic flush interval.
2265
2266        Example:
2267            ```python
2268            # Record some spans and scores
2269            with langfuse.start_as_current_span(name="operation") as span:
2270                # Do work...
2271                pass
2272
2273            # Ensure all data is sent to Langfuse before proceeding
2274            langfuse.flush()
2275
2276            # Continue with other work
2277            ```
2278        """
2279        if self._resources is not None:
2280            self._resources.flush()

Force flush all pending spans and events to the Langfuse API.

This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.

Example:
# Record some spans and scores
with langfuse.start_as_current_span(name="operation") as span:
    # Do work...
    pass

# Ensure all data is sent to Langfuse before proceeding
langfuse.flush()

# Continue with other work
def shutdown(self) -> None:
2282    def shutdown(self) -> None:
2283        """Shut down the Langfuse client and flush all pending data.
2284
2285        This method cleanly shuts down the Langfuse client, ensuring all pending data
2286        is flushed to the API and all background threads are properly terminated.
2287
2288        It's important to call this method when your application is shutting down to
2289        prevent data loss and resource leaks. For most applications, using the client
2290        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2291
2292        Example:
2293            ```python
2294            # Initialize Langfuse
2295            langfuse = Langfuse(public_key="...", secret_key="...")
2296
2297            # Use Langfuse throughout your application
2298            # ...
2299
2300            # When application is shutting down
2301            langfuse.shutdown()
2302            ```
2303        """
2304        if self._resources is not None:
2305            self._resources.shutdown()

Shut down the Langfuse client and flush all pending data.

This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.

It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.

Example:
# Initialize Langfuse
langfuse = Langfuse(public_key="...", secret_key="...")

# Use Langfuse throughout your application
# ...

# When application is shutting down
langfuse.shutdown()
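
As the docstring notes, the client can also be used as a context manager, which takes care of shutdown on exit; a minimal sketch of that pattern:

```python
# Context-manager usage: pending data is flushed and resources released on exit.
with Langfuse(public_key="...", secret_key="...") as langfuse:
    with langfuse.start_as_current_span(name="batch-job") as span:
        span.update(output="done")
# No explicit langfuse.shutdown() call is needed here.
```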
def get_current_trace_id(self) -> Optional[str]:
2307    def get_current_trace_id(self) -> Optional[str]:
2308        """Get the trace ID of the current active span.
2309
2310        This method retrieves the trace ID from the currently active span in the context.
2311        It can be used to get the trace ID for referencing in logs, external systems,
2312        or for creating related operations.
2313
2314        Returns:
2315            The current trace ID as a 32-character lowercase hexadecimal string,
2316            or None if there is no active span.
2317
2318        Example:
2319            ```python
2320            with langfuse.start_as_current_span(name="process-request") as span:
2321                # Get the current trace ID for reference
2322                trace_id = langfuse.get_current_trace_id()
2323
2324                # Use it for external correlation
2325                log.info(f"Processing request with trace_id: {trace_id}")
2326
2327                # Or pass to another system
2328                external_system.process(data, trace_id=trace_id)
2329            ```
2330        """
2331        if not self._tracing_enabled:
2332            langfuse_logger.debug(
2333                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2334            )
2335            return None
2336
2337        current_otel_span = self._get_current_otel_span()
2338
2339        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None

Get the trace ID of the current active span.

This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.

Returns:

The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_span(name="process-request") as span:
    # Get the current trace ID for reference
    trace_id = langfuse.get_current_trace_id()

    # Use it for external correlation
    log.info(f"Processing request with trace_id: {trace_id}")

    # Or pass to another system
    external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
2341    def get_current_observation_id(self) -> Optional[str]:
2342        """Get the observation ID (span ID) of the current active span.
2343
2344        This method retrieves the observation ID from the currently active span in the context.
2345        It can be used to get the observation ID for referencing in logs, external systems,
2346        or for creating scores or other related operations.
2347
2348        Returns:
2349            The current observation ID as a 16-character lowercase hexadecimal string,
2350            or None if there is no active span.
2351
2352        Example:
2353            ```python
2354            with langfuse.start_as_current_span(name="process-user-query") as span:
2355                # Get the current observation ID
2356                observation_id = langfuse.get_current_observation_id()
2357
2358                # Store it for later reference
2359                cache.set(f"query_{query_id}_observation", observation_id)
2360
2361                # Process the query...
2362            ```
2363        """
2364        if not self._tracing_enabled:
2365            langfuse_logger.debug(
2366                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2367            )
2368            return None
2369
2370        current_otel_span = self._get_current_otel_span()
2371
2372        return self._get_otel_span_id(current_otel_span) if current_otel_span else None

Get the observation ID (span ID) of the current active span.

This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.

Returns:

The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_span(name="process-user-query") as span:
    # Get the current observation ID
    observation_id = langfuse.get_current_observation_id()

    # Store it for later reference
    cache.set(f"query_{query_id}_observation", observation_id)

    # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2385    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2386        """Get the URL to view a trace in the Langfuse UI.
2387
2388        This method generates a URL that links directly to a trace in the Langfuse UI.
2389        It's useful for providing links in logs, notifications, or debugging tools.
2390
2391        Args:
2392            trace_id: Optional trace ID to generate a URL for. If not provided,
2393                     the trace ID of the current active span will be used.
2394
2395        Returns:
2396            A URL string pointing to the trace in the Langfuse UI,
2397            or None if the project ID couldn't be retrieved or no trace ID is available.
2398
2399        Example:
2400            ```python
2401            # Get URL for the current trace
2402            with langfuse.start_as_current_span(name="process-request") as span:
2403                trace_url = langfuse.get_trace_url()
2404                log.info(f"Processing trace: {trace_url}")
2405
2406            # Get URL for a specific trace
2407            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2408            send_notification(f"Review needed for trace: {specific_trace_url}")
2409            ```
2410        """
2411        project_id = self._get_project_id()
2412        final_trace_id = trace_id or self.get_current_trace_id()
2413
2414        return (
2415            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2416            if project_id and final_trace_id
2417            else None
2418        )

Get the URL to view a trace in the Langfuse UI.

This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.

Arguments:
  • trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:

A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.

Example:
# Get URL for the current trace
with langfuse.start_as_current_span(name="process-request") as span:
    trace_url = langfuse.get_trace_url()
    log.info(f"Processing trace: {trace_url}")

# Get URL for a specific trace
specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
send_notification(f"Review needed for trace: {specific_trace_url}")
def get_dataset( self, name: str, *, fetch_items_page_size: Optional[int] = 50) -> langfuse._client.datasets.DatasetClient:
2420    def get_dataset(
2421        self, name: str, *, fetch_items_page_size: Optional[int] = 50
2422    ) -> "DatasetClient":
2423        """Fetch a dataset by its name.
2424
2425        Args:
2426            name (str): The name of the dataset to fetch.
2427            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2428
2429        Returns:
2430            DatasetClient: The dataset with the given name.
2431        """
2432        try:
2433            langfuse_logger.debug(f"Getting datasets {name}")
2434            dataset = self.api.datasets.get(dataset_name=name)
2435
2436            dataset_items = []
2437            page = 1
2438
2439            while True:
2440                new_items = self.api.dataset_items.list(
2441                    dataset_name=self._url_encode(name, is_url_param=True),
2442                    page=page,
2443                    limit=fetch_items_page_size,
2444                )
2445                dataset_items.extend(new_items.data)
2446
2447                if new_items.meta.total_pages <= page:
2448                    break
2449
2450                page += 1
2451
2452            items = [DatasetItemClient(i, langfuse=self) for i in dataset_items]
2453
2454            return DatasetClient(dataset, items=items)
2455
2456        except Error as e:
2457            handle_fern_exception(e)
2458            raise e

Fetch a dataset by its name.

Arguments:
  • name (str): The name of the dataset to fetch.
  • fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
Returns:

DatasetClient: The dataset with the given name.
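
A minimal sketch of fetching a dataset and inspecting its items (the dataset name is a placeholder; the item attributes follow the DatasetItem fields referenced in run_experiment below):

```python
dataset = langfuse.get_dataset("my-eval-dataset")

# All items are fetched eagerly, in pages of fetch_items_page_size.
for item in dataset.items:
    print(item.input, item.expected_output)
```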

def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse._client.datasets.DatasetItemClient]], task: langfuse.experiment.TaskFunction, evaluators: List[langfuse.experiment.EvaluatorFunction] = [], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, run_evaluators: List[langfuse.experiment.RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None) -> langfuse.experiment.ExperimentResult:
2460    def run_experiment(
2461        self,
2462        *,
2463        name: str,
2464        run_name: Optional[str] = None,
2465        description: Optional[str] = None,
2466        data: ExperimentData,
2467        task: TaskFunction,
2468        evaluators: List[EvaluatorFunction] = [],
2469        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2470        run_evaluators: List[RunEvaluatorFunction] = [],
2471        max_concurrency: int = 50,
2472        metadata: Optional[Dict[str, str]] = None,
2473    ) -> ExperimentResult:
2474        """Run an experiment on a dataset with automatic tracing and evaluation.
2475
2476        This method executes a task function on each item in the provided dataset,
2477        automatically traces all executions with Langfuse for observability, runs
2478        item-level and run-level evaluators on the outputs, and returns comprehensive
2479        results with evaluation metrics.
2480
2481        The experiment system provides:
2482        - Automatic tracing of all task executions
2483        - Concurrent processing with configurable limits
2484        - Comprehensive error handling that isolates failures
2485        - Integration with Langfuse datasets for experiment tracking
2486        - Flexible evaluation framework supporting both sync and async evaluators
2487
2488        Args:
2489            name: Human-readable name for the experiment. Used for identification
2490                in the Langfuse UI.
2491            run_name: Optional exact name for the experiment run. If provided, this will be
2492                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2493                If not provided, this will default to the experiment name appended with an ISO timestamp.
2494            description: Optional description explaining the experiment's purpose,
2495                methodology, or expected outcomes.
2496            data: Array of data items to process. Can be either:
2497                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2498                - List of Langfuse DatasetItem objects from dataset.items
2499            task: Function that processes each data item and returns output.
2500                Must accept 'item' as keyword argument and can return sync or async results.
2501                The task function signature should be: task(*, item, **kwargs) -> Any
2502            evaluators: List of functions to evaluate each item's output individually.
2503                Each evaluator receives input, output, expected_output, and metadata.
2504                Can return single Evaluation dict or list of Evaluation dicts.
2505            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2506                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2507                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2508                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2509            run_evaluators: List of functions to evaluate the entire experiment run.
2510                Each run evaluator receives all item_results and can compute aggregate metrics.
2511                Useful for calculating averages, distributions, or cross-item comparisons.
2512            max_concurrency: Maximum number of concurrent task executions (default: 50).
2513                Controls the number of items processed simultaneously. Adjust based on
2514                API rate limits and system resources.
2515            metadata: Optional metadata dictionary to attach to all experiment traces.
2516                This metadata will be included in every trace created during the experiment.
2517                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2518
2519        Returns:
2520            ExperimentResult containing:
2521            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2522            - item_results: List of results for each processed item with outputs and evaluations
2523            - run_evaluations: List of aggregate evaluation results for the entire run
2524            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2525            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2526
2527        Raises:
2528            ValueError: If required parameters are missing or invalid
2529            Exception: If experiment setup fails (individual item failures are handled gracefully)
2530
2531        Examples:
2532            Basic experiment with local data:
2533            ```python
2534            def summarize_text(*, item, **kwargs):
2535                return f"Summary: {item['input'][:50]}..."
2536
2537            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2538                return {
2539                    "name": "output_length",
2540                    "value": len(output),
2541                    "comment": f"Output contains {len(output)} characters"
2542                }
2543
2544            result = langfuse.run_experiment(
2545                name="Text Summarization Test",
2546                description="Evaluate summarization quality and length",
2547                data=[
2548                    {"input": "Long article text...", "expected_output": "Expected summary"},
2549                    {"input": "Another article...", "expected_output": "Another summary"}
2550                ],
2551                task=summarize_text,
2552                evaluators=[length_evaluator]
2553            )
2554
2555            print(f"Processed {len(result.item_results)} items")
2556            for item_result in result.item_results:
2557                print(f"Input: {item_result.item['input']}")
2558                print(f"Output: {item_result.output}")
2559                print(f"Evaluations: {item_result.evaluations}")
2560            ```
2561
2562            Advanced experiment with async task and multiple evaluators:
2563            ```python
2564            async def llm_task(*, item, **kwargs):
2565                # Simulate async LLM call
2566                response = await openai_client.chat.completions.create(
2567                    model="gpt-4",
2568                    messages=[{"role": "user", "content": item["input"]}]
2569                )
2570                return response.choices[0].message.content
2571
2572            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2573                if expected_output and expected_output.lower() in output.lower():
2574                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2575                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2576
2577            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2578                # Simulate toxicity check
2579                toxicity_score = check_toxicity(output)  # Your toxicity checker
2580                return {
2581                    "name": "toxicity",
2582                    "value": toxicity_score,
2583                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2584                }
2585
2586            def average_accuracy(*, item_results, **kwargs):
2587                accuracies = [
2588                    eval.value for result in item_results
2589                    for eval in result.evaluations
2590                    if eval.name == "accuracy"
2591                ]
2592                return {
2593                    "name": "average_accuracy",
2594                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2595                    "comment": f"Average accuracy across {len(accuracies)} items"
2596                }
2597
2598            result = langfuse.run_experiment(
2599                name="LLM Safety and Accuracy Test",
2600                description="Evaluate model accuracy and safety across diverse prompts",
2601                data=test_dataset,  # Your dataset items
2602                task=llm_task,
2603                evaluators=[accuracy_evaluator, toxicity_evaluator],
2604                run_evaluators=[average_accuracy],
2605                max_concurrency=5,  # Limit concurrent API calls
2606                metadata={"model": "gpt-4", "temperature": 0.7}
2607            )
2608            ```
2609
2610            Using with Langfuse datasets:
2611            ```python
2612            # Get dataset from Langfuse
2613            dataset = langfuse.get_dataset("my-eval-dataset")
2614
2615            result = dataset.run_experiment(
2616                name="Production Model Evaluation",
2617                description="Monthly evaluation of production model performance",
2618                task=my_production_task,
2619                evaluators=[accuracy_evaluator, latency_evaluator]
2620            )
2621
2622            # Results automatically linked to dataset in Langfuse UI
2623            print(f"View results: {result['dataset_run_url']}")
2624            ```
2625
2626        Note:
2627            - Task and evaluator functions can be either synchronous or asynchronous
2628            - Individual item failures are logged but don't stop the experiment
2629            - All executions are automatically traced and visible in Langfuse UI
2630            - When using Langfuse datasets, results are automatically linked for easy comparison
2631            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2632            - Async execution is handled automatically with smart event loop detection
2633        """
2634        return cast(
2635            ExperimentResult,
2636            run_async_safely(
2637                self._run_experiment_async(
2638                    name=name,
2639                    run_name=self._create_experiment_run_name(
2640                        name=name, run_name=run_name
2641                    ),
2642                    description=description,
2643                    data=data,
2644                    task=task,
2645                    evaluators=evaluators or [],
2646                    composite_evaluator=composite_evaluator,
2647                    run_evaluators=run_evaluators or [],
2648                    max_concurrency=max_concurrency,
2649                    metadata=metadata,
2650                ),
2651            ),
2652        )

Run an experiment on a dataset with automatic tracing and evaluation.

This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.

The experiment system provides:

  • Automatic tracing of all task executions
  • Concurrent processing with configurable limits
  • Comprehensive error handling that isolates failures
  • Integration with Langfuse datasets for experiment tracking
  • Flexible evaluation framework supporting both sync and async evaluators
Arguments:
  • name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
  • run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the data contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
  • description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
  • data: Array of data items to process. Can be either:
    • List of dict-like items with 'input', 'expected_output', 'metadata' keys
    • List of Langfuse DatasetItem objects from dataset.items
  • task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
  • evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
  • composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics; a sketch appears after the advanced example below.
  • run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
  • max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
  • metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If data are Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:

ExperimentResult containing:

  • run_name: The experiment run name. This is equal to the dataset run name if the experiment was run on a Langfuse dataset.
  • item_results: List of results for each processed item with outputs and evaluations
  • run_evaluations: List of aggregate evaluation results for the entire run
  • dataset_run_id: ID of the dataset run (if using Langfuse datasets)
  • dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
  • ValueError: If required parameters are missing or invalid
  • Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:

Basic experiment with local data:

def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")

Advanced experiment with async task and multiple evaluators:

async def llm_task(*, item, **kwargs):
    # Simulate async LLM call
    response = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": item["input"]}]
    )
    return response.choices[0].message.content

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output and expected_output.lower() in output.lower():
        return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
    return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
    # Simulate toxicity check
    toxicity_score = check_toxicity(output)  # Your toxicity checker
    return {
        "name": "toxicity",
        "value": toxicity_score,
        "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
    }

def average_accuracy(*, item_results, **kwargs):
    accuracies = [
        eval.value for result in item_results
        for eval in result.evaluations
        if eval.name == "accuracy"
    ]
    return {
        "name": "average_accuracy",
        "value": sum(accuracies) / len(accuracies) if accuracies else 0,
        "comment": f"Average accuracy across {len(accuracies)} items"
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    description="Evaluate model accuracy and safety across diverse prompts",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    run_evaluators=[average_accuracy],
    max_concurrency=5,  # Limit concurrent API calls
    metadata={"model": "gpt-4", "temperature": 0.7}
)

Using with Langfuse datasets:

# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result['dataset_run_url']}")
Note:
  • Task and evaluator functions can be either synchronous or asynchronous
  • Individual item failures are logged but don't stop the experiment
  • All executions are automatically traced and visible in Langfuse UI
  • When using Langfuse datasets, results are automatically linked for easy comparison
  • This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
  • Async execution is handled automatically with smart event loop detection
def run_batched_evaluation( self, *, scope: Literal['traces', 'observations'], mapper: MapperFunction, filter: Optional[str] = None, fetch_batch_size: int = 50, max_items: Optional[int] = None, max_retries: int = 3, evaluators: List[langfuse.experiment.EvaluatorFunction], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, max_concurrency: int = 50, metadata: Optional[Dict[str, Any]] = None, resume_from: Optional[BatchEvaluationResumeToken] = None, verbose: bool = False) -> BatchEvaluationResult:
2990    def run_batched_evaluation(
2991        self,
2992        *,
2993        scope: Literal["traces", "observations"],
2994        mapper: MapperFunction,
2995        filter: Optional[str] = None,
2996        fetch_batch_size: int = 50,
2997        max_items: Optional[int] = None,
2998        max_retries: int = 3,
2999        evaluators: List[EvaluatorFunction],
3000        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
3001        max_concurrency: int = 50,
3002        metadata: Optional[Dict[str, Any]] = None,
3003        resume_from: Optional[BatchEvaluationResumeToken] = None,
3004        verbose: bool = False,
3005    ) -> BatchEvaluationResult:
3006        """Fetch traces or observations and run evaluations on each item.
3007
3008        This method provides a powerful way to evaluate existing data in Langfuse at scale.
3009        It fetches items based on filters, transforms them using a mapper function, runs
3010        evaluators on each item, and creates scores that are linked back to the original
3011        entities. This is ideal for:
3012
3013        - Running evaluations on production traces after deployment
3014        - Backtesting new evaluation metrics on historical data
3015        - Batch scoring of observations for quality monitoring
3016        - Periodic evaluation runs on recent data
3017
3018        The method uses a streaming/pipeline approach to process items in batches, making
3019        it memory-efficient for large datasets. It includes comprehensive error handling,
3020        retry logic, and resume capability for long-running evaluations.
3021
3022        Args:
3023            scope: The type of items to evaluate. Must be one of:
3024                - "traces": Evaluate complete traces with all their observations
3025                - "observations": Evaluate individual observations (spans, generations, events)
3026            mapper: Function that transforms API response objects into evaluator inputs.
3027                Receives a trace/observation object and returns an EvaluatorInputs
3028                instance with input, output, expected_output, and metadata fields.
3029                Can be sync or async.
3030            evaluators: List of evaluation functions to run on each item. Each evaluator
3031                receives the mapped inputs and returns Evaluation object(s). Evaluator
3032                failures are logged but don't stop the batch evaluation.
3033            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
3034                - '{"tags": ["production"]}'
3035                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
3036                Default: None (fetches all items).
3037            fetch_batch_size: Number of items to fetch per API call and hold in memory.
3038                Larger values may be faster but use more memory. Default: 50.
3039            max_items: Maximum total number of items to process. If None, processes all
3040                items matching the filter. Useful for testing or limiting evaluation runs.
3041                Default: None (process all).
3042            max_concurrency: Maximum number of items to evaluate concurrently. Controls
3043                parallelism and resource usage. Default: 50.
3044            composite_evaluator: Optional function that creates a composite score from
3045                item-level evaluations. Receives the original item and its evaluations,
3046                returns a single Evaluation. Useful for weighted averages or combined metrics.
3047                Default: None.
3048            metadata: Optional metadata dict to add to all created scores. Useful for
3049                tracking evaluation runs, versions, or other context. Default: None.
3050            max_retries: Maximum number of retry attempts for failed batch fetches.
3051                Uses exponential backoff (1s, 2s, 4s). Default: 3.
3052            verbose: If True, logs progress information to console. Useful for monitoring
3053                long-running evaluations. Default: False.
3054            resume_from: Optional resume token from a previous incomplete run. Allows
3055                continuing evaluation after interruption or failure. Default: None.
3056
3057
3058        Returns:
3059            BatchEvaluationResult containing:
3060                - total_items_fetched: Number of items fetched from API
3061                - total_items_processed: Number of items successfully evaluated
3062                - total_items_failed: Number of items that failed evaluation
3063                - total_scores_created: Scores created by item-level evaluators
3064                - total_composite_scores_created: Scores created by composite evaluator
3065                - total_evaluations_failed: Individual evaluator failures
3066                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3067                - resume_token: Token for resuming if incomplete (None if completed)
3068                - completed: True if all items processed
3069                - duration_seconds: Total execution time
3070                - failed_item_ids: IDs of items that failed
3071                - error_summary: Error types and counts
3072                - has_more_items: True if max_items reached but more exist
3073
3074        Raises:
3075            ValueError: If invalid scope is provided.
3076
3077        Examples:
3078            Basic trace evaluation:
3079            ```python
3080            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3081
3082            client = Langfuse()
3083
3084            # Define mapper to extract fields from traces
3085            def trace_mapper(trace):
3086                return EvaluatorInputs(
3087                    input=trace.input,
3088                    output=trace.output,
3089                    expected_output=None,
3090                    metadata={"trace_id": trace.id}
3091                )
3092
3093            # Define evaluator
3094            def length_evaluator(*, input, output, expected_output, metadata):
3095                return Evaluation(
3096                    name="output_length",
3097                    value=len(output) if output else 0
3098                )
3099
3100            # Run batch evaluation
3101            result = client.run_batched_evaluation(
3102                scope="traces",
3103                mapper=trace_mapper,
3104                evaluators=[length_evaluator],
3105                filter='{"tags": ["production"]}',
3106                max_items=1000,
3107                verbose=True
3108            )
3109
3110            print(f"Processed {result.total_items_processed} traces")
3111            print(f"Created {result.total_scores_created} scores")
3112            ```
3113
3114            Evaluation with composite scorer:
3115            ```python
3116            def accuracy_evaluator(*, input, output, expected_output, metadata):
3117                # ... evaluation logic
3118                return Evaluation(name="accuracy", value=0.85)
3119
3120            def relevance_evaluator(*, input, output, expected_output, metadata):
3121                # ... evaluation logic
3122                return Evaluation(name="relevance", value=0.92)
3123
3124            def composite_evaluator(*, item, evaluations):
3125                # Weighted average of evaluations
3126                weights = {"accuracy": 0.6, "relevance": 0.4}
3127                total = sum(
3128                    e.value * weights.get(e.name, 0)
3129                    for e in evaluations
3130                    if isinstance(e.value, (int, float))
3131                )
3132                return Evaluation(
3133                    name="composite_score",
3134                    value=total,
3135                    comment=f"Weighted average of {len(evaluations)} metrics"
3136                )
3137
3138            result = client.run_batched_evaluation(
3139                scope="traces",
3140                mapper=trace_mapper,
3141                evaluators=[accuracy_evaluator, relevance_evaluator],
3142                composite_evaluator=composite_evaluator,
3143                filter='{"user_id": "important_user"}',
3144                verbose=True
3145            )
3146            ```
3147
3148            Handling incomplete runs with resume:
3149            ```python
3150            # Initial run that may fail or timeout
3151            result = client.run_batched_evaluation(
3152                scope="observations",
3153                mapper=obs_mapper,
3154                evaluators=[my_evaluator],
3155                max_items=10000,
3156                verbose=True
3157            )
3158
3159            # Check if incomplete
3160            if not result.completed and result.resume_token:
3161                print(f"Processed {result.resume_token.items_processed} items before interruption")
3162
3163                # Resume from where it left off
3164                result = client.run_batched_evaluation(
3165                    scope="observations",
3166                    mapper=obs_mapper,
3167                    evaluators=[my_evaluator],
3168                    resume_from=result.resume_token,
3169                    verbose=True
3170                )
3171
3172            print(f"Total items processed: {result.total_items_processed}")
3173            ```
3174
3175            Monitoring evaluator performance:
3176            ```python
3177            result = client.run_batched_evaluation(...)
3178
3179            for stats in result.evaluator_stats:
3180                success_rate = stats.successful_runs / stats.total_runs
3181                print(f"{stats.name}:")
3182                print(f"  Success rate: {success_rate:.1%}")
3183                print(f"  Scores created: {stats.total_scores_created}")
3184
3185                if stats.failed_runs > 0:
3186                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3187            ```
3188
3189        Note:
3190            - Evaluator failures are logged but don't stop the batch evaluation
3191            - Individual item failures are tracked but don't stop processing
3192            - Fetch failures are retried with exponential backoff
3193            - All scores are automatically flushed to Langfuse at the end
3194            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3195        """
3196        runner = BatchEvaluationRunner(self)
3197
3198        return cast(
3199            BatchEvaluationResult,
3200            run_async_safely(
3201                runner.run_async(
3202                    scope=scope,
3203                    mapper=mapper,
3204                    evaluators=evaluators,
3205                    filter=filter,
3206                    fetch_batch_size=fetch_batch_size,
3207                    max_items=max_items,
3208                    max_concurrency=max_concurrency,
3209                    composite_evaluator=composite_evaluator,
3210                    metadata=metadata,
3211                    max_retries=max_retries,
3212                    verbose=verbose,
3213                    resume_from=resume_from,
3214                )
3215            ),
3216        )

Fetch traces or observations and run evaluations on each item.

This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:

  • Running evaluations on production traces after deployment
  • Backtesting new evaluation metrics on historical data
  • Batch scoring of observations for quality monitoring
  • Periodic evaluation runs on recent data

The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.

Arguments:
  • scope: The type of items to evaluate. Must be one of:
    • "traces": Evaluate complete traces with all their observations
    • "observations": Evaluate individual observations (spans, generations, events)
  • mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
  • evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
  • filter: Optional JSON filter string for querying items (same format as Langfuse API). Default: None (fetches all items). Examples:
    • '{"tags": ["production"]}'
    • '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
  • fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
  • max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
  • max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 50.
  • composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
  • metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
  • max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
  • verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
  • resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:

BatchEvaluationResult containing:

  • total_items_fetched: Number of items fetched from API
  • total_items_processed: Number of items successfully evaluated
  • total_items_failed: Number of items that failed evaluation
  • total_scores_created: Scores created by item-level evaluators
  • total_composite_scores_created: Scores created by composite evaluator
  • total_evaluations_failed: Individual evaluator failures
  • evaluator_stats: Per-evaluator statistics (success rate, scores created)
  • resume_token: Token for resuming if incomplete (None if completed)
  • completed: True if all items processed
  • duration_seconds: Total execution time
  • failed_item_ids: IDs of items that failed
  • error_summary: Error types and counts
  • has_more_items: True if max_items reached but more exist

Raises:
  • ValueError: If invalid scope is provided.
Examples:

Basic trace evaluation:

from langfuse import Langfuse, EvaluatorInputs, Evaluation

client = Langfuse()

# Define mapper to extract fields from traces
def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,
        metadata={"trace_id": trace.id}
    )

# Define evaluator
def length_evaluator(*, input, output, expected_output, metadata):
    return Evaluation(
        name="output_length",
        value=len(output) if output else 0
    )

# Run batch evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    filter='{"tags": ["production"]}',
    max_items=1000,
    verbose=True
)

print(f"Processed {result.total_items_processed} traces")
print(f"Created {result.total_scores_created} scores")

Evaluation with composite scorer:

def accuracy_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="accuracy", value=0.85)

def relevance_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="relevance", value=0.92)

def composite_evaluator(*, item, evaluations):
    # Weighted average of evaluations
    weights = {"accuracy": 0.6, "relevance": 0.4}
    total = sum(
        e.value * weights.get(e.name, 0)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(
        name="composite_score",
        value=total,
        comment=f"Weighted average of {len(evaluations)} metrics"
    )

result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[accuracy_evaluator, relevance_evaluator],
    composite_evaluator=composite_evaluator,
    filter='{"user_id": "important_user"}',
    verbose=True
)

Handling incomplete runs with resume:

# Initial run that may fail or timeout
result = client.run_batched_evaluation(
    scope="observations",
    mapper=obs_mapper,
    evaluators=[my_evaluator],
    max_items=10000,
    verbose=True
)

# Check if incomplete
if not result.completed and result.resume_token:
    print(f"Processed {result.resume_token.items_processed} items before interruption")

    # Resume from where it left off
    result = client.run_batched_evaluation(
        scope="observations",
        mapper=obs_mapper,
        evaluators=[my_evaluator],
        resume_from=result.resume_token,
        verbose=True
    )

print(f"Total items processed: {result.total_items_processed}")

Monitoring evaluator performance:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs
    print(f"{stats.name}:")
    print(f"  Success rate: {success_rate:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️  Failed {stats.failed_runs} times")
Note:
  • Evaluator failures are logged but don't stop the batch evaluation
  • Individual item failures are tracked but don't stop processing
  • Fetch failures are retried with exponential backoff
  • All scores are automatically flushed to Langfuse at the end
  • The resume mechanism uses timestamp-based filtering to avoid duplicates
def auth_check(self) -> bool:
3218    def auth_check(self) -> bool:
3219        """Check if the provided credentials (public and secret key) are valid.
3220
3221        Raises:
3222            Exception: If no projects were found for the provided credentials.
3223
3224        Note:
3225            This method is blocking. It is discouraged to use it in production code.
3226        """
3227        try:
3228            projects = self.api.projects.get()
3229            langfuse_logger.debug(
3230                f"Auth check successful, found {len(projects.data)} projects"
3231            )
3232            if len(projects.data) == 0:
3233                raise Exception(
3234                    "Auth check failed, no project found for the keys provided."
3235                )
3236            return True
3237
3238        except AttributeError as e:
3239            langfuse_logger.warning(
3240                f"Auth check failed: Client not properly initialized. Error: {e}"
3241            )
3242            return False
3243
3244        except Error as e:
3245            handle_fern_exception(e)
3246            raise e

Check if the provided credentials (public and secret key) are valid.

Raises:
  • Exception: If no projects were found for the provided credentials.
Note:

This method is blocking. It is discouraged to use it in production code.
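
Example (a minimal sketch; assumes credentials are configured via environment variables):

from langfuse import Langfuse

langfuse = Langfuse()

# Verify credentials once at startup; this call blocks, so keep it off hot paths.
try:
    if langfuse.auth_check():
        print("Langfuse credentials are valid")
except Exception as e:
    print(f"Langfuse auth check failed: {e}")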

def create_dataset( self, *, name: str, description: Optional[str] = None, metadata: Optional[Any] = None) -> langfuse.api.Dataset:
3248    def create_dataset(
3249        self,
3250        *,
3251        name: str,
3252        description: Optional[str] = None,
3253        metadata: Optional[Any] = None,
3254    ) -> Dataset:
3255        """Create a dataset with the given name on Langfuse.
3256
3257        Args:
3258            name: Name of the dataset to create.
3259            description: Description of the dataset. Defaults to None.
3260            metadata: Additional metadata. Defaults to None.
3261
3262        Returns:
3263            Dataset: The created dataset as returned by the Langfuse API.
3264        """
3265        try:
3266            body = CreateDatasetRequest(
3267                name=name, description=description, metadata=metadata
3268            )
3269            langfuse_logger.debug(f"Creating datasets {body}")
3270
3271            return self.api.datasets.create(request=body)
3272
3273        except Error as e:
3274            handle_fern_exception(e)
3275            raise e

Create a dataset with the given name on Langfuse.

Arguments:
  • name: Name of the dataset to create.
  • description: Description of the dataset. Defaults to None.
  • metadata: Additional metadata. Defaults to None.
Returns:

Dataset: The created dataset as returned by the Langfuse API.
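
Example (an illustrative sketch; the dataset name, description, and metadata are placeholders):

from langfuse import Langfuse

langfuse = Langfuse()

# Create a dataset to hold evaluation items.
dataset = langfuse.create_dataset(
    name="capital_cities",
    description="Country to capital mapping for evals",
    metadata={"owner": "evals-team"},
)

print(dataset.name)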

def create_dataset_item( self, *, dataset_name: str, input: Optional[Any] = None, expected_output: Optional[Any] = None, metadata: Optional[Any] = None, source_trace_id: Optional[str] = None, source_observation_id: Optional[str] = None, status: Optional[langfuse.api.DatasetStatus] = None, id: Optional[str] = None) -> langfuse.api.DatasetItem:
3277    def create_dataset_item(
3278        self,
3279        *,
3280        dataset_name: str,
3281        input: Optional[Any] = None,
3282        expected_output: Optional[Any] = None,
3283        metadata: Optional[Any] = None,
3284        source_trace_id: Optional[str] = None,
3285        source_observation_id: Optional[str] = None,
3286        status: Optional[DatasetStatus] = None,
3287        id: Optional[str] = None,
3288    ) -> DatasetItem:
3289        """Create a dataset item.
3290
3291        Upserts if an item with id already exists.
3292
3293        Args:
3294            dataset_name: Name of the dataset in which the dataset item should be created.
3295            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3296            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3297            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3298            source_trace_id: Id of the source trace. Defaults to None.
3299            source_observation_id: Id of the source observation. Defaults to None.
3300            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3301            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3302
3303        Returns:
3304            DatasetItem: The created dataset item as returned by the Langfuse API.
3305
3306        Example:
3307            ```python
3308            from langfuse import Langfuse
3309
3310            langfuse = Langfuse()
3311
3312            # Uploading items to the Langfuse dataset named "capital_cities"
3313            langfuse.create_dataset_item(
3314                dataset_name="capital_cities",
3315                input={"input": {"country": "Italy"}},
3316                expected_output={"expected_output": "Rome"},
3317                metadata={"foo": "bar"}
3318            )
3319            ```
3320        """
3321        try:
3322            body = CreateDatasetItemRequest(
3323                datasetName=dataset_name,
3324                input=input,
3325                expectedOutput=expected_output,
3326                metadata=metadata,
3327                sourceTraceId=source_trace_id,
3328                sourceObservationId=source_observation_id,
3329                status=status,
3330                id=id,
3331            )
3332            langfuse_logger.debug(f"Creating dataset item {body}")
3333            return self.api.dataset_items.create(request=body)
3334        except Error as e:
3335            handle_fern_exception(e)
3336            raise e

Create a dataset item.

Upserts if an item with id already exists.

Arguments:
  • dataset_name: Name of the dataset in which the dataset item should be created.
  • input: Input data. Defaults to None. Can contain any dict, list or scalar.
  • expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
  • metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
  • source_trace_id: Id of the source trace. Defaults to None.
  • source_observation_id: Id of the source observation. Defaults to None.
  • status: Status of the dataset item. Defaults to ACTIVE for newly created items.
  • id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:

DatasetItem: The created dataset item as returned by the Langfuse API.

Example:
from langfuse import Langfuse

langfuse = Langfuse()

# Uploading items to the Langfuse dataset named "capital_cities"
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"input": {"country": "Italy"}},
    expected_output={"expected_output": "Rome"},
    metadata={"foo": "bar"}
)
def resolve_media_references( self, *, obj: Any, resolve_with: Literal['base64_data_uri'], max_depth: int = 10, content_fetch_timeout_seconds: int = 5) -> Any:
3338    def resolve_media_references(
3339        self,
3340        *,
3341        obj: Any,
3342        resolve_with: Literal["base64_data_uri"],
3343        max_depth: int = 10,
3344        content_fetch_timeout_seconds: int = 5,
3345    ) -> Any:
3346        """Replace media reference strings in an object with base64 data URIs.
3347
3348        This method recursively traverses an object (up to max_depth) looking for media reference strings
3349        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3350        the provided Langfuse client and replaces the reference string with a base64 data URI.
3351
3352        If fetching media content fails for a reference string, a warning is logged and the reference
3353        string is left unchanged.
3354
3355        Args:
3356            obj: The object to process. Can be a primitive value, array, or nested object.
3357                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3358            resolve_with: The representation of the media content to replace the media reference string with.
3359                Currently only "base64_data_uri" is supported.
3360            max_depth: int: The maximum depth to traverse the object. Default is 10.
3361            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3362
3363        Returns:
3364            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3365            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3366
3367        Example:
3368            obj = {
3369                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3370                "nested": {
3371                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3372                }
3373            }
3374
3375            result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
3376
3377            # Result:
3378            # {
3379            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3380            #     "nested": {
3381            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3382            #     }
3383            # }
3384        """
3385        return LangfuseMedia.resolve_media_references(
3386            langfuse_client=self,
3387            obj=obj,
3388            resolve_with=resolve_with,
3389            max_depth=max_depth,
3390            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3391        )

Replace media reference strings in an object with base64 data URIs.

This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.

If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.

Arguments:
  • obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
  • resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
  • max_depth: int: The maximum depth to traverse the object. Default is 10.
  • content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:

A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

Example:

obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }

result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)

Result:

{

"image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",

"nested": {

"pdf": "data:application/pdf;base64,JVBERi0xLjcK..."

}

}
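
When calling through the client instance documented above, the arguments are keyword-only and the call is synchronous; a minimal sketch (the media reference string is a placeholder):

from langfuse import Langfuse

langfuse = Langfuse()

trace_output = {
    "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
}

# Returns a deep copy with media references replaced by base64 data URIs.
resolved = langfuse.resolve_media_references(
    obj=trace_output,
    resolve_with="base64_data_uri",
)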

def get_prompt( self, name: str, *, version: Optional[int] = None, label: Optional[str] = None, type: Literal['chat', 'text'] = 'text', cache_ttl_seconds: Optional[int] = None, fallback: Union[List[langfuse.model.ChatMessageDict], NoneType, str] = None, max_retries: Optional[int] = None, fetch_timeout_seconds: Optional[int] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3421    def get_prompt(
3422        self,
3423        name: str,
3424        *,
3425        version: Optional[int] = None,
3426        label: Optional[str] = None,
3427        type: Literal["chat", "text"] = "text",
3428        cache_ttl_seconds: Optional[int] = None,
3429        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3430        max_retries: Optional[int] = None,
3431        fetch_timeout_seconds: Optional[int] = None,
3432    ) -> PromptClient:
3433        """Get a prompt.
3434
3435        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3436        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3437        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3438        return the expired prompt as a fallback.
3439
3440        Args:
3441            name (str): The name of the prompt to retrieve.
3442
3443        Keyword Args:
3444            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3445            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3446            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3447            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3448            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3449            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3450            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3451            fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
3452
3453        Returns:
3454            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
3455            - TextPromptClient, if type argument is 'text'.
3456            - ChatPromptClient, if type argument is 'chat'.
3457
3458        Raises:
3459            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3460            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3461        """
3462        if self._resources is None:
3463            raise Error(
3464                "SDK is not correctly initialized. Check the init logs for more details."
3465            )
3466        if version is not None and label is not None:
3467            raise ValueError("Cannot specify both version and label at the same time.")
3468
3469        if not name:
3470            raise ValueError("Prompt name cannot be empty.")
3471
3472        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3473        bounded_max_retries = self._get_bounded_max_retries(
3474            max_retries, default_max_retries=2, max_retries_upper_bound=4
3475        )
3476
3477        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3478        cached_prompt = self._resources.prompt_cache.get(cache_key)
3479
3480        if cached_prompt is None or cache_ttl_seconds == 0:
3481            langfuse_logger.debug(
3482                f"Prompt '{cache_key}' not found in cache or caching disabled."
3483            )
3484            try:
3485                return self._fetch_prompt_and_update_cache(
3486                    name,
3487                    version=version,
3488                    label=label,
3489                    ttl_seconds=cache_ttl_seconds,
3490                    max_retries=bounded_max_retries,
3491                    fetch_timeout_seconds=fetch_timeout_seconds,
3492                )
3493            except Exception as e:
3494                if fallback:
3495                    langfuse_logger.warning(
3496                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3497                    )
3498
3499                    fallback_client_args: Dict[str, Any] = {
3500                        "name": name,
3501                        "prompt": fallback,
3502                        "type": type,
3503                        "version": version or 0,
3504                        "config": {},
3505                        "labels": [label] if label else [],
3506                        "tags": [],
3507                    }
3508
3509                    if type == "text":
3510                        return TextPromptClient(
3511                            prompt=Prompt_Text(**fallback_client_args),
3512                            is_fallback=True,
3513                        )
3514
3515                    if type == "chat":
3516                        return ChatPromptClient(
3517                            prompt=Prompt_Chat(**fallback_client_args),
3518                            is_fallback=True,
3519                        )
3520
3521                raise e
3522
3523        if cached_prompt.is_expired():
3524            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3525            try:
3526                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3527                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3528
3529                def refresh_task() -> None:
3530                    self._fetch_prompt_and_update_cache(
3531                        name,
3532                        version=version,
3533                        label=label,
3534                        ttl_seconds=cache_ttl_seconds,
3535                        max_retries=bounded_max_retries,
3536                        fetch_timeout_seconds=fetch_timeout_seconds,
3537                    )
3538
3539                self._resources.prompt_cache.add_refresh_prompt_task(
3540                    cache_key,
3541                    refresh_task,
3542                )
3543                langfuse_logger.debug(
3544                    f"Returning stale prompt '{cache_key}' from cache."
3545                )
3546                # return stale prompt
3547                return cached_prompt.value
3548
3549            except Exception as e:
3550                langfuse_logger.warning(
3551                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3552                )
3553                # creation of refresh prompt task failed, return stale prompt
3554                return cached_prompt.value
3555
3556        return cached_prompt.value

Get a prompt.

This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.

Arguments:
  • name (str): The name of the prompt to retrieve.
Keyword Args:
  • version (Optional[int]): The version of the prompt to retrieve. If neither label nor version is specified, the production label is returned. Specify either version or label, not both.
  • label (Optional[str]): The label of the prompt to retrieve. If neither label nor version is specified, the production label is returned. Specify either version or label, not both.
  • cache_ttl_seconds (Optional[int]): Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Set to 0 to disable caching.
  • type (Literal["chat", "text"]): The type of the prompt to retrieve. Defaults to "text".
  • fallback (Union[Optional[List[ChatMessageDict]], Optional[str]]): The prompt to return if fetching the prompt fails. Important on the first call, when no cached prompt is available yet. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
  • max_retries (Optional[int]): The maximum number of retries in case of API/network errors. Defaults to 2; the maximum value is 4. Retries use exponential backoff with a maximum delay of 10 seconds.
  • fetch_timeout_seconds (Optional[int]): The timeout in seconds for fetching the prompt. Defaults to the SDK's default timeout, which is 5 seconds.

Returns:

The prompt object, retrieved from the cache or fetched directly if not cached or expired, of type:

  • TextPromptClient, if type argument is 'text'.
  • ChatPromptClient, if type argument is 'chat'.
Raises:
  • Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
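
Example (a sketch; the prompt name, variable, and TTL are placeholders, and fallback guards the first call when nothing is cached yet):

from langfuse import Langfuse

langfuse = Langfuse()

# Fetch the production-labeled text prompt, caching it for 5 minutes.
prompt = langfuse.get_prompt(
    "movie-critic",
    cache_ttl_seconds=300,
    fallback="Critique the following movie: {{movie}}",
)

# Substitute the {{movie}} variable in the prompt template.
text = prompt.compile(movie="Dune 2")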
def create_prompt( self, *, name: str, prompt: Union[str, List[Union[langfuse.model.ChatMessageDict, langfuse.model.ChatMessageWithPlaceholdersDict_Message, langfuse.model.ChatMessageWithPlaceholdersDict_Placeholder]]], labels: List[str] = [], tags: Optional[List[str]] = None, type: Optional[Literal['chat', 'text']] = 'text', config: Optional[Any] = None, commit_message: Optional[str] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3650    def create_prompt(
3651        self,
3652        *,
3653        name: str,
3654        prompt: Union[
3655            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3656        ],
3657        labels: List[str] = [],
3658        tags: Optional[List[str]] = None,
3659        type: Optional[Literal["chat", "text"]] = "text",
3660        config: Optional[Any] = None,
3661        commit_message: Optional[str] = None,
3662    ) -> PromptClient:
3663        """Create a new prompt in Langfuse.
3664
3665        Keyword Args:
3666            name : The name of the prompt to be created.
3667            prompt : The content of the prompt to be created.
3668            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3669            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3670            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3671            config: Additional structured data to be saved with the prompt. Defaults to None.
3672            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3673            commit_message: Optional string describing the change.
3674
3675        Returns:
3676            TextPromptClient: The prompt if type argument is 'text'.
3677            ChatPromptClient: The prompt if type argument is 'chat'.
3678        """
3679        try:
3680            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3681
3682            if type == "chat":
3683                if not isinstance(prompt, list):
3684                    raise ValueError(
3685                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3686                    )
3687                request: Union[CreatePromptRequest_Chat, CreatePromptRequest_Text] = (
3688                    CreatePromptRequest_Chat(
3689                        name=name,
3690                        prompt=cast(Any, prompt),
3691                        labels=labels,
3692                        tags=tags,
3693                        config=config or {},
3694                        commitMessage=commit_message,
3695                        type="chat",
3696                    )
3697                )
3698                server_prompt = self.api.prompts.create(request=request)
3699
3700                if self._resources is not None:
3701                    self._resources.prompt_cache.invalidate(name)
3702
3703                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3704
3705            if not isinstance(prompt, str):
3706                raise ValueError("For 'text' type, 'prompt' must be a string.")
3707
3708            request = CreatePromptRequest_Text(
3709                name=name,
3710                prompt=prompt,
3711                labels=labels,
3712                tags=tags,
3713                config=config or {},
3714                commitMessage=commit_message,
3715                type="text",
3716            )
3717
3718            server_prompt = self.api.prompts.create(request=request)
3719
3720            if self._resources is not None:
3721                self._resources.prompt_cache.invalidate(name)
3722
3723            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3724
3725        except Error as e:
3726            handle_fern_exception(e)
3727            raise e

Create a new prompt in Langfuse.

Keyword Args:
  • name: The name of the prompt to be created.
  • prompt: The content of the prompt to be created.
  • is_active [DEPRECATED]: A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
  • labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
  • tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
  • config: Additional structured data to be saved with the prompt. Defaults to None.
  • type: The type of the prompt to be created, "chat" vs. "text". Defaults to "text".
  • commit_message: Optional string describing the change.

Returns:

  • TextPromptClient: The prompt if the type argument is 'text'.
  • ChatPromptClient: The prompt if the type argument is 'chat'.
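
Example (a sketch of creating a chat prompt served under the 'production' label; the name, messages, and config are illustrative):

from langfuse import Langfuse

langfuse = Langfuse()

prompt = langfuse.create_prompt(
    name="movie-critic-chat",
    type="chat",
    prompt=[
        {"role": "system", "content": "You are a movie critic."},
        {"role": "user", "content": "Critique {{movie}} in one paragraph."},
    ],
    labels=["production"],  # serve this version by default
    config={"model": "gpt-4o", "temperature": 0.7},
    commit_message="Initial version",
)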

def update_prompt(self, *, name: str, version: int, new_labels: List[str] = []) -> Any:
3729    def update_prompt(
3730        self,
3731        *,
3732        name: str,
3733        version: int,
3734        new_labels: List[str] = [],
3735    ) -> Any:
3736        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
3737
3738        Args:
3739            name (str): The name of the prompt to update.
3740            version (int): The version number of the prompt to update.
3741            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3742
3743        Returns:
3744            Prompt: The updated prompt from the Langfuse API.
3745
3746        """
3747        updated_prompt = self.api.prompt_version.update(
3748            name=self._url_encode(name),
3749            version=version,
3750            new_labels=new_labels,
3751        )
3752
3753        if self._resources is not None:
3754            self._resources.prompt_cache.invalidate(name)
3755
3756        return updated_prompt

Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.

Arguments:
  • name (str): The name of the prompt to update.
  • version (int): The version number of the prompt to update.
  • new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:

Prompt: The updated prompt from the Langfuse API.
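
Example (a minimal sketch; the prompt name and version number are placeholders):

from langfuse import Langfuse

langfuse = Langfuse()

# Promote version 3 of the prompt by assigning it the 'production' label.
updated = langfuse.update_prompt(
    name="movie-critic",
    version=3,
    new_labels=["production"],
)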

def clear_prompt_cache(self) -> None:
3771    def clear_prompt_cache(self) -> None:
3772        """Clear the entire prompt cache, removing all cached prompts.
3773
3774        This method is useful when you want to force a complete refresh of all
3775        cached prompts, for example after major updates or when you need to
3776        ensure the latest versions are fetched from the server.
3777        """
3778        if self._resources is not None:
3779            self._resources.prompt_cache.clear()

Clear the entire prompt cache, removing all cached prompts.

This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.
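
Example (a minimal sketch; useful after prompts have been updated outside the current process):

from langfuse import Langfuse

langfuse = Langfuse()

# Drop all cached prompts so subsequent get_prompt calls fetch fresh versions.
langfuse.clear_prompt_cache()

prompt = langfuse.get_prompt("movie-critic")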

def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 59def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 60    """Get or create a Langfuse client instance.
 61
 62    Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups,
 63    providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
 64
 65    Behavior:
 66    - Single project: Returns existing client or creates new one
 67    - Multi-project: Requires public_key to return specific client
 68    - No public_key in multi-project: Returns disabled client to prevent data leakage
 69
 70    The function uses a singleton pattern per public_key to conserve resources and maintain state.
 71
 72    Args:
 73        public_key (Optional[str]): Project identifier
 74            - With key: Returns client for that project
 75            - Without key: Returns single client or disabled client if multiple exist
 76
 77    Returns:
 78        Langfuse: Client instance in one of three states:
 79            1. Client for specified public_key
 80            2. Default client for single-project setup
 81            3. Disabled client when multiple projects exist without key
 82
 83    Security:
 84        Disables tracing when multiple projects exist without explicit key to prevent
 85        cross-project data leakage. Multi-project setups are experimental.
 86
 87    Example:
 88        ```python
 89        # Single project
 90        client = get_client()  # Default client
 91
 92        # In multi-project usage:
 93        client_a = get_client(public_key="project_a_key")  # Returns project A's client
 94        client_b = get_client(public_key="project_b_key")  # Returns project B's client
 95
 96        # Without specific key in multi-project setup:
 97        client = get_client()  # Returns disabled client for safety
 98        ```
 99    """
100    with LangfuseResourceManager._lock:
101        active_instances = LangfuseResourceManager._instances
102
103        # If no explicit public_key provided, check execution context
104        if not public_key:
105            public_key = _current_public_key.get(None)
106
107        if not public_key:
108            if len(active_instances) == 0:
109                # No clients initialized yet, create default instance
110                return Langfuse()
111
112            if len(active_instances) == 1:
113                # Only one client exists, safe to use without specifying key
114                instance = list(active_instances.values())[0]
115
116                # Initialize with the credentials bound to the instance
117                # This is important if the original instance was instantiated
118                # via constructor arguments
119                return _create_client_from_instance(instance)
120
121            else:
122                # Multiple clients exist but no key specified - disable tracing
123                # to prevent cross-project data leakage
124                langfuse_logger.warning(
125                    "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
126                )
127                return Langfuse(
128                    tracing_enabled=False, public_key="fake", secret_key="fake"
129                )
130
131        else:
132            # Specific key provided, look up existing instance
133            target_instance: Optional[LangfuseResourceManager] = active_instances.get(
134                public_key, None
135            )
136
137            if target_instance is None:
138                # No instance found with this key - client not initialized properly
139                langfuse_logger.warning(
140                    f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function."
141                )
142                return Langfuse(
143                    tracing_enabled=False, public_key="fake", secret_key="fake"
144                )
145
146            # target_instance is guaranteed to be not None at this point
147            return _create_client_from_instance(target_instance, public_key)

Get or create a Langfuse client instance.

Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.

Behavior:

  • Single project: Returns existing client or creates new one
  • Multi-project: Requires public_key to return specific client
  • No public_key in multi-project: Returns disabled client to prevent data leakage

The function uses a singleton pattern per public_key to conserve resources and maintain state.

Arguments:
  • public_key (Optional[str]): Project identifier
    • With key: Returns client for that project
    • Without key: Returns single client or disabled client if multiple exist
Returns:

Langfuse: Client instance in one of three states:
  1. Client for the specified public_key
  2. Default client for a single-project setup
  3. Disabled client when multiple projects exist without a key

Security:

Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.

Example:
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
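
In a multi-project process, the same key-based resolution applies to the decorator path: passing langfuse_public_key at the call site lets get_client() inside the decorated function resolve the right client via the execution context shown above. A hedged sketch; credentials and names below are placeholders:

```python
from langfuse import Langfuse, observe

# Both project clients are initialized up front so get_client() can
# resolve them by public key. Credentials here are placeholders.
Langfuse(public_key="project_a_key", secret_key="project_a_secret")
Langfuse(public_key="project_b_key", secret_key="project_b_secret")

@observe()
def handle_request(query: str) -> str:
    return f"answered: {query}"

# Route this call's tracing to project A. Without the keyword, tracing is
# skipped because more than one client exists in the process (see above).
handle_request("what is langfuse?", langfuse_public_key="project_a_key")
```
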
def observe( func: Optional[~F] = None, *, name: Optional[str] = None, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], NoneType] = None, capture_input: Optional[bool] = None, capture_output: Optional[bool] = None, transform_to_string: Optional[Callable[[Iterable], str]] = None) -> Union[~F, Callable[[~F], ~F]]:
 90    def observe(
 91        self,
 92        func: Optional[F] = None,
 93        *,
 94        name: Optional[str] = None,
 95        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
 96        capture_input: Optional[bool] = None,
 97        capture_output: Optional[bool] = None,
 98        transform_to_string: Optional[Callable[[Iterable], str]] = None,
 99    ) -> Union[F, Callable[[F], F]]:
100        """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
101
102        This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates
103        spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator
104        intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
105
106        Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application,
107        enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
108
109        Args:
110            func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
111            name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
112            as_type (Optional[Literal]): Set the observation type. Supported values:
113                    "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail".
114                    Observation types are highlighted in the Langfuse UI for filtering and visualization.
115                    The types "generation" and "embedding" create a span on which additional attributes such as model metrics
116                    can be set.
117
118        Returns:
119            Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
120
121        Example:
122            For general function tracing with automatic naming:
123            ```python
124            @observe()
125            def process_user_request(user_id, query):
126                # Function is automatically traced with name "process_user_request"
127                return get_response(query)
128            ```
129
130            For language model generation tracking:
131            ```python
132            @observe(name="answer-generation", as_type="generation")
133            async def generate_answer(query):
134                # Creates a generation-type span with extended LLM metrics
135                response = await openai.chat.completions.create(
136                    model="gpt-4",
137                    messages=[{"role": "user", "content": query}]
138                )
139                return response.choices[0].message.content
140            ```
141
142            For trace context propagation between functions:
143            ```python
144            @observe()
145            def main_process():
146                # Parent span is created
147                return sub_process()  # Child span automatically connected to parent
148
149            @observe()
150            def sub_process():
151                # Automatically becomes a child span of main_process
152                return "result"
153            ```
154
155        Raises:
156            Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
157
158        Notes:
159            - The decorator preserves the original function's signature, docstring, and return type.
160            - Proper parent-child relationships between spans are automatically maintained.
161            - Special keyword arguments can be passed to control tracing:
162              - langfuse_trace_id: Explicitly set the trace ID for this function call
163              - langfuse_parent_observation_id: Explicitly set the parent span ID
164              - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
165            - For async functions, the decorator returns an async function wrapper.
166            - For sync functions, the decorator returns a synchronous wrapper.
167        """
168        valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
169        if as_type is not None and as_type not in valid_types:
170            self._log.warning(
171                f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'."
172            )
173            as_type = "span"
174
175        function_io_capture_enabled = os.environ.get(
176            LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True"
177        ).lower() not in ("false", "0")
178
179        should_capture_input = (
180            capture_input if capture_input is not None else function_io_capture_enabled
181        )
182
183        should_capture_output = (
184            capture_output
185            if capture_output is not None
186            else function_io_capture_enabled
187        )
188
189        def decorator(func: F) -> F:
190            return (
191                self._async_observe(
192                    func,
193                    name=name,
194                    as_type=as_type,
195                    capture_input=should_capture_input,
196                    capture_output=should_capture_output,
197                    transform_to_string=transform_to_string,
198                )
199                if asyncio.iscoroutinefunction(func)
200                else self._sync_observe(
201                    func,
202                    name=name,
203                    as_type=as_type,
204                    capture_input=should_capture_input,
205                    capture_output=should_capture_output,
206                    transform_to_string=transform_to_string,
207                )
208            )
209
210        """Handle decorator with or without parentheses.
211
212        This logic enables the decorator to work both with and without parentheses:
213        - @observe - Python passes the function directly to the decorator
214        - @observe() - Python calls the decorator first, which must return a function decorator
215
216        When called without arguments (@observe), the func parameter contains the function to decorate,
217        so we directly apply the decorator to it. When called with parentheses (@observe()),
218        func is None, so we return the decorator function itself for Python to apply in the next step.
219        """
220        if func is None:
221            return decorator
222        else:
223            return decorator(func)

Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.

This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.

Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.

Arguments:
  • func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
  • name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
  • as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
  • capture_input (Optional[bool]): Whether to record the function's arguments as the observation input. Defaults to the LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED environment variable (capture is enabled unless it is set to "false" or "0").
  • capture_output (Optional[bool]): Whether to record the function's return value as the observation output. Defaults to the same environment variable as capture_input.
  • transform_to_string (Optional[Callable[[Iterable], str]]): Callable applied to the collected chunks of a generator or streamed return value to produce the string stored as the observation output.
Returns:

Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.

Example:

For general function tracing with automatic naming:

@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)

For language model generation tracking:

@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content

For trace context propagation between functions:

@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
Raises:
  • Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
  • The decorator preserves the original function's signature, docstring, and return type.
  • Proper parent-child relationships between spans are automatically maintained.
  • Special keyword arguments can be passed to control tracing:
    • langfuse_trace_id: Explicitly set the trace ID for this function call
    • langfuse_parent_observation_id: Explicitly set the parent span ID
    • langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
  • For async functions, the decorator returns an async function wrapper.
  • For sync functions, the decorator returns a synchronous wrapper.
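
A brief sketch of the special keyword arguments listed in the notes above; the decorator reads them to place the call inside an existing trace, and the ID values are placeholders for IDs obtained elsewhere (e.g., from an upstream service):

```python
from langfuse import observe

@observe(as_type="retriever")
def fetch_documents(query: str):
    return ["doc-1", "doc-2"]

# Attach this call to an existing trace and parent observation.
# Both IDs below are placeholders in W3C trace-context format.
fetch_documents(
    "vector database",
    langfuse_trace_id="abcdef1234567890abcdef1234567890",
    langfuse_parent_observation_id="1234567890abcdef",
)
```
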
def propagate_attributes( *, user_id: Optional[str] = None, session_id: Optional[str] = None, metadata: Optional[Dict[str, str]] = None, version: Optional[str] = None, tags: Optional[List[str]] = None, as_baggage: bool = False) -> opentelemetry.util._decorator._AgnosticContextManager[typing.Any]:
 74def propagate_attributes(
 75    *,
 76    user_id: Optional[str] = None,
 77    session_id: Optional[str] = None,
 78    metadata: Optional[Dict[str, str]] = None,
 79    version: Optional[str] = None,
 80    tags: Optional[List[str]] = None,
 81    as_baggage: bool = False,
 82) -> _AgnosticContextManager[Any]:
 83    """Propagate trace-level attributes to all spans created within this context.
 84
 85    This context manager sets attributes on the currently active span AND automatically
 86    propagates them to all new child spans created within the context. This is the
 87    recommended way to set trace-level attributes like user_id, session_id, and metadata
 88    dimensions that should be consistently applied across all observations in a trace.
 89
 90    **IMPORTANT**: Call this as early as possible within your trace/workflow. Only the
 91    currently active span and spans created after entering this context will have these
 92    attributes. Pre-existing spans will NOT be retroactively updated.
 93
 94    **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id,
 95    filtering by session_id) only include observations that have the attribute set.
 96    If you call `propagate_attributes` late in your workflow, earlier spans won't be
 97    included in aggregations for that attribute.
 98
 99    Args:
100        user_id: User identifier to associate with all spans in this context.
101            Must be US-ASCII string, ≤200 characters. Use this to track which user
102            generated each trace and enable e.g. per-user cost/performance analysis.
103        session_id: Session identifier to associate with all spans in this context.
104            Must be US-ASCII string, ≤200 characters. Use this to group related traces
105            within a user session (e.g., a conversation thread, multi-turn interaction).
106        metadata: Additional key-value metadata to propagate to all spans.
107            - Keys and values must be US-ASCII strings
108            - All values must be ≤200 characters
109            - Use for dimensions like internal correlation identifiers
110            - AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
111        version: Version identifier for parts of your application that are independently versioned, e.g. agents
112        tags: List of tags to categorize the group of observations
113        as_baggage: If True, propagates attributes using OpenTelemetry baggage for
114            cross-process/service propagation. **Security warning**: When enabled,
115            attribute values are added to HTTP headers on ALL outbound requests.
116            Only enable if values are safe to transmit via HTTP headers and you need
117            cross-service tracing. Default: False.
118
119    Returns:
120        Context manager that propagates attributes to all child spans.
121
122    Example:
123        Basic usage with user and session tracking:
124
125        ```python
126        from langfuse import Langfuse
127
128        langfuse = Langfuse()
129
130        # Set attributes early in the trace
131        with langfuse.start_as_current_span(name="user_workflow") as span:
132            with langfuse.propagate_attributes(
133                user_id="user_123",
134                session_id="session_abc",
135                metadata={"experiment": "variant_a", "environment": "production"}
136            ):
137                # All spans created here will have user_id, session_id, and metadata
138                with langfuse.start_span(name="llm_call") as llm_span:
139                    # This span inherits: user_id, session_id, experiment, environment
140                    ...
141
142                with langfuse.start_generation(name="completion") as gen:
143                    # This span also inherits all attributes
144                    ...
145        ```
146
147        Late propagation (anti-pattern):
148
149        ```python
150        with langfuse.start_as_current_span(name="workflow") as span:
151            # These spans WON'T have user_id
152            early_span = langfuse.start_span(name="early_work")
153            early_span.end()
154
155            # Set attributes in the middle
156            with langfuse.propagate_attributes(user_id="user_123"):
157                # Only spans created AFTER this point will have user_id
158                late_span = langfuse.start_span(name="late_work")
159                late_span.end()
160
161            # Result: Aggregations by user_id will miss "early_work" span
162        ```
163
164        Cross-service propagation with baggage (advanced):
165
166        ```python
167        # Service A - originating service
168        with langfuse.start_as_current_span(name="api_request"):
169            with langfuse.propagate_attributes(
170                user_id="user_123",
171                session_id="session_abc",
172                as_baggage=True  # Propagate via HTTP headers
173            ):
174                # Make HTTP request to Service B
175                response = requests.get("https://service-b.example.com/api")
176                # user_id and session_id are now in HTTP headers
177
178        # Service B - downstream service
179        # OpenTelemetry will automatically extract baggage from HTTP headers
180        # and propagate to spans in Service B
181        ```
182
183    Note:
184        - **Validation**: All attribute values (user_id, session_id, metadata values)
185          must be strings ≤200 characters. Invalid values will be dropped with a
186          warning logged. Ensure values meet constraints before calling.
187        - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood,
188          making it compatible with other OTel-instrumented libraries.
189
190    Raises:
191        No exceptions are raised. Invalid values are logged as warnings and dropped.
192    """
193    return _propagate_attributes(
194        user_id=user_id,
195        session_id=session_id,
196        metadata=metadata,
197        version=version,
198        tags=tags,
199        as_baggage=as_baggage,
200    )

Propagate trace-level attributes to all spans created within this context.

This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, and metadata dimensions that should be consistently applied across all observations in a trace.

IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.

Why this matters: Langfuse aggregation queries (e.g., total cost by user_id, filtering by session_id) only include observations that have the attribute set. If you call propagate_attributes late in your workflow, earlier spans won't be included in aggregations for that attribute.

Arguments:
  • user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
  • session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
  • metadata: Additional key-value metadata to propagate to all spans.
    • Keys and values must be US-ASCII strings
    • All values must be ≤200 characters
    • Use for dimensions like internal correlation identifiers
    • AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
  • version: Version identifier for parts of your application that are independently versioned, e.g. agents
  • tags: List of tags to categorize the group of observations
  • as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:

Context manager that propagates attributes to all child spans.

Example:

Basic usage with user and session tracking:

from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_span(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        metadata={"experiment": "variant_a", "environment": "production"}
    ):
        # All spans created here will have user_id, session_id, and metadata
        with langfuse.start_span(name="llm_call") as llm_span:
            # This span inherits: user_id, session_id, experiment, environment
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...

Late propagation (anti-pattern):

with langfuse.start_as_current_span(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_span(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_span(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span

Cross-service propagation with baggage (advanced):

# Service A - originating service
with langfuse.start_as_current_span(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id and session_id are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate to spans in Service B
Note:
  • Validation: All attribute values (user_id, session_id, metadata values) must be strings ≤200 characters. Invalid values will be dropped with a warning logged. Ensure values meet constraints before calling.
  • OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
  • No exceptions are raised. Invalid values are logged as warnings and dropped.
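
Because propagate_attributes is also exported at module level, it composes directly with the @observe decorator; a minimal sketch (function and identifier names are illustrative):

```python
from langfuse import observe, propagate_attributes

def run_pipeline(question: str) -> str:
    # Child observations created here inherit user_id and session_id.
    return "..."

@observe(name="chat-turn")
def answer(user_id: str, session_id: str, question: str) -> str:
    # The decorator's span is the currently active span here, so the
    # attributes apply to it and to every observation created below.
    with propagate_attributes(user_id=user_id, session_id=session_id):
        return run_pipeline(question)
```
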
ObservationTypeLiteral = typing.Union[typing.Literal['generation', 'embedding'], typing.Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], typing.Literal['event']]
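
Since the alias is a Union of Literal members, its accepted strings can be flattened with typing.get_args for runtime validation in your own code; a minimal sketch:

```python
from typing import get_args

from langfuse import ObservationTypeLiteral

# Flatten the nested Literal members into the plain set of accepted strings.
VALID_OBSERVATION_TYPES = {
    value for literal in get_args(ObservationTypeLiteral) for value in get_args(literal)
}

assert {"span", "generation", "event"} <= VALID_OBSERVATION_TYPES
```
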
class LangfuseSpan(langfuse._client.span.LangfuseObservationWrapper):
1146class LangfuseSpan(LangfuseObservationWrapper):
1147    """Standard span implementation for general operations in Langfuse.
1148
1149    This class represents a general-purpose span that can be used to trace
1150    any operation in your application. It extends the base LangfuseObservationWrapper
1151    with specific methods for creating child spans, generations, and updating
1152    span-specific attributes. If possible, use a more specific type for
1153    better observability and insights.
1154    """
1155
1156    def __init__(
1157        self,
1158        *,
1159        otel_span: otel_trace_api.Span,
1160        langfuse_client: "Langfuse",
1161        input: Optional[Any] = None,
1162        output: Optional[Any] = None,
1163        metadata: Optional[Any] = None,
1164        environment: Optional[str] = None,
1165        version: Optional[str] = None,
1166        level: Optional[SpanLevel] = None,
1167        status_message: Optional[str] = None,
1168    ):
1169        """Initialize a new LangfuseSpan.
1170
1171        Args:
1172            otel_span: The OpenTelemetry span to wrap
1173            langfuse_client: Reference to the parent Langfuse client
1174            input: Input data for the span (any JSON-serializable object)
1175            output: Output data from the span (any JSON-serializable object)
1176            metadata: Additional metadata to associate with the span
1177            environment: The tracing environment
1178            version: Version identifier for the code or component
1179            level: Importance level of the span (info, warning, error)
1180            status_message: Optional status message for the span
1181        """
1182        super().__init__(
1183            otel_span=otel_span,
1184            as_type="span",
1185            langfuse_client=langfuse_client,
1186            input=input,
1187            output=output,
1188            metadata=metadata,
1189            environment=environment,
1190            version=version,
1191            level=level,
1192            status_message=status_message,
1193        )
1194
1195    def start_span(
1196        self,
1197        name: str,
1198        input: Optional[Any] = None,
1199        output: Optional[Any] = None,
1200        metadata: Optional[Any] = None,
1201        version: Optional[str] = None,
1202        level: Optional[SpanLevel] = None,
1203        status_message: Optional[str] = None,
1204    ) -> "LangfuseSpan":
1205        """Create a new child span.
1206
1207        This method creates a new child span with this span as the parent.
1208        Unlike start_as_current_span(), this method does not set the new span
1209        as the current span in the context.
1210
1211        Args:
1212            name: Name of the span (e.g., function or operation name)
1213            input: Input data for the operation
1214            output: Output data from the operation
1215            metadata: Additional metadata to associate with the span
1216            version: Version identifier for the code or component
1217            level: Importance level of the span (info, warning, error)
1218            status_message: Optional status message for the span
1219
1220        Returns:
1221            A new LangfuseSpan that must be ended with .end() when complete
1222
1223        Example:
1224            ```python
1225            parent_span = langfuse.start_span(name="process-request")
1226            try:
1227                # Create a child span
1228                child_span = parent_span.start_span(name="validate-input")
1229                try:
1230                    # Do validation work
1231                    validation_result = validate(request_data)
1232                    child_span.update(output=validation_result)
1233                finally:
1234                    child_span.end()
1235
1236                # Continue with parent span
1237                result = process_validated_data(validation_result)
1238                parent_span.update(output=result)
1239            finally:
1240                parent_span.end()
1241            ```
1242        """
1243        return self.start_observation(
1244            name=name,
1245            as_type="span",
1246            input=input,
1247            output=output,
1248            metadata=metadata,
1249            version=version,
1250            level=level,
1251            status_message=status_message,
1252        )
1253
1254    def start_as_current_span(
1255        self,
1256        *,
1257        name: str,
1258        input: Optional[Any] = None,
1259        output: Optional[Any] = None,
1260        metadata: Optional[Any] = None,
1261        version: Optional[str] = None,
1262        level: Optional[SpanLevel] = None,
1263        status_message: Optional[str] = None,
1264    ) -> _AgnosticContextManager["LangfuseSpan"]:
1265        """[DEPRECATED] Create a new child span and set it as the current span in a context manager.
1266
1267        DEPRECATED: This method is deprecated and will be removed in a future version.
1268        Use start_as_current_observation(as_type='span') instead.
1269
1270        This method creates a new child span and sets it as the current span within
1271        a context manager. It should be used with a 'with' statement to automatically
1272        manage the span's lifecycle.
1273
1274        Args:
1275            name: Name of the span (e.g., function or operation name)
1276            input: Input data for the operation
1277            output: Output data from the operation
1278            metadata: Additional metadata to associate with the span
1279            version: Version identifier for the code or component
1280            level: Importance level of the span (info, warning, error)
1281            status_message: Optional status message for the span
1282
1283        Returns:
1284            A context manager that yields a new LangfuseSpan
1285
1286        Example:
1287            ```python
1288            with langfuse.start_as_current_span(name="process-request") as parent_span:
1289                # Parent span is active here
1290
1291                # Create a child span with context management
1292                with parent_span.start_as_current_span(name="validate-input") as child_span:
1293                    # Child span is active here
1294                    validation_result = validate(request_data)
1295                    child_span.update(output=validation_result)
1296
1297                # Back to parent span context
1298                result = process_validated_data(validation_result)
1299                parent_span.update(output=result)
1300            ```
1301        """
1302        warnings.warn(
1303            "start_as_current_span is deprecated and will be removed in a future version. "
1304            "Use start_as_current_observation(as_type='span') instead.",
1305            DeprecationWarning,
1306            stacklevel=2,
1307        )
1308        return self.start_as_current_observation(
1309            name=name,
1310            as_type="span",
1311            input=input,
1312            output=output,
1313            metadata=metadata,
1314            version=version,
1315            level=level,
1316            status_message=status_message,
1317        )
1318
1319    def start_generation(
1320        self,
1321        *,
1322        name: str,
1323        input: Optional[Any] = None,
1324        output: Optional[Any] = None,
1325        metadata: Optional[Any] = None,
1326        version: Optional[str] = None,
1327        level: Optional[SpanLevel] = None,
1328        status_message: Optional[str] = None,
1329        completion_start_time: Optional[datetime] = None,
1330        model: Optional[str] = None,
1331        model_parameters: Optional[Dict[str, MapValue]] = None,
1332        usage_details: Optional[Dict[str, int]] = None,
1333        cost_details: Optional[Dict[str, float]] = None,
1334        prompt: Optional[PromptClient] = None,
1335    ) -> "LangfuseGeneration":
1336        """[DEPRECATED] Create a new child generation span.
1337
1338        DEPRECATED: This method is deprecated and will be removed in a future version.
1339        Use start_observation(as_type='generation') instead.
1340
1341        This method creates a new child generation span with this span as the parent.
1342        Generation spans are specialized for AI/LLM operations and include additional
1343        fields for model information, usage stats, and costs.
1344
1345        Unlike start_as_current_generation(), this method does not set the new span
1346        as the current span in the context.
1347
1348        Args:
1349            name: Name of the generation operation
1350            input: Input data for the model (e.g., prompts)
1351            output: Output from the model (e.g., completions)
1352            metadata: Additional metadata to associate with the generation
1353            version: Version identifier for the model or component
1354            level: Importance level of the generation (info, warning, error)
1355            status_message: Optional status message for the generation
1356            completion_start_time: When the model started generating the response
1357            model: Name/identifier of the AI model used (e.g., "gpt-4")
1358            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1359            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1360            cost_details: Cost information for the model call
1361            prompt: Associated prompt template from Langfuse prompt management
1362
1363        Returns:
1364            A new LangfuseGeneration that must be ended with .end() when complete
1365
1366        Example:
1367            ```python
1368            span = langfuse.start_span(name="process-query")
1369            try:
1370                # Create a generation child span
1371                generation = span.start_generation(
1372                    name="generate-answer",
1373                    model="gpt-4",
1374                    input={"prompt": "Explain quantum computing"}
1375                )
1376                try:
1377                    # Call model API
1378                    response = llm.generate(...)
1379
1380                    generation.update(
1381                        output=response.text,
1382                        usage_details={
1383                            "prompt_tokens": response.usage.prompt_tokens,
1384                            "completion_tokens": response.usage.completion_tokens
1385                        }
1386                    )
1387                finally:
1388                    generation.end()
1389
1390                # Continue with parent span
1391                span.update(output={"answer": response.text, "source": "gpt-4"})
1392            finally:
1393                span.end()
1394            ```
1395        """
1396        warnings.warn(
1397            "start_generation is deprecated and will be removed in a future version. "
1398            "Use start_observation(as_type='generation') instead.",
1399            DeprecationWarning,
1400            stacklevel=2,
1401        )
1402        return self.start_observation(
1403            name=name,
1404            as_type="generation",
1405            input=input,
1406            output=output,
1407            metadata=metadata,
1408            version=version,
1409            level=level,
1410            status_message=status_message,
1411            completion_start_time=completion_start_time,
1412            model=model,
1413            model_parameters=model_parameters,
1414            usage_details=usage_details,
1415            cost_details=cost_details,
1416            prompt=prompt,
1417        )
1418
1419    def start_as_current_generation(
1420        self,
1421        *,
1422        name: str,
1423        input: Optional[Any] = None,
1424        output: Optional[Any] = None,
1425        metadata: Optional[Any] = None,
1426        version: Optional[str] = None,
1427        level: Optional[SpanLevel] = None,
1428        status_message: Optional[str] = None,
1429        completion_start_time: Optional[datetime] = None,
1430        model: Optional[str] = None,
1431        model_parameters: Optional[Dict[str, MapValue]] = None,
1432        usage_details: Optional[Dict[str, int]] = None,
1433        cost_details: Optional[Dict[str, float]] = None,
1434        prompt: Optional[PromptClient] = None,
1435    ) -> _AgnosticContextManager["LangfuseGeneration"]:
1436        """[DEPRECATED] Create a new child generation span and set it as the current span in a context manager.
1437
1438        DEPRECATED: This method is deprecated and will be removed in a future version.
1439        Use start_as_current_observation(as_type='generation') instead.
1440
1441        This method creates a new child generation span and sets it as the current span
1442        within a context manager. Generation spans are specialized for AI/LLM operations
1443        and include additional fields for model information, usage stats, and costs.
1444
1445        Args:
1446            name: Name of the generation operation
1447            input: Input data for the model (e.g., prompts)
1448            output: Output from the model (e.g., completions)
1449            metadata: Additional metadata to associate with the generation
1450            version: Version identifier for the model or component
1451            level: Importance level of the generation (info, warning, error)
1452            status_message: Optional status message for the generation
1453            completion_start_time: When the model started generating the response
1454            model: Name/identifier of the AI model used (e.g., "gpt-4")
1455            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1456            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1457            cost_details: Cost information for the model call
1458            prompt: Associated prompt template from Langfuse prompt management
1459
1460        Returns:
1461            A context manager that yields a new LangfuseGeneration
1462
1463        Example:
1464            ```python
1465            with langfuse.start_as_current_span(name="process-request") as span:
1466                # Prepare data
1467                query = preprocess_user_query(user_input)
1468
1469                # Create a generation span with context management
1470                with span.start_as_current_generation(
1471                    name="generate-answer",
1472                    model="gpt-4",
1473                    input={"query": query}
1474                ) as generation:
1475                    # Generation span is active here
1476                    response = llm.generate(query)
1477
1478                    # Update with results
1479                    generation.update(
1480                        output=response.text,
1481                        usage_details={
1482                            "prompt_tokens": response.usage.prompt_tokens,
1483                            "completion_tokens": response.usage.completion_tokens
1484                        }
1485                    )
1486
1487                # Back to parent span context
1488                span.update(output={"answer": response.text, "source": "gpt-4"})
1489            ```
1490        """
1491        warnings.warn(
1492            "start_as_current_generation is deprecated and will be removed in a future version. "
1493            "Use start_as_current_observation(as_type='generation') instead.",
1494            DeprecationWarning,
1495            stacklevel=2,
1496        )
1497        return self.start_as_current_observation(
1498            name=name,
1499            as_type="generation",
1500            input=input,
1501            output=output,
1502            metadata=metadata,
1503            version=version,
1504            level=level,
1505            status_message=status_message,
1506            completion_start_time=completion_start_time,
1507            model=model,
1508            model_parameters=model_parameters,
1509            usage_details=usage_details,
1510            cost_details=cost_details,
1511            prompt=prompt,
1512        )
1513
1514    def create_event(
1515        self,
1516        *,
1517        name: str,
1518        input: Optional[Any] = None,
1519        output: Optional[Any] = None,
1520        metadata: Optional[Any] = None,
1521        version: Optional[str] = None,
1522        level: Optional[SpanLevel] = None,
1523        status_message: Optional[str] = None,
1524    ) -> "LangfuseEvent":
1525        """Create a new Langfuse observation of type 'EVENT'.
1526
1527        Args:
1528            name: Name of the span (e.g., function or operation name)
1529            input: Input data for the operation (can be any JSON-serializable object)
1530            output: Output data from the operation (can be any JSON-serializable object)
1531            metadata: Additional metadata to associate with the span
1532            version: Version identifier for the code or component
1533            level: Importance level of the span (info, warning, error)
1534            status_message: Optional status message for the span
1535
1536        Returns:
1537            The LangfuseEvent object
1538
1539        Example:
1540            ```python
1541            event = span.create_event(name="process-event")
1542            ```
1543        """
1544        timestamp = time_ns()
1545
1546        with otel_trace_api.use_span(self._otel_span):
1547            new_otel_span = self._langfuse_client._otel_tracer.start_span(
1548                name=name, start_time=timestamp
1549            )
1550
1551        return cast(
1552            "LangfuseEvent",
1553            LangfuseEvent(
1554                otel_span=new_otel_span,
1555                langfuse_client=self._langfuse_client,
1556                input=input,
1557                output=output,
1558                metadata=metadata,
1559                environment=self._environment,
1560                version=version,
1561                level=level,
1562                status_message=status_message,
1563            ).end(end_time=timestamp),
1564        )

Standard span implementation for general operations in Langfuse.

This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.
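
Following the recommendation above to prefer specific observation types, a short sketch that creates typed children from a plain span via start_observation; the workload and names are illustrative:

```python
from langfuse import get_client

langfuse = get_client()

with langfuse.start_as_current_span(name="rag-query") as span:
    # A retriever-typed child for the vector search step.
    retriever = span.start_observation(name="vector-search", as_type="retriever")
    try:
        retriever.update(output={"hits": 3})
    finally:
        retriever.end()

    # A generation-typed child for the model call; model metrics can be set here.
    with span.start_as_current_observation(name="answer", as_type="generation") as generation:
        generation.update(model="gpt-4", output="...")
```
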

LangfuseSpan( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1156    def __init__(
1157        self,
1158        *,
1159        otel_span: otel_trace_api.Span,
1160        langfuse_client: "Langfuse",
1161        input: Optional[Any] = None,
1162        output: Optional[Any] = None,
1163        metadata: Optional[Any] = None,
1164        environment: Optional[str] = None,
1165        version: Optional[str] = None,
1166        level: Optional[SpanLevel] = None,
1167        status_message: Optional[str] = None,
1168    ):
1169        """Initialize a new LangfuseSpan.
1170
1171        Args:
1172            otel_span: The OpenTelemetry span to wrap
1173            langfuse_client: Reference to the parent Langfuse client
1174            input: Input data for the span (any JSON-serializable object)
1175            output: Output data from the span (any JSON-serializable object)
1176            metadata: Additional metadata to associate with the span
1177            environment: The tracing environment
1178            version: Version identifier for the code or component
1179            level: Importance level of the span (info, warning, error)
1180            status_message: Optional status message for the span
1181        """
1182        super().__init__(
1183            otel_span=otel_span,
1184            as_type="span",
1185            langfuse_client=langfuse_client,
1186            input=input,
1187            output=output,
1188            metadata=metadata,
1189            environment=environment,
1190            version=version,
1191            level=level,
1192            status_message=status_message,
1193        )

Initialize a new LangfuseSpan.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the span (any JSON-serializable object)
  • output: Output data from the span (any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • environment: The tracing environment
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
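
LangfuseSpan instances are normally obtained from the client rather than constructed directly; a minimal sketch:

```python
from langfuse import get_client

langfuse = get_client()

# start_span returns a LangfuseSpan that must be ended explicitly.
span = langfuse.start_span(name="process-request", input={"query": "..."})
try:
    ...  # do the work this span covers
finally:
    span.end()
```
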
def start_span( self, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseSpan:
1195    def start_span(
1196        self,
1197        name: str,
1198        input: Optional[Any] = None,
1199        output: Optional[Any] = None,
1200        metadata: Optional[Any] = None,
1201        version: Optional[str] = None,
1202        level: Optional[SpanLevel] = None,
1203        status_message: Optional[str] = None,
1204    ) -> "LangfuseSpan":
1205        """Create a new child span.
1206
1207        This method creates a new child span with this span as the parent.
1208        Unlike start_as_current_span(), this method does not set the new span
1209        as the current span in the context.
1210
1211        Args:
1212            name: Name of the span (e.g., function or operation name)
1213            input: Input data for the operation
1214            output: Output data from the operation
1215            metadata: Additional metadata to associate with the span
1216            version: Version identifier for the code or component
1217            level: Importance level of the span (info, warning, error)
1218            status_message: Optional status message for the span
1219
1220        Returns:
1221            A new LangfuseSpan that must be ended with .end() when complete
1222
1223        Example:
1224            ```python
1225            parent_span = langfuse.start_span(name="process-request")
1226            try:
1227                # Create a child span
1228                child_span = parent_span.start_span(name="validate-input")
1229                try:
1230                    # Do validation work
1231                    validation_result = validate(request_data)
1232                    child_span.update(output=validation_result)
1233                finally:
1234                    child_span.end()
1235
1236                # Continue with parent span
1237                result = process_validated_data(validation_result)
1238                parent_span.update(output=result)
1239            finally:
1240                parent_span.end()
1241            ```
1242        """
1243        return self.start_observation(
1244            name=name,
1245            as_type="span",
1246            input=input,
1247            output=output,
1248            metadata=metadata,
1249            version=version,
1250            level=level,
1251            status_message=status_message,
1252        )

Create a new child span.

This method creates a new child span with this span as the parent. Unlike start_as_current_span(), this method does not set the new span as the current span in the context.

Arguments:
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
Returns:

A new LangfuseSpan that must be ended with .end() when complete

Example:
parent_span = langfuse.start_span(name="process-request")
try:
    # Create a child span
    child_span = parent_span.start_span(name="validate-input")
    try:
        # Do validation work
        validation_result = validate(request_data)
        child_span.update(output=validation_result)
    finally:
        child_span.end()

    # Continue with parent span
    result = process_validated_data(validation_result)
    parent_span.update(output=result)
finally:
    parent_span.end()
def start_as_current_span( self, *, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan]:
1254    def start_as_current_span(
1255        self,
1256        *,
1257        name: str,
1258        input: Optional[Any] = None,
1259        output: Optional[Any] = None,
1260        metadata: Optional[Any] = None,
1261        version: Optional[str] = None,
1262        level: Optional[SpanLevel] = None,
1263        status_message: Optional[str] = None,
1264    ) -> _AgnosticContextManager["LangfuseSpan"]:
1265        """[DEPRECATED] Create a new child span and set it as the current span in a context manager.
1266
1267        DEPRECATED: This method is deprecated and will be removed in a future version.
1268        Use start_as_current_observation(as_type='span') instead.
1269
1270        This method creates a new child span and sets it as the current span within
1271        a context manager. It should be used with a 'with' statement to automatically
1272        manage the span's lifecycle.
1273
1274        Args:
1275            name: Name of the span (e.g., function or operation name)
1276            input: Input data for the operation
1277            output: Output data from the operation
1278            metadata: Additional metadata to associate with the span
1279            version: Version identifier for the code or component
1280            level: Importance level of the span (info, warning, error)
1281            status_message: Optional status message for the span
1282
1283        Returns:
1284            A context manager that yields a new LangfuseSpan
1285
1286        Example:
1287            ```python
1288            with langfuse.start_as_current_span(name="process-request") as parent_span:
1289                # Parent span is active here
1290
1291                # Create a child span with context management
1292                with parent_span.start_as_current_span(name="validate-input") as child_span:
1293                    # Child span is active here
1294                    validation_result = validate(request_data)
1295                    child_span.update(output=validation_result)
1296
1297                # Back to parent span context
1298                result = process_validated_data(validation_result)
1299                parent_span.update(output=result)
1300            ```
1301        """
1302        warnings.warn(
1303            "start_as_current_span is deprecated and will be removed in a future version. "
1304            "Use start_as_current_observation(as_type='span') instead.",
1305            DeprecationWarning,
1306            stacklevel=2,
1307        )
1308        return self.start_as_current_observation(
1309            name=name,
1310            as_type="span",
1311            input=input,
1312            output=output,
1313            metadata=metadata,
1314            version=version,
1315            level=level,
1316            status_message=status_message,
1317        )

[DEPRECATED] Create a new child span and set it as the current span in a context manager.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_as_current_observation(as_type='span') instead.

This method creates a new child span and sets it as the current span within a context manager. It should be used with a 'with' statement to automatically manage the span's lifecycle.

Arguments:
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
Returns:

A context manager that yields a new LangfuseSpan

Example:
with langfuse.start_as_current_span(name="process-request") as parent_span:
    # Parent span is active here

    # Create a child span with context management
    with parent_span.start_as_current_span(name="validate-input") as child_span:
        # Child span is active here
        validation_result = validate(request_data)
        child_span.update(output=validation_result)

    # Back to parent span context
    result = process_validated_data(validation_result)
    parent_span.update(output=result)
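
For migration, the deprecation notice maps one-to-one onto start_as_current_observation; a brief before/after sketch reusing the example above:

```python
# Before (deprecated)
with parent_span.start_as_current_span(name="validate-input") as child_span:
    validation_result = validate(request_data)

# After
with parent_span.start_as_current_observation(name="validate-input", as_type="span") as child_span:
    validation_result = validate(request_data)
```
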
def start_generation( self, *, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> LangfuseGeneration:
1319    def start_generation(
1320        self,
1321        *,
1322        name: str,
1323        input: Optional[Any] = None,
1324        output: Optional[Any] = None,
1325        metadata: Optional[Any] = None,
1326        version: Optional[str] = None,
1327        level: Optional[SpanLevel] = None,
1328        status_message: Optional[str] = None,
1329        completion_start_time: Optional[datetime] = None,
1330        model: Optional[str] = None,
1331        model_parameters: Optional[Dict[str, MapValue]] = None,
1332        usage_details: Optional[Dict[str, int]] = None,
1333        cost_details: Optional[Dict[str, float]] = None,
1334        prompt: Optional[PromptClient] = None,
1335    ) -> "LangfuseGeneration":
1336        """[DEPRECATED] Create a new child generation span.
1337
1338        DEPRECATED: This method is deprecated and will be removed in a future version.
1339        Use start_observation(as_type='generation') instead.
1340
1341        This method creates a new child generation span with this span as the parent.
1342        Generation spans are specialized for AI/LLM operations and include additional
1343        fields for model information, usage stats, and costs.
1344
1345        Unlike start_as_current_generation(), this method does not set the new span
1346        as the current span in the context.
1347
1348        Args:
1349            name: Name of the generation operation
1350            input: Input data for the model (e.g., prompts)
1351            output: Output from the model (e.g., completions)
1352            metadata: Additional metadata to associate with the generation
1353            version: Version identifier for the model or component
1354            level: Importance level of the generation (info, warning, error)
1355            status_message: Optional status message for the generation
1356            completion_start_time: When the model started generating the response
1357            model: Name/identifier of the AI model used (e.g., "gpt-4")
1358            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1359            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1360            cost_details: Cost information for the model call
1361            prompt: Associated prompt template from Langfuse prompt management
1362
1363        Returns:
1364            A new LangfuseGeneration that must be ended with .end() when complete
1365
1366        Example:
1367            ```python
1368            span = langfuse.start_span(name="process-query")
1369            try:
1370                # Create a generation child span
1371                generation = span.start_generation(
1372                    name="generate-answer",
1373                    model="gpt-4",
1374                    input={"prompt": "Explain quantum computing"}
1375                )
1376                try:
1377                    # Call model API
1378                    response = llm.generate(...)
1379
1380                    generation.update(
1381                        output=response.text,
1382                        usage_details={
1383                            "prompt_tokens": response.usage.prompt_tokens,
1384                            "completion_tokens": response.usage.completion_tokens
1385                        }
1386                    )
1387                finally:
1388                    generation.end()
1389
1390                # Continue with parent span
1391                span.update(output={"answer": response.text, "source": "gpt-4"})
1392            finally:
1393                span.end()
1394            ```
1395        """
1396        warnings.warn(
1397            "start_generation is deprecated and will be removed in a future version. "
1398            "Use start_observation(as_type='generation') instead.",
1399            DeprecationWarning,
1400            stacklevel=2,
1401        )
1402        return self.start_observation(
1403            name=name,
1404            as_type="generation",
1405            input=input,
1406            output=output,
1407            metadata=metadata,
1408            version=version,
1409            level=level,
1410            status_message=status_message,
1411            completion_start_time=completion_start_time,
1412            model=model,
1413            model_parameters=model_parameters,
1414            usage_details=usage_details,
1415            cost_details=cost_details,
1416            prompt=prompt,
1417        )

[DEPRECATED] Create a new child generation span.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_observation(as_type='generation') instead.

This method creates a new child generation span with this span as the parent. Generation spans are specialized for AI/LLM operations and include additional fields for model information, usage stats, and costs.

Unlike start_as_current_generation(), this method does not set the new span as the current span in the context.

Arguments:
  • name: Name of the generation operation
  • input: Input data for the model (e.g., prompts)
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A new LangfuseGeneration that must be ended with .end() when complete

Example:
span = langfuse.start_span(name="process-query")
try:
    # Create a generation child span
    generation = span.start_generation(
        name="generate-answer",
        model="gpt-4",
        input={"prompt": "Explain quantum computing"}
    )
    try:
        # Call model API
        response = llm.generate(...)

        generation.update(
            output=response.text,
            usage_details={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens
            }
        )
    finally:
        generation.end()

    # Continue with parent span
    span.update(output={"answer": response.text, "source": "gpt-4"})
finally:
    span.end()
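
For reference, the same flow with the non-deprecated API looks roughly like the sketch below. The keyword arguments mirror those that the deprecated wrapper above forwards to start_observation; `llm` is a placeholder model client, not part of the SDK.

```python
span = langfuse.start_span(name="process-query")
try:
    # Create the child generation via the recommended API
    generation = span.start_observation(
        name="generate-answer",
        as_type="generation",
        model="gpt-4",
        input={"prompt": "Explain quantum computing"},
    )
    try:
        response = llm.generate("Explain quantum computing")  # placeholder call
        generation.update(
            output=response.text,
            usage_details={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens,
            },
        )
    finally:
        generation.end()

    span.update(output={"answer": response.text, "source": "gpt-4"})
finally:
    span.end()
```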
def start_as_current_generation( self, *, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration]:
1419    def start_as_current_generation(
1420        self,
1421        *,
1422        name: str,
1423        input: Optional[Any] = None,
1424        output: Optional[Any] = None,
1425        metadata: Optional[Any] = None,
1426        version: Optional[str] = None,
1427        level: Optional[SpanLevel] = None,
1428        status_message: Optional[str] = None,
1429        completion_start_time: Optional[datetime] = None,
1430        model: Optional[str] = None,
1431        model_parameters: Optional[Dict[str, MapValue]] = None,
1432        usage_details: Optional[Dict[str, int]] = None,
1433        cost_details: Optional[Dict[str, float]] = None,
1434        prompt: Optional[PromptClient] = None,
1435    ) -> _AgnosticContextManager["LangfuseGeneration"]:
1436        """[DEPRECATED] Create a new child generation span and set it as the current span in a context manager.
1437
1438        DEPRECATED: This method is deprecated and will be removed in a future version.
1439        Use start_as_current_observation(as_type='generation') instead.
1440
1441        This method creates a new child generation span and sets it as the current span
1442        within a context manager. Generation spans are specialized for AI/LLM operations
1443        and include additional fields for model information, usage stats, and costs.
1444
1445        Args:
1446            name: Name of the generation operation
1447            input: Input data for the model (e.g., prompts)
1448            output: Output from the model (e.g., completions)
1449            metadata: Additional metadata to associate with the generation
1450            version: Version identifier for the model or component
1451            level: Importance level of the generation (info, warning, error)
1452            status_message: Optional status message for the generation
1453            completion_start_time: When the model started generating the response
1454            model: Name/identifier of the AI model used (e.g., "gpt-4")
1455            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1456            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1457            cost_details: Cost information for the model call
1458            prompt: Associated prompt template from Langfuse prompt management
1459
1460        Returns:
1461            A context manager that yields a new LangfuseGeneration
1462
1463        Example:
1464            ```python
1465            with langfuse.start_as_current_span(name="process-request") as span:
1466                # Prepare data
1467                query = preprocess_user_query(user_input)
1468
1469                # Create a generation span with context management
1470                with span.start_as_current_generation(
1471                    name="generate-answer",
1472                    model="gpt-4",
1473                    input={"query": query}
1474                ) as generation:
1475                    # Generation span is active here
1476                    response = llm.generate(query)
1477
1478                    # Update with results
1479                    generation.update(
1480                        output=response.text,
1481                        usage_details={
1482                            "prompt_tokens": response.usage.prompt_tokens,
1483                            "completion_tokens": response.usage.completion_tokens
1484                        }
1485                    )
1486
1487                # Back to parent span context
1488                span.update(output={"answer": response.text, "source": "gpt-4"})
1489            ```
1490        """
1491        warnings.warn(
1492            "start_as_current_generation is deprecated and will be removed in a future version. "
1493            "Use start_as_current_observation(as_type='generation') instead.",
1494            DeprecationWarning,
1495            stacklevel=2,
1496        )
1497        return self.start_as_current_observation(
1498            name=name,
1499            as_type="generation",
1500            input=input,
1501            output=output,
1502            metadata=metadata,
1503            version=version,
1504            level=level,
1505            status_message=status_message,
1506            completion_start_time=completion_start_time,
1507            model=model,
1508            model_parameters=model_parameters,
1509            usage_details=usage_details,
1510            cost_details=cost_details,
1511            prompt=prompt,
1512        )

[DEPRECATED] Create a new child generation span and set it as the current span in a context manager.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_as_current_observation(as_type='generation') instead.

This method creates a new child generation span and sets it as the current span within a context manager. Generation spans are specialized for AI/LLM operations and include additional fields for model information, usage stats, and costs.

Arguments:
  • name: Name of the generation operation
  • input: Input data for the model (e.g., prompts)
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A context manager that yields a new LangfuseGeneration

Example:
with langfuse.start_as_current_span(name="process-request") as span:
    # Prepare data
    query = preprocess_user_query(user_input)

    # Create a generation span with context management
    with span.start_as_current_generation(
        name="generate-answer",
        model="gpt-4",
        input={"query": query}
    ) as generation:
        # Generation span is active here
        response = llm.generate(query)

        # Update with results
        generation.update(
            output=response.text,
            usage_details={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens
            }
        )

    # Back to parent span context
    span.update(output={"answer": response.text, "source": "gpt-4"})
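
The equivalent flow with the non-deprecated context-manager API is sketched below, mirroring the example above; `llm` and `preprocess_user_query` are placeholders, not SDK members.

```python
with langfuse.start_as_current_span(name="process-request") as span:
    query = preprocess_user_query(user_input)  # placeholder preprocessing

    # Recommended replacement for start_as_current_generation
    with span.start_as_current_observation(
        name="generate-answer",
        as_type="generation",
        model="gpt-4",
        input={"query": query},
    ) as generation:
        response = llm.generate(query)  # placeholder call
        generation.update(output=response.text)

    span.update(output={"answer": response.text, "source": "gpt-4"})
```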
def create_event( self, *, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseEvent:
1514    def create_event(
1515        self,
1516        *,
1517        name: str,
1518        input: Optional[Any] = None,
1519        output: Optional[Any] = None,
1520        metadata: Optional[Any] = None,
1521        version: Optional[str] = None,
1522        level: Optional[SpanLevel] = None,
1523        status_message: Optional[str] = None,
1524    ) -> "LangfuseEvent":
1525        """Create a new Langfuse observation of type 'EVENT'.
1526
1527        Args:
1528            name: Name of the span (e.g., function or operation name)
1529            input: Input data for the operation (can be any JSON-serializable object)
1530            output: Output data from the operation (can be any JSON-serializable object)
1531            metadata: Additional metadata to associate with the span
1532            version: Version identifier for the code or component
1533            level: Importance level of the span (info, warning, error)
1534            status_message: Optional status message for the span
1535
1536        Returns:
1537            The LangfuseEvent object
1538
1539        Example:
1540            ```python
1541            event = langfuse.create_event(name="process-event")
1542            ```
1543        """
1544        timestamp = time_ns()
1545
1546        with otel_trace_api.use_span(self._otel_span):
1547            new_otel_span = self._langfuse_client._otel_tracer.start_span(
1548                name=name, start_time=timestamp
1549            )
1550
1551        return cast(
1552            "LangfuseEvent",
1553            LangfuseEvent(
1554                otel_span=new_otel_span,
1555                langfuse_client=self._langfuse_client,
1556                input=input,
1557                output=output,
1558                metadata=metadata,
1559                environment=self._environment,
1560                version=version,
1561                level=level,
1562                status_message=status_message,
1563            ).end(end_time=timestamp),
1564        )

Create a new Langfuse observation of type 'EVENT'.

Arguments:
  • name: Name of the event (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the event
  • version: Version identifier for the code or component
  • level: Importance level of the event (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the event
Returns:

The LangfuseEvent object

Example:
event = langfuse.create_event(name="process-event")
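
Because the implementation above ends the event at its creation timestamp (and, as documented further below, `update()` on events is a no-op), all data should be passed at creation time. A fuller child-event sketch using the documented keyword arguments; the names and payloads are illustrative only.

```python
with langfuse.start_as_current_span(name="handle-request") as span:
    # Zero-duration event attached to the enclosing span
    span.create_event(
        name="cache-lookup",
        input={"key": "user:123"},
        output={"hit": False},
        level="DEBUG",
        status_message="cache miss",
        metadata={"store": "redis"},
    )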
class LangfuseGeneration(langfuse._client.span.LangfuseObservationWrapper):
1567class LangfuseGeneration(LangfuseObservationWrapper):
1568    """Specialized span implementation for AI model generations in Langfuse.
1569
1570    This class represents a generation span specifically designed for tracking
1571    AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized
1572    attributes for model details, token usage, and costs.
1573    """
1574
1575    def __init__(
1576        self,
1577        *,
1578        otel_span: otel_trace_api.Span,
1579        langfuse_client: "Langfuse",
1580        input: Optional[Any] = None,
1581        output: Optional[Any] = None,
1582        metadata: Optional[Any] = None,
1583        environment: Optional[str] = None,
1584        version: Optional[str] = None,
1585        level: Optional[SpanLevel] = None,
1586        status_message: Optional[str] = None,
1587        completion_start_time: Optional[datetime] = None,
1588        model: Optional[str] = None,
1589        model_parameters: Optional[Dict[str, MapValue]] = None,
1590        usage_details: Optional[Dict[str, int]] = None,
1591        cost_details: Optional[Dict[str, float]] = None,
1592        prompt: Optional[PromptClient] = None,
1593    ):
1594        """Initialize a new LangfuseGeneration span.
1595
1596        Args:
1597            otel_span: The OpenTelemetry span to wrap
1598            langfuse_client: Reference to the parent Langfuse client
1599            input: Input data for the generation (e.g., prompts)
1600            output: Output from the generation (e.g., completions)
1601            metadata: Additional metadata to associate with the generation
1602            environment: The tracing environment
1603            version: Version identifier for the model or component
1604            level: Importance level of the generation (info, warning, error)
1605            status_message: Optional status message for the generation
1606            completion_start_time: When the model started generating the response
1607            model: Name/identifier of the AI model used (e.g., "gpt-4")
1608            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1609            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1610            cost_details: Cost information for the model call
1611            prompt: Associated prompt template from Langfuse prompt management
1612        """
1613        super().__init__(
1614            as_type="generation",
1615            otel_span=otel_span,
1616            langfuse_client=langfuse_client,
1617            input=input,
1618            output=output,
1619            metadata=metadata,
1620            environment=environment,
1621            version=version,
1622            level=level,
1623            status_message=status_message,
1624            completion_start_time=completion_start_time,
1625            model=model,
1626            model_parameters=model_parameters,
1627            usage_details=usage_details,
1628            cost_details=cost_details,
1629            prompt=prompt,
1630        )

Specialized span implementation for AI model generations in Langfuse.

This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.

LangfuseGeneration( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None)
1575    def __init__(
1576        self,
1577        *,
1578        otel_span: otel_trace_api.Span,
1579        langfuse_client: "Langfuse",
1580        input: Optional[Any] = None,
1581        output: Optional[Any] = None,
1582        metadata: Optional[Any] = None,
1583        environment: Optional[str] = None,
1584        version: Optional[str] = None,
1585        level: Optional[SpanLevel] = None,
1586        status_message: Optional[str] = None,
1587        completion_start_time: Optional[datetime] = None,
1588        model: Optional[str] = None,
1589        model_parameters: Optional[Dict[str, MapValue]] = None,
1590        usage_details: Optional[Dict[str, int]] = None,
1591        cost_details: Optional[Dict[str, float]] = None,
1592        prompt: Optional[PromptClient] = None,
1593    ):
1594        """Initialize a new LangfuseGeneration span.
1595
1596        Args:
1597            otel_span: The OpenTelemetry span to wrap
1598            langfuse_client: Reference to the parent Langfuse client
1599            input: Input data for the generation (e.g., prompts)
1600            output: Output from the generation (e.g., completions)
1601            metadata: Additional metadata to associate with the generation
1602            environment: The tracing environment
1603            version: Version identifier for the model or component
1604            level: Importance level of the generation (info, warning, error)
1605            status_message: Optional status message for the generation
1606            completion_start_time: When the model started generating the response
1607            model: Name/identifier of the AI model used (e.g., "gpt-4")
1608            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1609            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1610            cost_details: Cost information for the model call
1611            prompt: Associated prompt template from Langfuse prompt management
1612        """
1613        super().__init__(
1614            as_type="generation",
1615            otel_span=otel_span,
1616            langfuse_client=langfuse_client,
1617            input=input,
1618            output=output,
1619            metadata=metadata,
1620            environment=environment,
1621            version=version,
1622            level=level,
1623            status_message=status_message,
1624            completion_start_time=completion_start_time,
1625            model=model,
1626            model_parameters=model_parameters,
1627            usage_details=usage_details,
1628            cost_details=cost_details,
1629            prompt=prompt,
1630        )

Initialize a new LangfuseGeneration span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the generation (e.g., prompts)
  • output: Output from the generation (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • environment: The tracing environment
  • version: Version identifier for the model or component
  • level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
class LangfuseEvent(langfuse._client.span.LangfuseObservationWrapper):
1633class LangfuseEvent(LangfuseObservationWrapper):
1634    """Specialized span implementation for Langfuse Events."""
1635
1636    def __init__(
1637        self,
1638        *,
1639        otel_span: otel_trace_api.Span,
1640        langfuse_client: "Langfuse",
1641        input: Optional[Any] = None,
1642        output: Optional[Any] = None,
1643        metadata: Optional[Any] = None,
1644        environment: Optional[str] = None,
1645        version: Optional[str] = None,
1646        level: Optional[SpanLevel] = None,
1647        status_message: Optional[str] = None,
1648    ):
1649        """Initialize a new LangfuseEvent span.
1650
1651        Args:
1652            otel_span: The OpenTelemetry span to wrap
1653            langfuse_client: Reference to the parent Langfuse client
1654            input: Input data for the event
1655            output: Output from the event
1656            metadata: Additional metadata to associate with the generation
1657            environment: The tracing environment
1658            version: Version identifier for the model or component
1659            level: Importance level of the generation (info, warning, error)
1660            status_message: Optional status message for the generation
1661        """
1662        super().__init__(
1663            otel_span=otel_span,
1664            as_type="event",
1665            langfuse_client=langfuse_client,
1666            input=input,
1667            output=output,
1668            metadata=metadata,
1669            environment=environment,
1670            version=version,
1671            level=level,
1672            status_message=status_message,
1673        )
1674
1675    def update(
1676        self,
1677        *,
1678        name: Optional[str] = None,
1679        input: Optional[Any] = None,
1680        output: Optional[Any] = None,
1681        metadata: Optional[Any] = None,
1682        version: Optional[str] = None,
1683        level: Optional[SpanLevel] = None,
1684        status_message: Optional[str] = None,
1685        completion_start_time: Optional[datetime] = None,
1686        model: Optional[str] = None,
1687        model_parameters: Optional[Dict[str, MapValue]] = None,
1688        usage_details: Optional[Dict[str, int]] = None,
1689        cost_details: Optional[Dict[str, float]] = None,
1690        prompt: Optional[PromptClient] = None,
1691        **kwargs: Any,
1692    ) -> "LangfuseEvent":
1693        """Update is not allowed for LangfuseEvent because events cannot be updated.
1694
1695        This method logs a warning and returns self without making changes.
1696
1697        Returns:
1698            self: Returns the unchanged LangfuseEvent instance
1699        """
1700        langfuse_logger.warning(
1701            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1702        )
1703        return self

Specialized span implementation for Langfuse Events.

LangfuseEvent( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1636    def __init__(
1637        self,
1638        *,
1639        otel_span: otel_trace_api.Span,
1640        langfuse_client: "Langfuse",
1641        input: Optional[Any] = None,
1642        output: Optional[Any] = None,
1643        metadata: Optional[Any] = None,
1644        environment: Optional[str] = None,
1645        version: Optional[str] = None,
1646        level: Optional[SpanLevel] = None,
1647        status_message: Optional[str] = None,
1648    ):
1649        """Initialize a new LangfuseEvent span.
1650
1651        Args:
1652            otel_span: The OpenTelemetry span to wrap
1653            langfuse_client: Reference to the parent Langfuse client
1654            input: Input data for the event
1655            output: Output from the event
1656            metadata: Additional metadata to associate with the generation
1657            environment: The tracing environment
1658            version: Version identifier for the model or component
1659            level: Importance level of the generation (info, warning, error)
1660            status_message: Optional status message for the generation
1661        """
1662        super().__init__(
1663            otel_span=otel_span,
1664            as_type="event",
1665            langfuse_client=langfuse_client,
1666            input=input,
1667            output=output,
1668            metadata=metadata,
1669            environment=environment,
1670            version=version,
1671            level=level,
1672            status_message=status_message,
1673        )

Initialize a new LangfuseEvent span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the event
  • output: Output from the event
  • metadata: Additional metadata to associate with the event
  • environment: The tracing environment
  • version: Version identifier for the code or component
  • level: Importance level of the event (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the event
def update( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, **kwargs: Any) -> LangfuseEvent:
1675    def update(
1676        self,
1677        *,
1678        name: Optional[str] = None,
1679        input: Optional[Any] = None,
1680        output: Optional[Any] = None,
1681        metadata: Optional[Any] = None,
1682        version: Optional[str] = None,
1683        level: Optional[SpanLevel] = None,
1684        status_message: Optional[str] = None,
1685        completion_start_time: Optional[datetime] = None,
1686        model: Optional[str] = None,
1687        model_parameters: Optional[Dict[str, MapValue]] = None,
1688        usage_details: Optional[Dict[str, int]] = None,
1689        cost_details: Optional[Dict[str, float]] = None,
1690        prompt: Optional[PromptClient] = None,
1691        **kwargs: Any,
1692    ) -> "LangfuseEvent":
1693        """Update is not allowed for LangfuseEvent because events cannot be updated.
1694
1695        This method logs a warning and returns self without making changes.
1696
1697        Returns:
1698            self: Returns the unchanged LangfuseEvent instance
1699        """
1700        langfuse_logger.warning(
1701            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1702        )
1703        return self

Update is not allowed for LangfuseEvent because events cannot be updated.

This method logs a warning and returns self without making changes.

Returns:

self: Returns the unchanged LangfuseEvent instance

class LangfuseOtelSpanAttributes:
27class LangfuseOtelSpanAttributes:
28    # Langfuse-Trace attributes
29    TRACE_NAME = "langfuse.trace.name"
30    TRACE_USER_ID = "user.id"
31    TRACE_SESSION_ID = "session.id"
32    TRACE_TAGS = "langfuse.trace.tags"
33    TRACE_PUBLIC = "langfuse.trace.public"
34    TRACE_METADATA = "langfuse.trace.metadata"
35    TRACE_INPUT = "langfuse.trace.input"
36    TRACE_OUTPUT = "langfuse.trace.output"
37
38    # Langfuse-observation attributes
39    OBSERVATION_TYPE = "langfuse.observation.type"
40    OBSERVATION_METADATA = "langfuse.observation.metadata"
41    OBSERVATION_LEVEL = "langfuse.observation.level"
42    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
43    OBSERVATION_INPUT = "langfuse.observation.input"
44    OBSERVATION_OUTPUT = "langfuse.observation.output"
45
46    # Langfuse-observation of type Generation attributes
47    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
48    OBSERVATION_MODEL = "langfuse.observation.model.name"
49    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
50    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
51    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
52    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
53    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"
54
55    # General
56    ENVIRONMENT = "langfuse.environment"
57    RELEASE = "langfuse.release"
58    VERSION = "langfuse.version"
59
60    # Internal
61    AS_ROOT = "langfuse.internal.as_root"
62
63    # Experiments
64    EXPERIMENT_ID = "langfuse.experiment.id"
65    EXPERIMENT_NAME = "langfuse.experiment.name"
66    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
67    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
68    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
69    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
70    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
71    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
72    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
TRACE_NAME = 'langfuse.trace.name'
TRACE_USER_ID = 'user.id'
TRACE_SESSION_ID = 'session.id'
TRACE_TAGS = 'langfuse.trace.tags'
TRACE_PUBLIC = 'langfuse.trace.public'
TRACE_METADATA = 'langfuse.trace.metadata'
TRACE_INPUT = 'langfuse.trace.input'
TRACE_OUTPUT = 'langfuse.trace.output'
OBSERVATION_TYPE = 'langfuse.observation.type'
OBSERVATION_METADATA = 'langfuse.observation.metadata'
OBSERVATION_LEVEL = 'langfuse.observation.level'
OBSERVATION_STATUS_MESSAGE = 'langfuse.observation.status_message'
OBSERVATION_INPUT = 'langfuse.observation.input'
OBSERVATION_OUTPUT = 'langfuse.observation.output'
OBSERVATION_COMPLETION_START_TIME = 'langfuse.observation.completion_start_time'
OBSERVATION_MODEL = 'langfuse.observation.model.name'
OBSERVATION_MODEL_PARAMETERS = 'langfuse.observation.model.parameters'
OBSERVATION_USAGE_DETAILS = 'langfuse.observation.usage_details'
OBSERVATION_COST_DETAILS = 'langfuse.observation.cost_details'
OBSERVATION_PROMPT_NAME = 'langfuse.observation.prompt.name'
OBSERVATION_PROMPT_VERSION = 'langfuse.observation.prompt.version'
ENVIRONMENT = 'langfuse.environment'
RELEASE = 'langfuse.release'
VERSION = 'langfuse.version'
AS_ROOT = 'langfuse.internal.as_root'
EXPERIMENT_ID = 'langfuse.experiment.id'
EXPERIMENT_NAME = 'langfuse.experiment.name'
EXPERIMENT_DESCRIPTION = 'langfuse.experiment.description'
EXPERIMENT_METADATA = 'langfuse.experiment.metadata'
EXPERIMENT_DATASET_ID = 'langfuse.experiment.dataset.id'
EXPERIMENT_ITEM_ID = 'langfuse.experiment.item.id'
EXPERIMENT_ITEM_EXPECTED_OUTPUT = 'langfuse.experiment.item.expected_output'
EXPERIMENT_ITEM_METADATA = 'langfuse.experiment.item.metadata'
EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = 'langfuse.experiment.item.root_observation_id'
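
These constants are ordinary OpenTelemetry span attribute keys. The SDK writes them itself during tracing; the sketch below only illustrates the key/value shape, and the specific values are hypothetical.

```python
from opentelemetry import trace
from langfuse import LangfuseOtelSpanAttributes, get_client

langfuse = get_client()

with langfuse.start_as_current_span(name="checkout") as span:
    # The constants are plain OTel attribute keys, so they can be set or read
    # like any other span attribute (values here are hypothetical).
    otel_span = trace.get_current_span()
    otel_span.set_attribute(LangfuseOtelSpanAttributes.TRACE_SESSION_ID, "session-123")
    otel_span.set_attribute(LangfuseOtelSpanAttributes.OBSERVATION_LEVEL, "WARNING")
```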
class LangfuseAgent(langfuse._client.span.LangfuseObservationWrapper):
1706class LangfuseAgent(LangfuseObservationWrapper):
1707    """Agent observation for reasoning blocks that act on tools using LLM guidance."""
1708
1709    def __init__(self, **kwargs: Any) -> None:
1710        """Initialize a new LangfuseAgent span."""
1711        kwargs["as_type"] = "agent"
1712        super().__init__(**kwargs)

Agent observation for reasoning blocks that act on tools using LLM guidance.

LangfuseAgent(**kwargs: Any)
1709    def __init__(self, **kwargs: Any) -> None:
1710        """Initialize a new LangfuseAgent span."""
1711        kwargs["as_type"] = "agent"
1712        super().__init__(**kwargs)

Initialize a new LangfuseAgent span.

class LangfuseTool(langfuse._client.span.LangfuseObservationWrapper):
1715class LangfuseTool(LangfuseObservationWrapper):
1716    """Tool observation representing external tool calls, e.g., calling a weather API."""
1717
1718    def __init__(self, **kwargs: Any) -> None:
1719        """Initialize a new LangfuseTool span."""
1720        kwargs["as_type"] = "tool"
1721        super().__init__(**kwargs)

Tool observation representing external tool calls, e.g., calling a weather API.

LangfuseTool(**kwargs: Any)
1718    def __init__(self, **kwargs: Any) -> None:
1719        """Initialize a new LangfuseTool span."""
1720        kwargs["as_type"] = "tool"
1721        super().__init__(**kwargs)

Initialize a new LangfuseTool span.

class LangfuseChain(langfuse._client.span.LangfuseObservationWrapper):
1724class LangfuseChain(LangfuseObservationWrapper):
1725    """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM."""
1726
1727    def __init__(self, **kwargs: Any) -> None:
1728        """Initialize a new LangfuseChain span."""
1729        kwargs["as_type"] = "chain"
1730        super().__init__(**kwargs)

Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.

LangfuseChain(**kwargs: Any)
1727    def __init__(self, **kwargs: Any) -> None:
1728        """Initialize a new LangfuseChain span."""
1729        kwargs["as_type"] = "chain"
1730        super().__init__(**kwargs)

Initialize a new LangfuseChain span.

class LangfuseEmbedding(langfuse._client.span.LangfuseObservationWrapper):
1742class LangfuseEmbedding(LangfuseObservationWrapper):
1743    """Embedding observation for LLM embedding calls, typically used before retrieval."""
1744
1745    def __init__(self, **kwargs: Any) -> None:
1746        """Initialize a new LangfuseEmbedding span."""
1747        kwargs["as_type"] = "embedding"
1748        super().__init__(**kwargs)

Embedding observation for LLM embedding calls, typically used before retrieval.

LangfuseEmbedding(**kwargs: Any)
1745    def __init__(self, **kwargs: Any) -> None:
1746        """Initialize a new LangfuseEmbedding span."""
1747        kwargs["as_type"] = "embedding"
1748        super().__init__(**kwargs)

Initialize a new LangfuseEmbedding span.

class LangfuseEvaluator(langfuse._client.span.LangfuseObservationWrapper):
1751class LangfuseEvaluator(LangfuseObservationWrapper):
1752    """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs."""
1753
1754    def __init__(self, **kwargs: Any) -> None:
1755        """Initialize a new LangfuseEvaluator span."""
1756        kwargs["as_type"] = "evaluator"
1757        super().__init__(**kwargs)

Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.

LangfuseEvaluator(**kwargs: Any)
1754    def __init__(self, **kwargs: Any) -> None:
1755        """Initialize a new LangfuseEvaluator span."""
1756        kwargs["as_type"] = "evaluator"
1757        super().__init__(**kwargs)

Initialize a new LangfuseEvaluator span.

class LangfuseRetriever(langfuse._client.span.LangfuseObservationWrapper):
1733class LangfuseRetriever(LangfuseObservationWrapper):
1734    """Retriever observation for data retrieval steps, e.g. vector store or database queries."""
1735
1736    def __init__(self, **kwargs: Any) -> None:
1737        """Initialize a new LangfuseRetriever span."""
1738        kwargs["as_type"] = "retriever"
1739        super().__init__(**kwargs)

Retriever observation for data retrieval steps, e.g. vector store or database queries.

LangfuseRetriever(**kwargs: Any)
1736    def __init__(self, **kwargs: Any) -> None:
1737        """Initialize a new LangfuseRetriever span."""
1738        kwargs["as_type"] = "retriever"
1739        super().__init__(**kwargs)

Initialize a new LangfuseRetriever span.

class LangfuseGuardrail(langfuse._client.span.LangfuseObservationWrapper):
1760class LangfuseGuardrail(LangfuseObservationWrapper):
1761    """Guardrail observation for protection e.g. against jailbreaks or offensive content."""
1762
1763    def __init__(self, **kwargs: Any) -> None:
1764        """Initialize a new LangfuseGuardrail span."""
1765        kwargs["as_type"] = "guardrail"
1766        super().__init__(**kwargs)

Guardrail observation for protection, e.g. against jailbreaks or offensive content.

LangfuseGuardrail(**kwargs: Any)
1763    def __init__(self, **kwargs: Any) -> None:
1764        """Initialize a new LangfuseGuardrail span."""
1765        kwargs["as_type"] = "guardrail"
1766        super().__init__(**kwargs)

Initialize a new LangfuseGuardrail span.
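
Each wrapper above corresponds to the `as_type` literal set in its `__init__` ("agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail"). A nesting sketch, assuming `start_as_current_observation` accepts these literals on both the client and the span wrappers; the names and payloads are illustrative.

```python
with langfuse.start_as_current_observation(name="research-agent", as_type="agent") as agent:
    # Tool call nested under the agent observation
    with agent.start_as_current_observation(name="web-search", as_type="tool") as tool:
        tool.update(input={"query": "weather in Berlin"}, output={"temp_c": 21})

    # Generation nested under the same agent observation
    with agent.start_as_current_observation(
        name="answer", as_type="generation", model="gpt-4"
    ) as generation:
        generation.update(output="It is 21 degrees Celsius in Berlin.")
```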

class Evaluation:
 97class Evaluation:
 98    """Represents an evaluation result for an experiment item or an entire experiment run.
 99
100    This class provides a strongly-typed way to create evaluation results in evaluator functions.
101    Users must use keyword arguments when instantiating this class.
102
103    Attributes:
104        name: Unique identifier for the evaluation metric. Should be descriptive
105            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
106            Used for aggregation and comparison across experiment runs.
107        value: The evaluation score or result. Can be:
108            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
109            - String: For categorical results like "positive", "negative", "neutral"
110            - Boolean: For binary assessments like "passes_safety_check"
111            - None: When evaluation cannot be computed (missing data, API errors, etc.)
112        comment: Optional human-readable explanation of the evaluation result.
113            Useful for providing context, explaining scoring rationale, or noting
114            special conditions. Displayed in Langfuse UI for interpretability.
115        metadata: Optional structured metadata about the evaluation process.
116            Can include confidence scores, intermediate calculations, model versions,
117            or any other relevant technical details.
118        data_type: Optional score data type. Required if value is not NUMERIC.
119            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
120        config_id: Optional Langfuse score config ID.
121
122    Examples:
123        Basic accuracy evaluation:
124        ```python
125        from langfuse import Evaluation
126
127        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
128            if not expected_output:
129                return Evaluation(name="accuracy", value=None, comment="No expected output")
130
131            is_correct = output.strip().lower() == expected_output.strip().lower()
132            return Evaluation(
133                name="accuracy",
134                value=1.0 if is_correct else 0.0,
135                comment="Correct answer" if is_correct else "Incorrect answer"
136            )
137        ```
138
139        Multi-metric evaluator:
140        ```python
141        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
142            return [
143                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
144                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
145                Evaluation(
146                    name="quality",
147                    value=0.85,
148                    comment="High quality response",
149                    metadata={"confidence": 0.92, "model": "gpt-4"}
150                )
151            ]
152        ```
153
154        Categorical evaluation:
155        ```python
156        def sentiment_evaluator(*, input, output, **kwargs):
157            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
158            return Evaluation(
159                name="sentiment",
160                value=sentiment,
161                comment=f"Response expresses {sentiment} sentiment",
162                data_type="CATEGORICAL"
163            )
164        ```
165
166        Failed evaluation with error handling:
167        ```python
168        def external_api_evaluator(*, input, output, **kwargs):
169            try:
170                score = external_api.evaluate(output)
171                return Evaluation(name="external_score", value=score)
172            except Exception as e:
173                return Evaluation(
174                    name="external_score",
175                    value=None,
176                    comment=f"API unavailable: {e}",
177                    metadata={"error": str(e), "retry_count": 3}
178                )
179        ```
180
181    Note:
182        All arguments must be passed as keywords. Positional arguments are not allowed
183        to ensure code clarity and prevent errors from argument reordering.
184    """
185
186    def __init__(
187        self,
188        *,
189        name: str,
190        value: Union[int, float, str, bool, None],
191        comment: Optional[str] = None,
192        metadata: Optional[Dict[str, Any]] = None,
193        data_type: Optional[ScoreDataType] = None,
194        config_id: Optional[str] = None,
195    ):
196        """Initialize an Evaluation with the provided data.
197
198        Args:
199            name: Unique identifier for the evaluation metric.
200            value: The evaluation score or result.
201            comment: Optional human-readable explanation of the result.
202            metadata: Optional structured metadata about the evaluation process.
203            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
204            config_id: Optional Langfuse score config ID.
205
206        Note:
207            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
208        """
209        self.name = name
210        self.value = value
211        self.comment = comment
212        self.metadata = metadata
213        self.data_type = data_type
214        self.config_id = config_id

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
    • None: When evaluation cannot be computed (missing data, API errors, etc.)
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=None, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=None,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool, NoneType], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[langfuse.api.ScoreDataType] = None, config_id: Optional[str] = None)
186    def __init__(
187        self,
188        *,
189        name: str,
190        value: Union[int, float, str, bool, None],
191        comment: Optional[str] = None,
192        metadata: Optional[Dict[str, Any]] = None,
193        data_type: Optional[ScoreDataType] = None,
194        config_id: Optional[str] = None,
195    ):
196        """Initialize an Evaluation with the provided data.
197
198        Args:
199            name: Unique identifier for the evaluation metric.
200            value: The evaluation score or result.
201            comment: Optional human-readable explanation of the result.
202            metadata: Optional structured metadata about the evaluation process.
203            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
204            config_id: Optional Langfuse score config ID.
205
206        Note:
207            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
208        """
209        self.name = name
210        self.value = value
211        self.comment = comment
212        self.metadata = metadata
213        self.data_type = data_type
214        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
class EvaluatorInputs:
 38class EvaluatorInputs:
 39    """Input data structure for evaluators, returned by mapper functions.
 40
 41    This class provides a strongly-typed container for transforming API response
 42    objects (traces, observations) into the standardized format expected
 43    by evaluator functions. It ensures consistent access to input, output, expected
 44    output, and metadata regardless of the source entity type.
 45
 46    Attributes:
 47        input: The input data that was provided to generate the output being evaluated.
 48            For traces, this might be the initial prompt or request. For observations,
 49            this could be the span's input. The exact meaning depends on your use case.
 50        output: The actual output that was produced and needs to be evaluated.
 51            For traces, this is typically the final response. For observations,
 52            this might be the generation output or span result.
 53        expected_output: Optional ground truth or expected result for comparison.
 54            Used by evaluators to assess correctness. May be None if no ground truth
 55            is available for the entity being evaluated.
 56        metadata: Optional structured metadata providing additional context for evaluation.
 57            Can include information about the entity, execution context, user attributes,
 58            or any other relevant data that evaluators might use.
 59
 60    Examples:
 61        Simple mapper for traces:
 62        ```python
 63        from langfuse import EvaluatorInputs
 64
 65        def trace_mapper(trace):
 66            return EvaluatorInputs(
 67                input=trace.input,
 68                output=trace.output,
 69                expected_output=None,  # No ground truth available
 70                metadata={"user_id": trace.user_id, "tags": trace.tags}
 71            )
 72        ```
 73
 74        Mapper for observations extracting specific fields:
 75        ```python
 76        def observation_mapper(observation):
 77            # Extract input/output from observation's data
 78            input_data = observation.input if hasattr(observation, 'input') else None
 79            output_data = observation.output if hasattr(observation, 'output') else None
 80
 81            return EvaluatorInputs(
 82                input=input_data,
 83                output=output_data,
 84                expected_output=None,
 85                metadata={
 86                    "observation_type": observation.type,
 87                    "model": observation.model,
 88                    "latency_ms": observation.end_time - observation.start_time
 89                }
 90            )
 91        ```
 92        ```
 93
 94    Note:
 95        All arguments must be passed as keywords when instantiating this class.
 96    """
 97
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Input data structure for evaluators, returned by mapper functions.

This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.

Attributes:
  • input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
  • output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
  • expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
  • metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:

Simple mapper for traces:

from langfuse import EvaluatorInputs

def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,  # No ground truth available
        metadata={"user_id": trace.user_id, "tags": trace.tags}
    )

Mapper for observations extracting specific fields:

def observation_mapper(observation):
    # Extract input/output from observation's data
    input_data = observation.input if hasattr(observation, 'input') else None
    output_data = observation.output if hasattr(observation, 'output') else None

    return EvaluatorInputs(
        input=input_data,
        output=output_data,
        expected_output=None,
        metadata={
            "observation_type": observation.type,
            "model": observation.model,
            "latency_ms": observation.end_time - observation.start_time
        }
    )


Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorInputs( *, input: Any, output: Any, expected_output: Any = None, metadata: Optional[Dict[str, Any]] = None)
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Initialize EvaluatorInputs with the provided data.

Arguments:
  • input: The input data for evaluation.
  • output: The output data to be evaluated.
  • expected_output: Optional ground truth for comparison.
  • metadata: Optional additional context for evaluation.
Note:

All arguments must be provided as keywords.

input
output
expected_output
metadata
class MapperFunction(typing.Protocol):
123class MapperFunction(Protocol):
124    """Protocol defining the interface for mapper functions in batch evaluation.
125
126    Mapper functions transform API response objects (traces or observations)
127    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
128    allows you to define how to extract and structure evaluation data from different
129    entity types.
130
131    Mapper functions must:
132    - Accept a single item parameter (trace, observation)
133    - Return an EvaluatorInputs instance with input, output, expected_output, metadata
134    - Can be either synchronous or asynchronous
135    - Should handle missing or malformed data gracefully
136    """
137
138    def __call__(
139        self,
140        *,
141        item: Union["TraceWithFullDetails", "ObservationsView"],
142        **kwargs: Dict[str, Any],
143    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
144        """Transform an API response object into evaluator inputs.
145
146        This method defines how to extract evaluation-relevant data from the raw
147        API response object. The implementation should map entity-specific fields
148        to the standardized input/output/expected_output/metadata structure.
149
150        Args:
151            item: The API response object to transform. The type depends on the scope:
152                - TraceWithFullDetails: When evaluating traces
153                - ObservationsView: When evaluating observations
154
155        Returns:
156            EvaluatorInputs: A structured container with:
157                - input: The input data that generated the output
158                - output: The output to be evaluated
159                - expected_output: Optional ground truth for comparison
160                - metadata: Optional additional context
161
162            Can return either a direct EvaluatorInputs instance or an awaitable
163            (for async mappers that need to fetch additional data).
164
165        Examples:
166            Basic trace mapper:
167            ```python
168            def map_trace(trace):
169                return EvaluatorInputs(
170                    input=trace.input,
171                    output=trace.output,
172                    expected_output=None,
173                    metadata={"trace_id": trace.id, "user": trace.user_id}
174                )
175            ```
176
177            Observation mapper with conditional logic:
178            ```python
179            def map_observation(observation):
180                # Extract fields based on observation type
181                if observation.type == "GENERATION":
182                    input_data = observation.input
183                    output_data = observation.output
184                else:
185                    # For other types, use different fields
186                    input_data = observation.metadata.get("input")
187                    output_data = observation.metadata.get("output")
188
189                return EvaluatorInputs(
190                    input=input_data,
191                    output=output_data,
192                    expected_output=None,
193                    metadata={"obs_id": observation.id, "type": observation.type}
194                )
195            ```
196
197            Async mapper (if additional processing needed):
198            ```python
199            async def map_trace_async(trace):
200                # Could do async processing here if needed
201                processed_output = await some_async_transformation(trace.output)
202
203                return EvaluatorInputs(
204                    input=trace.input,
205                    output=processed_output,
206                    expected_output=None,
207                    metadata={"trace_id": trace.id}
208                )
209            ```
210        """
211        ...

Protocol defining the interface for mapper functions in batch evaluation.

Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.

Mapper functions:

  • must accept a single item parameter (trace or observation)
  • must return an EvaluatorInputs instance with input, output, expected_output, and metadata
  • may be either synchronous or asynchronous
  • should handle missing or malformed data gracefully (a minimal sketch follows below)
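
A minimal sketch of a conforming mapper, assuming only the keyword-only `item` signature defined by the protocol above; field access is guarded so missing attributes degrade to None rather than raising:

```python
from langfuse import EvaluatorInputs

def robust_mapper(*, item, **kwargs):
    # Fall back to None for missing fields so evaluators can decide how to handle them.
    return EvaluatorInputs(
        input=getattr(item, "input", None),
        output=getattr(item, "output", None),
        expected_output=None,  # no ground truth available in this sketch
        metadata={"item_id": getattr(item, "id", None)},
    )
```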
MapperFunction(*args, **kwargs)
class CompositeEvaluatorFunction(typing.Protocol):
214class CompositeEvaluatorFunction(Protocol):
215    """Protocol defining the interface for composite evaluator functions.
216
217    Composite evaluators create aggregate scores from multiple item-level evaluations.
218    This is commonly used to compute weighted averages, combined metrics, or other
219    composite assessments based on individual evaluation results.
220
221    Composite evaluators:
222    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
223      plus the list of evaluations
224    - Return either a single Evaluation, a list of Evaluations, or a dict
225    - Can be either synchronous or asynchronous
226    - Have access to both raw item data and evaluation results
227    """
228
229    def __call__(
230        self,
231        *,
232        input: Optional[Any] = None,
233        output: Optional[Any] = None,
234        expected_output: Optional[Any] = None,
235        metadata: Optional[Dict[str, Any]] = None,
236        evaluations: List[Evaluation],
237        **kwargs: Dict[str, Any],
238    ) -> Union[
239        Evaluation,
240        List[Evaluation],
241        Dict[str, Any],
242        Awaitable[Evaluation],
243        Awaitable[List[Evaluation]],
244        Awaitable[Dict[str, Any]],
245    ]:
246        r"""Create a composite evaluation from item-level evaluation results.
247
248        This method combines multiple evaluation scores into a single composite metric.
249        Common use cases include weighted averages, pass/fail decisions based on multiple
250        criteria, or custom scoring logic that considers multiple dimensions.
251
252        Args:
253            input: The input data that was provided to the system being evaluated.
254            output: The output generated by the system being evaluated.
255            expected_output: The expected/reference output for comparison (if available).
256            metadata: Additional metadata about the evaluation context.
257            evaluations: List of evaluation results from item-level evaluators.
258                Each evaluation contains name, value, comment, and metadata.
259
260        Returns:
261            Can return any of:
262            - Evaluation: A single composite evaluation result
263            - List[Evaluation]: Multiple composite evaluations
264            - Dict: A dict that will be converted to an Evaluation
265                - name: Identifier for the composite metric (e.g., "composite_score")
266                - value: The computed composite value
267                - comment: Optional explanation of how the score was computed
268                - metadata: Optional details about the composition logic
269
270            Can return either a direct Evaluation instance or an awaitable
271            (for async composite evaluators).
272
273        Examples:
274            Simple weighted average:
275            ```python
276            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
277                weights = {
278                    "accuracy": 0.5,
279                    "relevance": 0.3,
280                    "safety": 0.2
281                }
282
283                total_score = 0.0
284                total_weight = 0.0
285
286                for eval in evaluations:
287                    if eval.name in weights and isinstance(eval.value, (int, float)):
288                        total_score += eval.value * weights[eval.name]
289                        total_weight += weights[eval.name]
290
291                final_score = total_score / total_weight if total_weight > 0 else 0.0
292
293                return Evaluation(
294                    name="composite_score",
295                    value=final_score,
296                    comment=f"Weighted average of {len(evaluations)} metrics"
297                )
298            ```
299
300            Pass/fail composite based on thresholds:
301            ```python
302            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
303                # Must pass all criteria
304                thresholds = {
305                    "accuracy": 0.7,
306                    "safety": 0.9,
307                    "relevance": 0.6
308                }
309
310                passes = True
311                failing_metrics = []
312
313                for metric, threshold in thresholds.items():
314                    eval_result = next((e for e in evaluations if e.name == metric), None)
315                    if eval_result and isinstance(eval_result.value, (int, float)):
316                        if eval_result.value < threshold:
317                            passes = False
318                            failing_metrics.append(metric)
319
320                return Evaluation(
321                    name="passes_all_checks",
322                    value=passes,
323                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
324                    data_type="BOOLEAN"
325                )
326            ```
327
328            Async composite with external scoring:
329            ```python
330            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
331                # Use LLM to synthesize multiple evaluation results
332                eval_summary = "\n".join(
333                    f"- {e.name}: {e.value}" for e in evaluations
334                )
335
336                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
337                prompt += f"For the output: {output}\n"
338                prompt += "Provide an overall quality score from 0-1."
339
340                response = await openai.chat.completions.create(
341                    model="gpt-4",
342                    messages=[{"role": "user", "content": prompt}]
343                )
344
345                score = float(response.choices[0].message.content.strip())
346
347                return Evaluation(
348                    name="llm_composite_score",
349                    value=score,
350                    comment="LLM-synthesized composite score"
351                )
352            ```
353
354            Context-aware composite:
355            ```python
356            def context_composite(*, input, output, expected_output, metadata, evaluations):
357                # Adjust weighting based on metadata
358                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}
359
360                # If metadata indicates high importance, prioritize accuracy
361                if metadata and metadata.get('importance') == 'high':
362                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
363                else:
364                    weights = base_weights
365
366                total = sum(
367                    e.value * weights.get(e.name, 0)
368                    for e in evaluations
369                    if isinstance(e.value, (int, float))
370                )
371
372                return Evaluation(
373                    name="weighted_composite",
374                    value=total,
375                    comment="Context-aware weighted composite"
376                )
377            ```
378        """
379        ...

Protocol defining the interface for composite evaluator functions.

Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.

Composite evaluators:

  • Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
  • Return either a single Evaluation, a list of Evaluations, or a dict
  • Can be either synchronous or asynchronous
  • Have access to both raw item data and evaluation results
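
A minimal sketch of a conforming composite evaluator, assuming `Evaluation` accepts the name/value/comment keywords used in the examples above; it simply averages all numeric item-level scores:

```python
from langfuse import Evaluation

def mean_composite(*, input=None, output=None, expected_output=None,
                   metadata=None, evaluations, **kwargs):
    # Average every numeric item-level score into a single composite value.
    numeric = [e.value for e in evaluations if isinstance(e.value, (int, float))]
    value = sum(numeric) / len(numeric) if numeric else 0.0
    return Evaluation(
        name="mean_composite",
        value=value,
        comment=f"Mean of {len(numeric)} numeric scores",
    )
```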
CompositeEvaluatorFunction(*args, **kwargs)
class EvaluatorStats:
382class EvaluatorStats:
383    """Statistics for a single evaluator's performance during batch evaluation.
384
385    This class tracks detailed metrics about how a specific evaluator performed
386    across all items in a batch evaluation run. It helps identify evaluator issues,
387    understand reliability, and optimize evaluation pipelines.
388
389    Attributes:
390        name: The name of the evaluator function (extracted from __name__).
391        total_runs: Total number of times the evaluator was invoked.
392        successful_runs: Number of times the evaluator completed successfully.
393        failed_runs: Number of times the evaluator raised an exception or failed.
394        total_scores_created: Total number of evaluation scores created by this evaluator.
395            Can be higher than successful_runs if the evaluator returns multiple scores.
396
397    Examples:
398        Accessing evaluator stats from batch evaluation result:
399        ```python
400        result = client.run_batched_evaluation(...)
401
402        for stats in result.evaluator_stats:
403            print(f"Evaluator: {stats.name}")
404            print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
405            print(f"  Scores created: {stats.total_scores_created}")
406
407            if stats.failed_runs > 0:
408                print(f"  ⚠️  Failed {stats.failed_runs} times")
409        ```
410
411        Identifying problematic evaluators:
412        ```python
413        result = client.run_batched_evaluation(...)
414
415        # Find evaluators with high failure rates
416        for stats in result.evaluator_stats:
417            failure_rate = stats.failed_runs / stats.total_runs
418            if failure_rate > 0.1:  # More than 10% failures
419                print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
420                print(f"    Consider debugging or removing this evaluator")
421        ```
422
423    Note:
424        All arguments must be passed as keywords when instantiating this class.
425    """
426
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Statistics for a single evaluator's performance during batch evaluation.

This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.

Attributes:
  • name: The name of the evaluator function (extracted from __name__).
  • total_runs: Total number of times the evaluator was invoked.
  • successful_runs: Number of times the evaluator completed successfully.
  • failed_runs: Number of times the evaluator raised an exception or failed.
  • total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:

Accessing evaluator stats from batch evaluation result:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    print(f"Evaluator: {stats.name}")
    print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  âš ī¸  Failed {stats.failed_runs} times")

Identifying problematic evaluators:

result = client.run_batched_evaluation(...)

# Find evaluators with high failure rates
for stats in result.evaluator_stats:
    failure_rate = stats.failed_runs / stats.total_runs
    if failure_rate > 0.1:  # More than 10% failures
        print(f"âš ī¸  {stats.name} has {failure_rate:.1%} failure rate")
        print(f"    Consider debugging or removing this evaluator")
Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorStats( *, name: str, total_runs: int = 0, successful_runs: int = 0, failed_runs: int = 0, total_scores_created: int = 0)
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Initialize EvaluatorStats with the provided metrics.

Arguments:
  • name: The evaluator function name.
  • total_runs: Total number of evaluator invocations.
  • successful_runs: Number of successful completions.
  • failed_runs: Number of failures.
  • total_scores_created: Total scores created by this evaluator.
Note:

All arguments must be provided as keywords.

name
total_runs
successful_runs
failed_runs
total_scores_created
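
A small hypothetical helper (not part of the SDK), built only on the attributes listed above, that summarizes evaluator performance while guarding against division by zero when total_runs is 0:

```python
from typing import List
from langfuse import EvaluatorStats

def summarize_evaluator_stats(stats_list: List[EvaluatorStats]) -> None:
    # Print a one-line summary per evaluator, guarding against zero total_runs.
    for stats in stats_list:
        rate = stats.successful_runs / stats.total_runs if stats.total_runs else 0.0
        print(f"{stats.name}: {rate:.1%} success, "
              f"{stats.failed_runs} failed, {stats.total_scores_created} scores")
```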
class BatchEvaluationResumeToken:
455class BatchEvaluationResumeToken:
456    """Token for resuming a failed batch evaluation run.
457
458    This class encapsulates all the information needed to resume a batch evaluation
459    that was interrupted or failed partway through. It uses timestamp-based filtering
460    to avoid re-processing items that were already evaluated, even if the underlying
461    dataset changed between runs.
462
463    Attributes:
464        scope: The type of items being evaluated ("traces", "observations").
465        filter: The original JSON filter string used to query items.
466        last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item.
467            Used to construct a filter that only fetches items after this timestamp.
468        last_processed_id: The ID of the last successfully processed item, for reference.
469        items_processed: Count of items successfully processed before interruption.
470
471    Examples:
472        Resuming a failed batch evaluation:
473        ```python
474        # Initial run that fails partway through
475        try:
476            result = client.run_batched_evaluation(
477                scope="traces",
478                mapper=my_mapper,
479                evaluators=[evaluator1, evaluator2],
480                filter='{"tags": ["production"]}',
481                max_items=10000
482            )
483        except Exception as e:
484            print(f"Evaluation failed: {e}")
485
486            # Save the resume token
487            if result.resume_token:
488                # Store resume token for later (e.g., in a file or database)
489                import json
490                with open("resume_token.json", "w") as f:
491                    json.dump({
492                        "scope": result.resume_token.scope,
493                        "filter": result.resume_token.filter,
494                        "last_timestamp": result.resume_token.last_processed_timestamp,
495                        "last_id": result.resume_token.last_processed_id,
496                        "items_done": result.resume_token.items_processed
497                    }, f)
498
499        # Later, resume from where it left off
500        with open("resume_token.json") as f:
501            token_data = json.load(f)
502
503        resume_token = BatchEvaluationResumeToken(
504            scope=token_data["scope"],
505            filter=token_data["filter"],
506            last_processed_timestamp=token_data["last_timestamp"],
507            last_processed_id=token_data["last_id"],
508            items_processed=token_data["items_done"]
509        )
510
511        # Resume the evaluation
512        result = client.run_batched_evaluation(
513            scope="traces",
514            mapper=my_mapper,
515            evaluators=[evaluator1, evaluator2],
516            resume_from=resume_token
517        )
518
519        print(f"Processed {result.total_items_processed} additional items")
520        ```
521
522        Handling partial completion:
523        ```python
524        result = client.run_batched_evaluation(...)
525
526        if not result.completed:
527            print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
528            print(f"Last item: {result.resume_token.last_processed_id}")
529            print(f"Resume from: {result.resume_token.last_processed_timestamp}")
530
531            # Optionally retry automatically
532            if result.resume_token:
533                print("Retrying...")
534                result = client.run_batched_evaluation(
535                    scope=result.resume_token.scope,
536                    mapper=my_mapper,
537                    evaluators=my_evaluators,
538                    resume_from=result.resume_token
539                )
540        ```
541
542    Note:
543        All arguments must be passed as keywords when instantiating this class.
544        The timestamp-based approach means that items created after the initial run
545        but before the timestamp will be skipped. This is intentional to avoid
546        duplicates and ensure consistent evaluation.
547    """
548
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Token for resuming a failed batch evaluation run.

This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.

Attributes:
  • scope: The type of items being evaluated ("traces", "observations").
  • filter: The original JSON filter string used to query items.
  • last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
  • last_processed_id: The ID of the last successfully processed item, for reference.
  • items_processed: Count of items successfully processed before interruption.
Examples:

Resuming a failed batch evaluation:

# Initial run that may stop partway through
result = None
try:
    result = client.run_batched_evaluation(
        scope="traces",
        mapper=my_mapper,
        evaluators=[evaluator1, evaluator2],
        filter='{"tags": ["production"]}',
        max_items=10000
    )
except Exception as e:
    print(f"Evaluation failed: {e}")

# Save the resume token if the run did not complete
if result is not None and result.resume_token:
    # Store resume token for later (e.g., in a file or database)
    import json
    with open("resume_token.json", "w") as f:
        json.dump({
            "scope": result.resume_token.scope,
            "filter": result.resume_token.filter,
            "last_timestamp": result.resume_token.last_processed_timestamp,
            "last_id": result.resume_token.last_processed_id,
            "items_done": result.resume_token.items_processed
        }, f)

# Later, resume from where it left off
with open("resume_token.json") as f:
    token_data = json.load(f)

resume_token = BatchEvaluationResumeToken(
    scope=token_data["scope"],
    filter=token_data["filter"],
    last_processed_timestamp=token_data["last_timestamp"],
    last_processed_id=token_data["last_id"],
    items_processed=token_data["items_done"]
)

# Resume the evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    resume_from=resume_token
)

print(f"Processed {result.total_items_processed} additional items")

Handling partial completion:

result = client.run_batched_evaluation(...)

if not result.completed:
    print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
    print(f"Last item: {result.resume_token.last_processed_id}")
    print(f"Resume from: {result.resume_token.last_processed_timestamp}")

    # Optionally retry automatically
    if result.resume_token:
        print("Retrying...")
        result = client.run_batched_evaluation(
            scope=result.resume_token.scope,
            mapper=my_mapper,
            evaluators=my_evaluators,
            resume_from=result.resume_token
        )
Note:

All arguments must be passed as keywords when instantiating this class. The timestamp-based approach means that items created after the initial run but before the timestamp will be skipped. This is intentional to avoid duplicates and ensure consistent evaluation.

BatchEvaluationResumeToken( *, scope: str, filter: Optional[str], last_processed_timestamp: str, last_processed_id: str, items_processed: int)
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Initialize BatchEvaluationResumeToken with the provided state.

Arguments:
  • scope: The scope type ("traces", "observations").
  • filter: The original JSON filter string.
  • last_processed_timestamp: ISO 8601 timestamp of last processed item.
  • last_processed_id: ID of last processed item.
  • items_processed: Count of items processed before interruption.
Note:

All arguments must be provided as keywords.

scope
filter
last_processed_timestamp
last_processed_id
items_processed
class BatchEvaluationResult:
577class BatchEvaluationResult:
578    r"""Complete result structure for batch evaluation execution.
579
580    This class encapsulates comprehensive statistics and metadata about a batch
581    evaluation run, including counts, evaluator-specific metrics, timing information,
582    error details, and resume capability.
583
584    Attributes:
585        total_items_fetched: Total number of items fetched from the API.
586        total_items_processed: Number of items successfully evaluated.
587        total_items_failed: Number of items that failed during evaluation.
588        total_scores_created: Total scores created by all item-level evaluators.
589        total_composite_scores_created: Scores created by the composite evaluator.
590        total_evaluations_failed: Number of individual evaluator failures across all items.
591        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
592        resume_token: Token for resuming if evaluation was interrupted (None if completed).
593        completed: True if all items were processed, False if stopped early or failed.
594        duration_seconds: Total time taken to execute the batch evaluation.
595        failed_item_ids: List of IDs for items that failed evaluation.
596        error_summary: Dictionary mapping error types to occurrence counts.
597        has_more_items: True if max_items limit was reached but more items exist.
598        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
599
600    Examples:
601        Basic result inspection:
602        ```python
603        result = client.run_batched_evaluation(...)
604
605        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
606        print(f"Scores created: {result.total_scores_created}")
607        print(f"Duration: {result.duration_seconds:.2f}s")
608        print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
609        ```
610
611        Detailed analysis with evaluator stats:
612        ```python
613        result = client.run_batched_evaluation(...)
614
615        print(f"\n📊 Batch Evaluation Results")
616        print(f"{'='*50}")
617        print(f"Items processed: {result.total_items_processed}")
618        print(f"Items failed: {result.total_items_failed}")
619        print(f"Scores created: {result.total_scores_created}")
620
621        if result.total_composite_scores_created > 0:
622            print(f"Composite scores: {result.total_composite_scores_created}")
623
624        print(f"\n📈 Evaluator Performance:")
625        for stats in result.evaluator_stats:
626            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
627            print(f"\n  {stats.name}:")
628            print(f"    Success rate: {success_rate:.1%}")
629            print(f"    Scores created: {stats.total_scores_created}")
630            if stats.failed_runs > 0:
631                print(f"    ⚠️  Failures: {stats.failed_runs}")
632
633        if result.error_summary:
634            print(f"\n⚠️  Errors encountered:")
635            for error_type, count in result.error_summary.items():
636                print(f"    {error_type}: {count}")
637        ```
638
639        Handling incomplete runs:
640        ```python
641        result = client.run_batched_evaluation(...)
642
643        if not result.completed:
644            print("⚠️  Evaluation incomplete!")
645
646            if result.resume_token:
647                print(f"Processed {result.resume_token.items_processed} items before failure")
648                print(f"Use resume_from parameter to continue from:")
649                print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
650                print(f"  Last ID: {result.resume_token.last_processed_id}")
651
652        if result.has_more_items:
653            print(f"ℹ️  More items available beyond max_items limit")
654        ```
655
656        Performance monitoring:
657        ```python
658        result = client.run_batched_evaluation(...)
659
660        items_per_second = result.total_items_processed / result.duration_seconds
661        avg_scores_per_item = result.total_scores_created / result.total_items_processed
662
663        print(f"Performance metrics:")
664        print(f"  Throughput: {items_per_second:.2f} items/second")
665        print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
666        print(f"  Total duration: {result.duration_seconds:.2f}s")
667
668        if result.total_evaluations_failed > 0:
669            failure_rate = result.total_evaluations_failed / (
670                result.total_items_processed * len(result.evaluator_stats)
671            )
672            print(f"  Evaluation failure rate: {failure_rate:.1%}")
673        ```
674
675    Note:
676        All arguments must be passed as keywords when instantiating this class.
677    """
678
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations
732
733    def __str__(self) -> str:
734        """Return a formatted string representation of the batch evaluation results.
735
736        Returns:
737            A multi-line string with a summary of the evaluation results.
738        """
739        lines = []
740        lines.append("=" * 60)
741        lines.append("Batch Evaluation Results")
742        lines.append("=" * 60)
743
744        # Summary statistics
745        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
746        lines.append(f"Duration: {self.duration_seconds:.2f}s")
747        lines.append(f"\nItems fetched: {self.total_items_fetched}")
748        lines.append(f"Items processed: {self.total_items_processed}")
749
750        if self.total_items_failed > 0:
751            lines.append(f"Items failed: {self.total_items_failed}")
752
753        # Success rate
754        if self.total_items_fetched > 0:
755            success_rate = self.total_items_processed / self.total_items_fetched * 100
756            lines.append(f"Success rate: {success_rate:.1f}%")
757
758        # Scores created
759        lines.append(f"\nScores created: {self.total_scores_created}")
760        if self.total_composite_scores_created > 0:
761            lines.append(f"Composite scores: {self.total_composite_scores_created}")
762
763        total_scores = self.total_scores_created + self.total_composite_scores_created
764        lines.append(f"Total scores: {total_scores}")
765
766        # Evaluator statistics
767        if self.evaluator_stats:
768            lines.append("\nEvaluator Performance:")
769            for stats in self.evaluator_stats:
770                lines.append(f"  {stats.name}:")
771                if stats.total_runs > 0:
772                    success_rate = (
773                        stats.successful_runs / stats.total_runs * 100
774                        if stats.total_runs > 0
775                        else 0
776                    )
777                    lines.append(
778                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
779                        f"({success_rate:.1f}% success)"
780                    )
781                    lines.append(f"    Scores created: {stats.total_scores_created}")
782                    if stats.failed_runs > 0:
783                        lines.append(f"    Failed runs: {stats.failed_runs}")
784
785        # Performance metrics
786        if self.total_items_processed > 0 and self.duration_seconds > 0:
787            items_per_sec = self.total_items_processed / self.duration_seconds
788            lines.append("\nPerformance:")
789            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
790            if self.total_scores_created > 0:
791                avg_scores = self.total_scores_created / self.total_items_processed
792                lines.append(f"  Avg scores per item: {avg_scores:.2f}")
793
794        # Errors and warnings
795        if self.error_summary:
796            lines.append("\nErrors encountered:")
797            for error_type, count in self.error_summary.items():
798                lines.append(f"  {error_type}: {count}")
799
800        # Incomplete run information
801        if not self.completed:
802            lines.append("\nWarning: Evaluation incomplete")
803            if self.resume_token:
804                lines.append(
805                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
806                )
807                lines.append(f"  Items processed: {self.resume_token.items_processed}")
808                lines.append("  Use resume_from parameter to continue")
809
810        if self.has_more_items:
811            lines.append("\nNote: More items available beyond max_items limit")
812
813        lines.append("=" * 60)
814        return "\n".join(lines)

Complete result structure for batch evaluation execution.

This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.

Attributes:
  • total_items_fetched: Total number of items fetched from the API.
  • total_items_processed: Number of items successfully evaluated.
  • total_items_failed: Number of items that failed during evaluation.
  • total_scores_created: Total scores created by all item-level evaluators.
  • total_composite_scores_created: Scores created by the composite evaluator.
  • total_evaluations_failed: Number of individual evaluator failures across all items.
  • evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
  • resume_token: Token for resuming if evaluation was interrupted (None if completed).
  • completed: True if all items were processed, False if stopped early or failed.
  • duration_seconds: Total time taken to execute the batch evaluation.
  • failed_item_ids: List of IDs for items that failed evaluation.
  • error_summary: Dictionary mapping error types to occurrence counts.
  • has_more_items: True if max_items limit was reached but more items exist.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:

Basic result inspection:

result = client.run_batched_evaluation(...)

print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
print(f"Scores created: {result.total_scores_created}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")

Detailed analysis with evaluator stats:

result = client.run_batched_evaluation(...)

print(f"\n📊 Batch Evaluation Results")
print(f"{'='*50}")
print(f"Items processed: {result.total_items_processed}")
print(f"Items failed: {result.total_items_failed}")
print(f"Scores created: {result.total_scores_created}")

if result.total_composite_scores_created > 0:
    print(f"Composite scores: {result.total_composite_scores_created}")

print(f"\n📈 Evaluator Performance:")
for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
    print(f"\n  {stats.name}:")
    print(f"    Success rate: {success_rate:.1%}")
    print(f"    Scores created: {stats.total_scores_created}")
    if stats.failed_runs > 0:
        print(f"    âš ī¸  Failures: {stats.failed_runs}")

if result.error_summary:
    print(f"\nâš ī¸  Errors encountered:")
    for error_type, count in result.error_summary.items():
        print(f"    {error_type}: {count}")

Handling incomplete runs:

result = client.run_batched_evaluation(...)

if not result.completed:
    print("âš ī¸  Evaluation incomplete!")

    if result.resume_token:
        print(f"Processed {result.resume_token.items_processed} items before failure")
        print(f"Use resume_from parameter to continue from:")
        print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
        print(f"  Last ID: {result.resume_token.last_processed_id}")

if result.has_more_items:
    print(f"â„šī¸  More items available beyond max_items limit")

Performance monitoring:

result = client.run_batched_evaluation(...)

items_per_second = result.total_items_processed / result.duration_seconds
avg_scores_per_item = result.total_scores_created / result.total_items_processed

print(f"Performance metrics:")
print(f"  Throughput: {items_per_second:.2f} items/second")
print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
print(f"  Total duration: {result.duration_seconds:.2f}s")

if result.total_evaluations_failed > 0:
    failure_rate = result.total_evaluations_failed / (
        result.total_items_processed * len(result.evaluator_stats)
    )
    print(f"  Evaluation failure rate: {failure_rate:.1%}")
Note:

All arguments must be passed as keywords when instantiating this class.

BatchEvaluationResult( *, total_items_fetched: int, total_items_processed: int, total_items_failed: int, total_scores_created: int, total_composite_scores_created: int, total_evaluations_failed: int, evaluator_stats: List[EvaluatorStats], resume_token: Optional[BatchEvaluationResumeToken], completed: bool, duration_seconds: float, failed_item_ids: List[str], error_summary: Dict[str, int], has_more_items: bool, item_evaluations: Dict[str, List[Evaluation]])
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations

Initialize BatchEvaluationResult with comprehensive statistics.

Arguments:
  • total_items_fetched: Total items fetched from API.
  • total_items_processed: Items successfully evaluated.
  • total_items_failed: Items that failed evaluation.
  • total_scores_created: Scores from item-level evaluators.
  • total_composite_scores_created: Scores from composite evaluator.
  • total_evaluations_failed: Individual evaluator failures.
  • evaluator_stats: Per-evaluator statistics.
  • resume_token: Token for resuming (None if completed).
  • completed: Whether all items were processed.
  • duration_seconds: Total execution time.
  • failed_item_ids: IDs of failed items.
  • error_summary: Error types and counts.
  • has_more_items: Whether more items exist beyond max_items.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:

All arguments must be provided as keywords.

total_items_fetched
total_items_processed
total_items_failed
total_scores_created
total_composite_scores_created
total_evaluations_failed
evaluator_stats
resume_token
completed
duration_seconds
failed_item_ids
error_summary
has_more_items
item_evaluations
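
A short sketch, assuming each stored `Evaluation` exposes the `name` and `value` fields used in the examples above, showing how to inspect per-item results via `item_evaluations`:

```python
result = client.run_batched_evaluation(...)

# item_evaluations maps each item ID to the Evaluation objects created for it,
# including any composite evaluations.
for item_id, evaluations in result.item_evaluations.items():
    for evaluation in evaluations:
        print(f"{item_id}: {evaluation.name} = {evaluation.value}")
```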