langfuse
# reST-style include directive used as the module docstring; presumably expanded
# by the documentation generator into the README contents — confirm with the docs build.
""".. include:: ../README.md"""

from langfuse.batch_evaluation import (
    BatchEvaluationResult,
    BatchEvaluationResumeToken,
    CompositeEvaluatorFunction,
    EvaluatorInputs,
    EvaluatorStats,
    MapperFunction,
)
from langfuse.experiment import Evaluation, RegressionError, RunnerContext

from ._client import client as _client_module
from ._client.attributes import LangfuseOtelSpanAttributes
from ._client.constants import ObservationTypeLiteral
from ._client.get_client import get_client
from ._client.observe import observe
from ._client.propagation import propagate_attributes
from ._client.span import (
    LangfuseAgent,
    LangfuseChain,
    LangfuseEmbedding,
    LangfuseEvaluator,
    LangfuseEvent,
    LangfuseGeneration,
    LangfuseGuardrail,
    LangfuseRetriever,
    LangfuseSpan,
    LangfuseTool,
)
from ._version import __version__
from .media import LangfuseMedia, LangfuseMediaReference
from .span_filter import (
    KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES,
    is_default_export_span,
    is_genai_span,
    is_known_llm_instrumentor,
    is_langfuse_span,
)
from .types import (
    MaskOtelSpansFunction,
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanData,
    OtelSpanIdentifier,
    OtelSpanPatch,
)

# Re-export the client class at package level. It is read as an attribute of the
# imported `_client.client` module instead of being imported by name.
Langfuse = _client_module.Langfuse

# Names exported by `from langfuse import *` and treated as the public API.
__all__ = [
    # Client, media helpers and entry points
    "Langfuse",
    "LangfuseMedia",
    "LangfuseMediaReference",
    "get_client",
    "observe",
    "propagate_attributes",
    # Observation types and span attribute keys
    "ObservationTypeLiteral",
    "LangfuseSpan",
    "LangfuseGeneration",
    "LangfuseEvent",
    "LangfuseOtelSpanAttributes",
    "LangfuseAgent",
    "LangfuseTool",
    "LangfuseChain",
    "LangfuseEmbedding",
    "LangfuseEvaluator",
    "LangfuseRetriever",
    "LangfuseGuardrail",
    # Evaluation / experiment helpers
    "Evaluation",
    "EvaluatorInputs",
    "MapperFunction",
    "CompositeEvaluatorFunction",
    "EvaluatorStats",
    "BatchEvaluationResumeToken",
    "BatchEvaluationResult",
    "RunnerContext",
    "RegressionError",
    "__version__",
    # Span export filters
    "is_default_export_span",
    "is_langfuse_span",
    "is_genai_span",
    "is_known_llm_instrumentor",
    "KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES",
    # OpenTelemetry span masking types
    "MaskOtelSpansFunction",
    "MaskOtelSpansParams",
    "MaskOtelSpansResult",
    "OtelSpanData",
    "OtelSpanIdentifier",
    "OtelSpanPatch",
    # NOTE(review): "experiment" becomes a package attribute through the
    # `langfuse.experiment` import above; "api" is not bound by any import
    # visible in this file — presumably loaded as a side effect of the
    # `_client` imports. Confirm before relying on `from langfuse import *`.
    "experiment",
    "api",
]
class Langfuse:
    """Main client for Langfuse tracing and platform features.

    This class provides an interface for creating and managing traces, spans,
    and generations in Langfuse as well as interacting with the Langfuse API.

    The client features a thread-safe singleton pattern for each unique public API key,
    ensuring consistent trace context propagation across your application. It implements
    efficient batching of spans with configurable flush settings and includes background
    thread management for media uploads and score ingestion.

    Configuration is flexible through either direct parameters or environment variables,
    with graceful fallbacks and runtime configuration updates.

    Attributes:
        api: Synchronous API client for Langfuse backend communication
        async_api: Asynchronous API client for Langfuse backend communication
        _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components

    Parameters:
        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
            **Fork safety**: ``httpx.Client`` is thread-safe but not process-safe. When using
            ``fork()``-based servers (e.g. Gunicorn with ``--preload``), the SDK automatically
            recreates its internally-managed HTTP client in child processes after fork. A custom
            ``httpx_client`` is intentionally left as-is (the fork-inherited copy is reused), so
            you retain the opportunity to handle process-safety yourself — for example by
            registering your own ``os.register_at_fork(after_in_child=...)`` handler to close and
            reopen connections on the custom client.
        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable. An explicit 0.0 is honored and is not replaced by the environment/default value.
        mask (Optional[MaskFunction]): Function to mask sensitive data synchronously when Langfuse SDK attributes are created. This applies only to data set through Langfuse SDK APIs such as `start_observation()`, `update()`, and `set_trace_io()`.
        mask_otel_spans (Optional[MaskOtelSpansFunction]): Synchronous export-stage hook for masking raw OpenTelemetry span attributes before this Langfuse client sends them to Langfuse. Use this for spans created by third-party OpenTelemetry instrumentations, or when you need to inspect final span attributes after export filtering and Langfuse media handling. It does not modify spans already exported through other OpenTelemetry exporters.

            The hook receives one OpenTelemetry export batch. A batch is not guaranteed to contain a complete trace, request, or Langfuse observation tree. The hook usually runs on the OpenTelemetry batch span processor worker thread; during `flush()` and shutdown it may run on the caller thread. Keep it synchronous, deterministic, and fast.

            Return `None` to leave the batch unchanged. Return `MaskOtelSpansResult` with `OtelSpanPatch` values to delete or replace attributes on selected spans. If the hook raises or returns an invalid batch result, Langfuse drops the whole export batch. If one returned span patch is invalid, Langfuse drops only that span from the Langfuse export.

            Example:
            ```python
            from typing import Optional

            from langfuse import Langfuse
            from langfuse.types import (
                MaskOtelSpansParams,
                MaskOtelSpansResult,
                OtelSpanPatch,
            )

            def mask_otel_spans(
                *, params: MaskOtelSpansParams
            ) -> Optional[MaskOtelSpansResult]:
                patches = {}

                for identifier, span in params.spans.items():
                    if "gen_ai.prompt.0.content" in span.attributes:
                        patches[identifier] = OtelSpanPatch(
                            delete_attributes=("gen_ai.prompt.0.content",),
                            set_attributes={"masking.applied": True},
                        )

                return MaskOtelSpansResult(span_patches=patches)

            langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
            ```
        blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:
            ```python
            from langfuse.span_filter import is_default_export_span
            blocked = {"sqlite", "requests"}

            should_export_span = lambda span: (
                is_default_export_span(span)
                and (
                    span.instrumentation_scope is None
                    or span.instrumentation_scope.name not in blocked
                )
            )
            ```
        should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If `span_exporter` is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
        tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
        id_generator (Optional[IdGenerator]): OpenTelemetry ID generator to use when Langfuse creates its own TracerProvider. If omitted, the OpenTelemetry SDK default is used. If `tracer_provider` is provided, or an OpenTelemetry TracerProvider is already registered globally, configure the ID generator on that provider instead.
        span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire `base_url`, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include `x-langfuse-ingestion-version=4` on the exporter to enable real time processing of exported spans.

    Example:
        ```python
        from langfuse import Langfuse

        # Initialize the client (reads from env vars if not provided)
        langfuse = Langfuse(
            public_key="your-public-key",
            secret_key="your-secret-key",
            base_url="https://cloud.langfuse.com",  # Optional, default shown
        )

        # Create a trace span
        with langfuse.start_as_current_observation(name="process-query") as span:
            # Your application code here

            # Create a nested generation span for an LLM call
            with span.start_as_current_generation(
                name="generate-response",
                model="gpt-4",
                input={"query": "Tell me about AI"},
                model_parameters={"temperature": 0.7, "max_tokens": 500}
            ) as generation:
                # Generate response here
                response = "AI is a field of computer science..."

                generation.update(
                    output=response,
                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
                    cost_details={"total_cost": 0.0023}
                )

                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
        ```
    """

    # Stays None when the client is disabled because a key is missing; the
    # `api` / `async_api` properties rely on that to signal "not initialized".
    _resources: Optional[LangfuseResourceManager] = None
    _mask: Optional[MaskFunction] = None
    _otel_tracer: otel_trace_api.Tracer

    def __init__(
        self,
        *,
        public_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        base_url: Optional[str] = None,
        host: Optional[str] = None,
        timeout: Optional[int] = None,
        httpx_client: Optional[httpx.Client] = None,
        debug: bool = False,
        tracing_enabled: Optional[bool] = True,
        flush_at: Optional[int] = None,
        flush_interval: Optional[float] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        media_upload_thread_count: Optional[int] = None,
        sample_rate: Optional[float] = None,
        mask: Optional[MaskFunction] = None,
        mask_otel_spans: Optional[MaskOtelSpansFunction] = None,
        blocked_instrumentation_scopes: Optional[List[str]] = None,
        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
        additional_headers: Optional[Dict[str, str]] = None,
        tracer_provider: Optional[TracerProvider] = None,
        id_generator: Optional[IdGenerator] = None,
        span_exporter: Optional[SpanExporter] = None,
    ):
        # Resolution order: explicit base_url, LANGFUSE_BASE_URL, deprecated
        # `host` argument, LANGFUSE_HOST, then the cloud default.
        self._base_url = (
            base_url
            or os.environ.get(LANGFUSE_BASE_URL)
            or host
            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
        )
        # The cast only satisfies the type checker; the value is None when
        # neither the argument nor the environment variable is set.
        self._environment = environment or cast(
            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
        )
        self._release = (
            release
            or os.environ.get(LANGFUSE_RELEASE, None)
            or get_common_release_envs()
        )
        self._project_id: Optional[str] = None

        # Compare against None instead of relying on truthiness: 0.0 is a valid
        # sampling rate and must not be replaced by the environment/default.
        if sample_rate is None:
            sample_rate = float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
        if not 0.0 <= sample_rate <= 1.0:
            raise ValueError(
                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
            )

        # NOTE: `or` is kept on purpose here, so a timeout of 0 (or None) falls
        # back to LANGFUSE_TIMEOUT / 5 seconds.
        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))

        # Tracing is on only if the argument is truthy AND the environment
        # variable is not the literal string "false" (case-insensitive).
        self._tracing_enabled = (
            tracing_enabled
            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
        )
        if not self._tracing_enabled:
            langfuse_logger.info(
                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
            )

        debug = (
            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
        )
        if debug:
            logging.basicConfig(
                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            )
            langfuse_logger.setLevel(logging.DEBUG)

        # Without both keys the client is disabled: a no-op tracer is installed
        # and `_resources` keeps its class-level default of None.
        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
        if public_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
            )
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
        if secret_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
            )
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
            langfuse_logger.warning(
                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
            )

        if blocked_instrumentation_scopes is not None:
            # stacklevel=2 attributes the warning to the caller constructing the client.
            warnings.warn(
                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
                "Use `should_export_span` instead. Example: "
                "from langfuse.span_filter import is_default_export_span; "
                'blocked={"scope"}; should_export_span=lambda span: '
                "is_default_export_span(span) and (span.instrumentation_scope is None or "
                "span.instrumentation_scope.name not in blocked).",
                DeprecationWarning,
                stacklevel=2,
            )

        # Initialize api and tracer if requirements are met
        # NOTE(review): the raw `release` argument is forwarded here, not the
        # resolved `self._release` (unlike `environment`) — confirm whether the
        # resource manager resolves the environment fallbacks itself.
        self._resources = LangfuseResourceManager(
            public_key=public_key,
            secret_key=secret_key,
            base_url=self._base_url,
            timeout=timeout,
            environment=self._environment,
            release=release,
            flush_at=flush_at,
            flush_interval=flush_interval,
            httpx_client=httpx_client,
            media_upload_thread_count=media_upload_thread_count,
            sample_rate=sample_rate,
            mask=mask,
            mask_otel_spans=mask_otel_spans,
            tracing_enabled=self._tracing_enabled,
            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
            should_export_span=should_export_span,
            additional_headers=additional_headers,
            tracer_provider=tracer_provider,
            id_generator=id_generator,
            span_exporter=span_exporter,
        )
        self._mask = self._resources.mask

        # Fall back to a no-op tracer when tracing is disabled or the resource
        # manager did not provide a tracer.
        self._otel_tracer = (
            self._resources.tracer
            if self._tracing_enabled and self._resources.tracer is not None
            else otel_trace_api.NoOpTracer()
        )

    @property
    def api(self) -> LangfuseAPI:
        """Synchronous API client held by the resource manager.

        Raises:
            AttributeError: If the client has no resource manager, e.g. because
                it was constructed without a public or secret key.
        """
        if self._resources is None:
            raise AttributeError("Langfuse client is not initialized")

        return self._resources.api

    @api.setter
    def api(self, value: LangfuseAPI) -> None:
        """Replace the synchronous API client on the resource manager.

        Raises:
            AttributeError: If the client has no resource manager.
        """
        if self._resources is None:
            raise AttributeError("Langfuse client is not initialized")

        self._resources.api = value

    @property
    def async_api(self) -> AsyncLangfuseAPI:
        """Asynchronous API client held by the resource manager.

        Raises:
            AttributeError: If the client has no resource manager, e.g. because
                it was constructed without a public or secret key.
        """
        if self._resources is None:
            raise AttributeError("Langfuse client is not initialized")

        return self._resources.async_api

    @async_api.setter
    def async_api(self, value: AsyncLangfuseAPI) -> None:
        """Replace the asynchronous API client on the resource manager.

        Raises:
            AttributeError: If the client has no resource manager.
        """
        if self._resources is None:
            raise AttributeError("Langfuse client is not initialized")

        self._resources.async_api = value

    # Typing-only overload: as_type="generation" -> LangfuseGeneration
    # (accepts the model/usage/cost/prompt parameters).
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseGeneration: ...

    # Typing-only overload: as_type="span" (the default) -> LangfuseSpan.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseSpan: ...

    # Typing-only overload: as_type="agent" -> LangfuseAgent.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseAgent: ...
    # Typing-only overload: as_type="tool" -> LangfuseTool.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["tool"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseTool: ...

    # Typing-only overload: as_type="chain" -> LangfuseChain.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseChain: ...

    # Typing-only overload: as_type="retriever" -> LangfuseRetriever.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseRetriever: ...

    # Typing-only overload: as_type="evaluator" -> LangfuseEvaluator.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseEvaluator: ...
    # Typing-only overload: as_type="embedding" -> LangfuseEmbedding.
    # Like the "generation" overload, it accepts the model/usage/cost/prompt parameters.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseEmbedding: ...

    # Typing-only overload: as_type="guardrail" -> LangfuseGuardrail.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseGuardrail: ...
597 598 def start_observation( 599 self, 600 *, 601 trace_context: Optional[TraceContext] = None, 602 name: str, 603 as_type: ObservationTypeLiteralNoEvent = "span", 604 input: Optional[Any] = None, 605 output: Optional[Any] = None, 606 metadata: Optional[Any] = None, 607 version: Optional[str] = None, 608 level: Optional[SpanLevel] = None, 609 status_message: Optional[str] = None, 610 completion_start_time: Optional[datetime] = None, 611 model: Optional[str] = None, 612 model_parameters: Optional[Dict[str, MapValue]] = None, 613 usage_details: Optional[Dict[str, int]] = None, 614 cost_details: Optional[Dict[str, float]] = None, 615 prompt: Optional[PromptClient] = None, 616 ) -> Union[ 617 LangfuseSpan, 618 LangfuseGeneration, 619 LangfuseAgent, 620 LangfuseTool, 621 LangfuseChain, 622 LangfuseRetriever, 623 LangfuseEvaluator, 624 LangfuseEmbedding, 625 LangfuseGuardrail, 626 ]: 627 """Create a new observation of the specified type. 628 629 This method creates a new observation but does not set it as the current span in the 630 context. To create and use an observation within a context, use start_as_current_observation(). 
631 632 Args: 633 trace_context: Optional context for connecting to an existing trace 634 name: Name of the observation 635 as_type: Type of observation to create (defaults to "span") 636 input: Input data for the operation 637 output: Output data from the operation 638 metadata: Additional metadata to associate with the observation 639 version: Version identifier for the code or component 640 level: Importance level of the observation 641 status_message: Optional status message for the observation 642 completion_start_time: When the model started generating (for generation types) 643 model: Name/identifier of the AI model used (for generation types) 644 model_parameters: Parameters used for the model (for generation types) 645 usage_details: Token usage information (for generation types) 646 cost_details: Cost information (for generation types) 647 prompt: Associated prompt template (for generation types) 648 649 Returns: 650 An observation object of the appropriate type that must be ended with .end() 651 """ 652 if trace_context: 653 trace_id = trace_context.get("trace_id", None) 654 parent_span_id = trace_context.get("parent_span_id", None) 655 656 if trace_id: 657 remote_parent_span = self._create_remote_parent_span( 658 trace_id=trace_id, parent_span_id=parent_span_id 659 ) 660 661 with otel_trace_api.use_span( 662 cast(otel_trace_api.Span, remote_parent_span) 663 ): 664 otel_span = self._otel_tracer.start_span(name=name) 665 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 666 667 return self._create_observation_from_otel_span( 668 otel_span=otel_span, 669 as_type=as_type, 670 input=input, 671 output=output, 672 metadata=metadata, 673 version=version, 674 level=level, 675 status_message=status_message, 676 completion_start_time=completion_start_time, 677 model=model, 678 model_parameters=model_parameters, 679 usage_details=usage_details, 680 cost_details=cost_details, 681 prompt=prompt, 682 ) 683 684 otel_span = 
self._otel_tracer.start_span(name=name) 685 686 return self._create_observation_from_otel_span( 687 otel_span=otel_span, 688 as_type=as_type, 689 input=input, 690 output=output, 691 metadata=metadata, 692 version=version, 693 level=level, 694 status_message=status_message, 695 completion_start_time=completion_start_time, 696 model=model, 697 model_parameters=model_parameters, 698 usage_details=usage_details, 699 cost_details=cost_details, 700 prompt=prompt, 701 ) 702 703 def _create_observation_from_otel_span( 704 self, 705 *, 706 otel_span: otel_trace_api.Span, 707 as_type: ObservationTypeLiteralNoEvent, 708 input: Optional[Any] = None, 709 output: Optional[Any] = None, 710 metadata: Optional[Any] = None, 711 version: Optional[str] = None, 712 level: Optional[SpanLevel] = None, 713 status_message: Optional[str] = None, 714 completion_start_time: Optional[datetime] = None, 715 model: Optional[str] = None, 716 model_parameters: Optional[Dict[str, MapValue]] = None, 717 usage_details: Optional[Dict[str, int]] = None, 718 cost_details: Optional[Dict[str, float]] = None, 719 prompt: Optional[PromptClient] = None, 720 ) -> Union[ 721 LangfuseSpan, 722 LangfuseGeneration, 723 LangfuseAgent, 724 LangfuseTool, 725 LangfuseChain, 726 LangfuseRetriever, 727 LangfuseEvaluator, 728 LangfuseEmbedding, 729 LangfuseGuardrail, 730 ]: 731 """Create the appropriate observation type from an OTEL span.""" 732 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 733 observation_class = self._get_span_class(as_type) 734 # Type ignore to prevent overloads of internal _get_span_class function, 735 # issue is that LangfuseEvent could be returned and that classes have diff. 
args 736 return observation_class( # type: ignore[return-value,call-arg] 737 otel_span=otel_span, 738 langfuse_client=self, 739 environment=self._environment, 740 release=self._release, 741 input=input, 742 output=output, 743 metadata=metadata, 744 version=version, 745 level=level, 746 status_message=status_message, 747 completion_start_time=completion_start_time, 748 model=model, 749 model_parameters=model_parameters, 750 usage_details=usage_details, 751 cost_details=cost_details, 752 prompt=prompt, 753 ) 754 else: 755 # For other types (e.g. span, guardrail), create appropriate class without generation properties 756 observation_class = self._get_span_class(as_type) 757 # Type ignore to prevent overloads of internal _get_span_class function, 758 # issue is that LangfuseEvent could be returned and that classes have diff. args 759 return observation_class( # type: ignore[return-value,call-arg] 760 otel_span=otel_span, 761 langfuse_client=self, 762 environment=self._environment, 763 release=self._release, 764 input=input, 765 output=output, 766 metadata=metadata, 767 version=version, 768 level=level, 769 status_message=status_message, 770 ) 771 # span._observation_type = as_type 772 # span._otel_span.set_attribute("langfuse.observation.type", as_type) 773 # return span 774 775 @overload 776 def start_as_current_observation( 777 self, 778 *, 779 trace_context: Optional[TraceContext] = None, 780 name: str, 781 as_type: Literal["generation"], 782 input: Optional[Any] = None, 783 output: Optional[Any] = None, 784 metadata: Optional[Any] = None, 785 version: Optional[str] = None, 786 level: Optional[SpanLevel] = None, 787 status_message: Optional[str] = None, 788 completion_start_time: Optional[datetime] = None, 789 model: Optional[str] = None, 790 model_parameters: Optional[Dict[str, MapValue]] = None, 791 usage_details: Optional[Dict[str, int]] = None, 792 cost_details: Optional[Dict[str, float]] = None, 793 prompt: Optional[PromptClient] = None, 794 end_on_exit: 
Optional[bool] = None, 795 ) -> _AgnosticContextManager[LangfuseGeneration]: ... 796 797 @overload 798 def start_as_current_observation( 799 self, 800 *, 801 trace_context: Optional[TraceContext] = None, 802 name: str, 803 as_type: Literal["span"] = "span", 804 input: Optional[Any] = None, 805 output: Optional[Any] = None, 806 metadata: Optional[Any] = None, 807 version: Optional[str] = None, 808 level: Optional[SpanLevel] = None, 809 status_message: Optional[str] = None, 810 end_on_exit: Optional[bool] = None, 811 ) -> _AgnosticContextManager[LangfuseSpan]: ... 812 813 @overload 814 def start_as_current_observation( 815 self, 816 *, 817 trace_context: Optional[TraceContext] = None, 818 name: str, 819 as_type: Literal["agent"], 820 input: Optional[Any] = None, 821 output: Optional[Any] = None, 822 metadata: Optional[Any] = None, 823 version: Optional[str] = None, 824 level: Optional[SpanLevel] = None, 825 status_message: Optional[str] = None, 826 end_on_exit: Optional[bool] = None, 827 ) -> _AgnosticContextManager[LangfuseAgent]: ... 828 829 @overload 830 def start_as_current_observation( 831 self, 832 *, 833 trace_context: Optional[TraceContext] = None, 834 name: str, 835 as_type: Literal["tool"], 836 input: Optional[Any] = None, 837 output: Optional[Any] = None, 838 metadata: Optional[Any] = None, 839 version: Optional[str] = None, 840 level: Optional[SpanLevel] = None, 841 status_message: Optional[str] = None, 842 end_on_exit: Optional[bool] = None, 843 ) -> _AgnosticContextManager[LangfuseTool]: ... 
    # Typing-only overload: as_type="chain" yields a context manager over LangfuseChain.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseChain]: ...

    # Typing-only overload: as_type="retriever" yields a context manager over LangfuseRetriever.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseRetriever]: ...

    # Typing-only overload: as_type="evaluator" yields a context manager over LangfuseEvaluator.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEvaluator]: ...
    # Typing-only overload: as_type="embedding" yields a context manager over
    # LangfuseEmbedding. Like the "generation" overload, it accepts the
    # model/usage/cost/prompt parameters.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEmbedding]: ...

    # Typing-only overload: as_type="guardrail" yields a context manager over LangfuseGuardrail.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGuardrail]: ...
def start_as_current_observation(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    as_type: ObservationTypeLiteralNoEvent = "span",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
    end_on_exit: Optional[bool] = None,
) -> Union[
    _AgnosticContextManager[LangfuseGeneration],
    _AgnosticContextManager[LangfuseSpan],
    _AgnosticContextManager[LangfuseAgent],
    _AgnosticContextManager[LangfuseTool],
    _AgnosticContextManager[LangfuseChain],
    _AgnosticContextManager[LangfuseRetriever],
    _AgnosticContextManager[LangfuseEvaluator],
    _AgnosticContextManager[LangfuseEmbedding],
    _AgnosticContextManager[LangfuseGuardrail],
]:
    """Create a new observation and set it as the current span in a context manager.

    This method creates a new observation of the specified type and sets it as the
    current span within a context manager. Use this method with a 'with' statement to
    automatically handle the observation lifecycle within a code block.

    The created observation will be the child of the current span in the context.
    When ``trace_context`` carries a ``trace_id``, the observation is instead
    attached to that trace (optionally under ``parent_span_id``).

    Args:
        trace_context: Optional context for connecting to an existing trace
        name: Name of the observation (e.g., function or operation name)
        as_type: Type of observation to create (defaults to "span")
        input: Input data for the operation (can be any JSON-serializable object)
        output: Output data from the operation (can be any JSON-serializable object)
        metadata: Additional metadata to associate with the observation
        version: Version identifier for the code or component
        level: Importance level of the observation (info, warning, error)
        status_message: Optional status message for the observation
        end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.

        The following parameters are available when as_type is: "generation" or "embedding".
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management

    Returns:
        A context manager that yields the appropriate observation type based on as_type

    Example:
        ```python
        # Create a span
        with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
            # Do work
            result = process_data()
            span.update(output=result)

            # Create a child span automatically
            with span.start_as_current_observation(name="sub-operation") as child_span:
                # Do sub-operation work
                child_span.update(output="sub-result")

        # Create a tool observation
        with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
            # Do tool work
            results = search_web(query)
            tool.update(output=results)

        # Create a generation observation
        with langfuse.start_as_current_observation(
            name="answer-generation",
            as_type="generation",
            model="gpt-4"
        ) as generation:
            # Generate answer
            response = llm.generate(...)
            generation.update(output=response)
        ```
    """
    # Generation-like observations additionally carry the model-related fields.
    if as_type in get_observation_types_list(ObservationTypeGenerationLike):
        if trace_context:
            trace_id = trace_context.get("trace_id", None)
            parent_span_id = trace_context.get("parent_span_id", None)

            # Without a trace_id the trace context is not used and the
            # observation is created under the current span instead.
            if trace_id:
                remote_parent_span = self._create_remote_parent_span(
                    trace_id=trace_id, parent_span_id=parent_span_id
                )

                return cast(
                    Union[
                        _AgnosticContextManager[LangfuseGeneration],
                        _AgnosticContextManager[LangfuseEmbedding],
                    ],
                    self._create_span_with_parent_context(
                        as_type=as_type,
                        name=name,
                        remote_parent_span=remote_parent_span,
                        parent=None,
                        end_on_exit=end_on_exit,
                        input=input,
                        output=output,
                        metadata=metadata,
                        version=version,
                        level=level,
                        status_message=status_message,
                        completion_start_time=completion_start_time,
                        model=model,
                        model_parameters=model_parameters,
                        usage_details=usage_details,
                        cost_details=cost_details,
                        prompt=prompt,
                    ),
                )

        return cast(
            Union[
                _AgnosticContextManager[LangfuseGeneration],
                _AgnosticContextManager[LangfuseEmbedding],
            ],
            self._start_as_current_otel_span_with_processed_media(
                as_type=as_type,
                name=name,
                end_on_exit=end_on_exit,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
                completion_start_time=completion_start_time,
                model=model,
                model_parameters=model_parameters,
                usage_details=usage_details,
                cost_details=cost_details,
                prompt=prompt,
            ),
        )

    # Span-like observations: the model-related arguments are not forwarded.
    if as_type in get_observation_types_list(ObservationTypeSpanLike):
        if trace_context:
            trace_id = trace_context.get("trace_id", None)
            parent_span_id = trace_context.get("parent_span_id", None)

            if trace_id:
                remote_parent_span = self._create_remote_parent_span(
                    trace_id=trace_id, parent_span_id=parent_span_id
                )

                return cast(
                    Union[
                        _AgnosticContextManager[LangfuseSpan],
                        _AgnosticContextManager[LangfuseAgent],
                        _AgnosticContextManager[LangfuseTool],
                        _AgnosticContextManager[LangfuseChain],
                        _AgnosticContextManager[LangfuseRetriever],
                        _AgnosticContextManager[LangfuseEvaluator],
                        _AgnosticContextManager[LangfuseGuardrail],
                    ],
                    self._create_span_with_parent_context(
                        as_type=as_type,
                        name=name,
                        remote_parent_span=remote_parent_span,
                        parent=None,
                        end_on_exit=end_on_exit,
                        input=input,
                        output=output,
                        metadata=metadata,
                        version=version,
                        level=level,
                        status_message=status_message,
                    ),
                )

        return cast(
            Union[
                _AgnosticContextManager[LangfuseSpan],
                _AgnosticContextManager[LangfuseAgent],
                _AgnosticContextManager[LangfuseTool],
                _AgnosticContextManager[LangfuseChain],
                _AgnosticContextManager[LangfuseRetriever],
                _AgnosticContextManager[LangfuseEvaluator],
                _AgnosticContextManager[LangfuseGuardrail],
            ],
            self._start_as_current_otel_span_with_processed_media(
                as_type=as_type,
                name=name,
                end_on_exit=end_on_exit,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
            ),
        )

    # This should never be reached since all valid types are handled above.
    # Note that trace_context is not applied on this fallback path.
    langfuse_logger.warning(
        f"Unknown observation type: {as_type}, falling back to span"
    )
    return self._start_as_current_otel_span_with_processed_media(
        as_type="span",
        name=name,
        end_on_exit=end_on_exit,
        input=input,
        output=output,
        metadata=metadata,
        version=version,
        level=level,
        status_message=status_message,
    )

def _get_span_class(
    self,
    as_type: str,
) -> Union[
    Type[LangfuseAgent],
    Type[LangfuseTool],
    Type[LangfuseChain],
    Type[LangfuseRetriever],
    Type[LangfuseEvaluator],
    Type[LangfuseEmbedding],
    Type[LangfuseGuardrail],
    Type[LangfuseGeneration],
    Type[LangfuseEvent],
    Type[LangfuseSpan],
]:
    """Get the appropriate span class based on as_type.

    The lookup is case-insensitive. Any unrecognized type (and "span" itself)
    resolves to ``LangfuseSpan``.
    """
    span_class_by_type: Dict[
        str,
        Union[
            Type[LangfuseAgent],
            Type[LangfuseTool],
            Type[LangfuseChain],
            Type[LangfuseRetriever],
            Type[LangfuseEvaluator],
            Type[LangfuseEmbedding],
            Type[LangfuseGuardrail],
            Type[LangfuseGeneration],
            Type[LangfuseEvent],
            Type[LangfuseSpan],
        ],
    ] = {
        "agent": LangfuseAgent,
        "tool": LangfuseTool,
        "chain": LangfuseChain,
        "retriever": LangfuseRetriever,
        "evaluator": LangfuseEvaluator,
        "embedding": LangfuseEmbedding,
        "guardrail": LangfuseGuardrail,
        "generation": LangfuseGeneration,
        "event": LangfuseEvent,
        "span": LangfuseSpan,
    }

    return span_class_by_type.get(as_type.lower(), LangfuseSpan)

@staticmethod
def _get_observation_type_from_otel_span(otel_span: otel_trace_api.Span) -> str:
    """Read the Langfuse observation type stored on an OTel span.

    Returns "span" when the span is not recording, exposes no dict-like
    ``attributes``, has no observation-type attribute, or stores a
    non-string value there.
    """
    if not otel_span.is_recording():
        return "span"

    # `attributes` is accessed defensively: not every Span object is
    # guaranteed to expose it.
    attributes = getattr(otel_span, "attributes", None)
    if attributes is None or not hasattr(attributes, "get"):
        return "span"

    observation_type = attributes.get(
        LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
    )

    return observation_type if isinstance(observation_type, str) else "span"

@_agnosticcontextmanager
def _create_span_with_parent_context(
    self,
    *,
    name: str,
    parent: Optional[otel_trace_api.Span] = None,
    remote_parent_span: Optional[otel_trace_api.Span] = None,
    as_type: ObservationTypeLiteralNoEvent,
    end_on_exit: Optional[bool] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> Any:
    """Start an observation as current span underneath an explicit parent.

    ``parent`` takes precedence; ``remote_parent_span`` is used when ``parent``
    is falsy. When a ``remote_parent_span`` is given, the new span is flagged
    with the ``AS_ROOT`` attribute. All remaining arguments are forwarded to
    ``_start_as_current_otel_span_with_processed_media``.

    Yields:
        The Langfuse observation wrapper for the newly started span.
    """
    parent_span = parent or cast(otel_trace_api.Span, remote_parent_span)

    with otel_trace_api.use_span(parent_span):
        with self._start_as_current_otel_span_with_processed_media(
            name=name,
            as_type=as_type,
            end_on_exit=end_on_exit,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
            completion_start_time=completion_start_time,
            model=model,
            model_parameters=model_parameters,
            usage_details=usage_details,
            cost_details=cost_details,
            prompt=prompt,
        ) as langfuse_span:
            if remote_parent_span is not None:
                langfuse_span._otel_span.set_attribute(
                    LangfuseOtelSpanAttributes.AS_ROOT, True
                )

            yield langfuse_span

@_agnosticcontextmanager
def _start_as_current_otel_span_with_processed_media(
    self,
    *,
    name: str,
    as_type: Optional[ObservationTypeLiteralNoEvent] = None,
    end_on_exit: Optional[bool] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> Any:
    """Start an OTel span as current span and yield its Langfuse wrapper.

    ``end_on_exit`` defaults to True when not given. If the span is
    recording, a context produced by ``_set_langfuse_trace_id_in_baggage``
    is attached for the duration of the block and detached on exit. The
    wrapper class is chosen via ``_get_span_class``; the model-related
    arguments are only passed on for generation and embedding wrappers.

    Yields:
        The Langfuse observation wrapper around the started OTel span.
    """
    with self._otel_tracer.start_as_current_span(
        name=name,
        end_on_exit=end_on_exit if end_on_exit is not None else True,
    ) as otel_span:
        baggage_token = None

        if otel_span.is_recording():
            context_with_app_root_claim = _set_langfuse_trace_id_in_baggage(
                trace_id=self._get_otel_trace_id(otel_span),
                context=otel_context_api.get_current(),
            )
            baggage_token = otel_context_api.attach(context_with_app_root_claim)

        span_class = self._get_span_class(
            as_type or "generation"
        )  # default was "generation"

        try:
            common_args = {
                "otel_span": otel_span,
                "langfuse_client": self,
                "environment": self._environment,
                "release": self._release,
                "input": input,
                "output": output,
                "metadata": metadata,
                "version": version,
                "level": level,
                "status_message": status_message,
            }

            if span_class in (LangfuseGeneration, LangfuseEmbedding):
                common_args.update(
                    {
                        "completion_start_time": completion_start_time,
                        "model": model,
                        "model_parameters": model_parameters,
                        "usage_details": usage_details,
                        "cost_details": cost_details,
                        "prompt": prompt,
                    }
                )
            # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed

            yield span_class(**common_args)  # type: ignore[arg-type]

        finally:
            # Always undo the context attachment, even if the body raised.
            if baggage_token is not None:
                _detach_context_token_safely(baggage_token)

def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]:
    """Return the currently active OTel span, or None (with a warning) if there is none."""
    current_span = otel_trace_api.get_current_span()

    if current_span is otel_trace_api.INVALID_SPAN:
        langfuse_logger.warning(
            "Context error: No active span in current context. Operations that depend on an active span will be skipped. "
            "Ensure spans are created with start_as_current_observation() or that you're operating within an active span context."
        )
        return None

    return current_span

def update_current_generation(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> None:
    """Update the current active generation span with new information.

    This method updates the current generation span in the active context with
    additional information. It's useful for adding output, usage stats, or other
    details that become available during or after model generation.

    Nothing happens when tracing is disabled or when there is no active span.

    Args:
        name: The generation name
        input: Updated input data for the model
        output: Output from the model (e.g., completions)
        metadata: Additional metadata to associate with the generation
        version: Version identifier for the model or component
        level: Importance level of the generation (info, warning, error)
        status_message: Optional status message for the generation
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management

    Example:
        ```python
        with langfuse.start_as_current_observation(
            name="answer-query", as_type="generation"
        ) as generation:
            # Initial setup and API call
            response = llm.generate(...)

            # Update with results that weren't available at creation time
            langfuse.update_current_generation(
                output=response.text,
                usage_details={
                    "prompt_tokens": response.usage.prompt_tokens,
                    "completion_tokens": response.usage.completion_tokens
                }
            )
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
        )
        return

    current_otel_span = self._get_current_otel_span()

    if current_otel_span is not None:
        # NOTE(review): unlike update_current_span, no environment/release is
        # passed to the wrapper here - confirm this is intentional.
        generation = LangfuseGeneration(
            otel_span=current_otel_span, langfuse_client=self
        )

        if name:
            current_otel_span.update_name(name)

        generation.update(
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
            completion_start_time=completion_start_time,
            model=model,
            model_parameters=model_parameters,
            usage_details=usage_details,
            cost_details=cost_details,
            prompt=prompt,
        )

def update_current_span(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> None:
    """Update the current active span with new information.

    This method updates the current span in the active context with
    additional information. It's useful for adding outputs or metadata
    that become available during execution.

    Nothing happens when tracing is disabled or when there is no active span.

    Args:
        name: The span name
        input: Updated input data for the operation
        output: Output data from the operation
        metadata: Additional metadata to associate with the span
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-data") as span:
            # Initial processing
            result = process_first_part()

            # Update with intermediate results
            langfuse.update_current_span(metadata={"intermediate_result": result})

            # Continue processing
            final_result = process_second_part(result)

            # Final update
            langfuse.update_current_span(output=final_result)
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
        )
        return

    current_otel_span = self._get_current_otel_span()

    if current_otel_span is not None:
        # Re-wrap the span with the class matching its recorded observation type.
        span_class = self._get_span_class(
            self._get_observation_type_from_otel_span(current_otel_span)
        )
        span = span_class(
            otel_span=current_otel_span,
            langfuse_client=self,
            environment=self._environment,
            release=self._release,
        )

        if name:
            current_otel_span.update_name(name)

        span.update(
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
        )

@deprecated(
    "Trace-level input/output is deprecated. "
    "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
    "This method will be removed in a future major version."
)
def set_current_trace_io(
    self,
    *,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
) -> None:
    """Set trace-level input and output for the current span's trace.

    .. deprecated::
        This is a legacy method for backward compatibility with Langfuse platform
        features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
        evaluators). It will be removed in a future major version.

        For setting other trace attributes (user_id, session_id, metadata, tags, version),
        use :meth:`propagate_attributes` instead.

    Nothing happens when tracing is disabled, when there is no active span,
    or when the active span is not recording.

    Args:
        input: Input data to associate with the trace.
        output: Output data to associate with the trace.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
        )
        return

    current_otel_span = self._get_current_otel_span()

    if current_otel_span is not None and current_otel_span.is_recording():
        span_class = self._get_span_class(
            self._get_observation_type_from_otel_span(current_otel_span)
        )
        span = span_class(
            otel_span=current_otel_span,
            langfuse_client=self,
            environment=self._environment,
            release=self._release,
        )

        span.set_trace_io(
            input=input,
            output=output,
        )

def set_current_trace_as_public(self) -> None:
    """Make the current trace publicly accessible via its URL.

    When a trace is published, anyone with the trace link can view the full trace
    without needing to be logged in to Langfuse. This action cannot be undone
    programmatically - once published, the entire trace becomes public.

    This is a convenience method that publishes the trace from the currently
    active span context. Use this when you want to make a trace public from
    within a traced function without needing direct access to the span object.

    Nothing happens when tracing is disabled, when there is no active span,
    or when the active span is not recording.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
        )
        return

    current_otel_span = self._get_current_otel_span()

    if current_otel_span is not None and current_otel_span.is_recording():
        span_class = self._get_span_class(
            self._get_observation_type_from_otel_span(current_otel_span)
        )
        # NOTE(review): release is not passed here, unlike in
        # set_current_trace_io / update_current_span - confirm intent.
        span = span_class(
            otel_span=current_otel_span,
            langfuse_client=self,
            environment=self._environment,
        )

        span.set_trace_as_public()

def create_event(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> LangfuseEvent:
    """Create a new Langfuse observation of type 'EVENT'.

    The created Langfuse Event observation will be the child of the current span in the context.
    When ``trace_context`` carries a ``trace_id``, the event is instead attached
    to that trace (optionally under ``parent_span_id``).

    Args:
        trace_context: Optional context for connecting to an existing trace
        name: Name of the span (e.g., function or operation name)
        input: Input data for the operation (can be any JSON-serializable object)
        output: Output data from the operation (can be any JSON-serializable object)
        metadata: Additional metadata to associate with the span
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span

    Returns:
        The Langfuse Event object

    Example:
        ```python
        event = langfuse.create_event(name="process-event")
        ```
    """
    timestamp = time_ns()

    def build_ended_event(otel_span: otel_trace_api.Span) -> LangfuseEvent:
        # The same timestamp is used as start and end time of the event span.
        return cast(
            LangfuseEvent,
            LangfuseEvent(
                otel_span=otel_span,
                langfuse_client=self,
                environment=self._environment,
                release=self._release,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
            ).end(end_time=timestamp),
        )

    if trace_context:
        trace_id = trace_context.get("trace_id", None)
        parent_span_id = trace_context.get("parent_span_id", None)

        if trace_id:
            remote_parent_span = self._create_remote_parent_span(
                trace_id=trace_id, parent_span_id=parent_span_id
            )

            with otel_trace_api.use_span(
                cast(otel_trace_api.Span, remote_parent_span)
            ):
                otel_span = self._otel_tracer.start_span(
                    name=name, start_time=timestamp
                )
                otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                # Built inside the use_span block, as before.
                return build_ended_event(otel_span)

    return build_ended_event(
        self._otel_tracer.start_span(name=name, start_time=timestamp)
    )

def _create_remote_parent_span(
    self, *, trace_id: str, parent_span_id: Optional[str]
) -> Any:
    """Build a non-recording span that stands in for an external parent.

    Args:
        trace_id: Hex string of the trace to attach to.
        parent_span_id: Optional hex string of the parent span. When falsy,
            a random span ID is generated.

    Returns:
        A ``NonRecordingSpan`` whose span context carries the given IDs and
        has the sampled trace flag set.

    Raises:
        ValueError: If ``trace_id`` or a provided ``parent_span_id`` cannot be
            parsed as a base-16 integer.
    """
    # NOTE(review): the warnings below say the ID is being ignored, but the
    # code continues to use it. IDs that fail validation yet still parse as
    # hex (e.g. uppercase or wrong length) are used as-is; non-hex input
    # raises ValueError from int().
    if not self._is_valid_trace_id(trace_id):
        langfuse_logger.warning(
            f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID."
        )

    if parent_span_id and not self._is_valid_span_id(parent_span_id):
        langfuse_logger.warning(
            f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID."
        )

    int_trace_id = int(trace_id, 16)
    int_parent_span_id = (
        int(parent_span_id, 16)
        if parent_span_id
        else RandomIdGenerator().generate_span_id()
    )

    span_context = otel_trace_api.SpanContext(
        trace_id=int_trace_id,
        span_id=int_parent_span_id,
        trace_flags=otel_trace_api.TraceFlags(0x01),  # mark span as sampled
        is_remote=False,
    )

    return otel_trace_api.NonRecordingSpan(span_context)

def _is_valid_trace_id(self, trace_id: str) -> bool:
    """Return True if ``trace_id`` is exactly 32 lowercase hex characters."""
    # fullmatch, not match with "$": "$" also matches just before a trailing
    # newline, which would let "<32 hex chars>\n" through.
    return re.fullmatch(r"[0-9a-f]{32}", trace_id) is not None

def _is_valid_span_id(self, span_id: str) -> bool:
    """Return True if ``span_id`` is exactly 16 lowercase hex characters."""
    # See _is_valid_trace_id for why fullmatch is used.
    return re.fullmatch(r"[0-9a-f]{16}", span_id) is not None

def _create_observation_id(self, *, seed: Optional[str] = None) -> str:
    """Create a unique observation ID for use with Langfuse.

    This method generates a unique observation ID (span ID in OpenTelemetry terms)
    for use with various Langfuse APIs. It can either generate a random ID or
    create a deterministic ID based on a seed string.

    Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes.
    This method ensures the generated ID meets this requirement. If you need to
    correlate an external ID with a Langfuse observation ID, use the external ID as
    the seed to get a valid, deterministic observation ID.

    Args:
        seed: Optional string to use as a seed for deterministic ID generation.
            If provided, the same seed will always produce the same ID.
            If not provided (or empty), a random ID will be generated.

    Returns:
        A 16-character lowercase hexadecimal string representing the observation ID.

    Example:
        ```python
        # Generate a random observation ID
        obs_id = langfuse._create_observation_id()

        # Generate a deterministic ID based on a seed
        user_obs_id = langfuse._create_observation_id(seed="user-123-feedback")

        # Correlate an external item ID with a Langfuse observation ID
        item_id = "item-789012"
        correlated_obs_id = langfuse._create_observation_id(seed=item_id)

        # Use the ID with Langfuse APIs
        langfuse.create_score(
            name="relevance",
            value=0.95,
            trace_id=trace_id,
            observation_id=obs_id
        )
        ```
    """
    if not seed:
        span_id_int = RandomIdGenerator().generate_span_id()

        return self._format_otel_span_id(span_id_int)

    # Deterministic: first 8 bytes of the seed's SHA-256 digest.
    return sha256(seed.encode("utf-8")).digest()[:8].hex()

@staticmethod
def create_trace_id(*, seed: Optional[str] = None) -> str:
    """Create a unique trace ID for use with Langfuse.

    This method generates a unique trace ID for use with various Langfuse APIs.
    It can either generate a random ID or create a deterministic ID based on
    a seed string.

    Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
    This method ensures the generated ID meets this requirement. If you need to
    correlate an external ID with a Langfuse trace ID, use the external ID as the
    seed to get a valid, deterministic Langfuse trace ID.

    Args:
        seed: Optional string to use as a seed for deterministic ID generation.
            If provided, the same seed will always produce the same ID.
            If not provided (or empty), a random ID will be generated.

    Returns:
        A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

    Example:
        ```python
        # Generate a random trace ID
        trace_id = langfuse.create_trace_id()

        # Generate a deterministic ID based on a seed
        session_trace_id = langfuse.create_trace_id(seed="session-456")

        # Correlate an external ID with a Langfuse trace ID
        external_id = "external-system-123456"
        correlated_trace_id = langfuse.create_trace_id(seed=external_id)

        # Use the ID with trace context
        with langfuse.start_as_current_observation(
            name="process-request",
            trace_context={"trace_id": trace_id}
        ) as span:
            # Operation will be part of the specific trace
            pass
        ```
    """
    if not seed:
        trace_id_int = RandomIdGenerator().generate_trace_id()

        return Langfuse._format_otel_trace_id(trace_id_int)

    # Deterministic: first 16 bytes of the seed's SHA-256 digest.
    return sha256(seed.encode("utf-8")).digest()[:16].hex()

def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str:
    """Return the trace ID of ``otel_span`` as a 32-character hex string."""
    span_context = otel_span.get_span_context()

    return self._format_otel_trace_id(span_context.trace_id)

def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str:
    """Return the span ID of ``otel_span`` as a 16-character hex string."""
    span_context = otel_span.get_span_context()

    return self._format_otel_span_id(span_context.span_id)

@staticmethod
def _format_otel_span_id(span_id_int: int) -> str:
    """Format an integer span ID to a 16-character lowercase hex string.

    Internal method to convert an OpenTelemetry integer span ID to the standard
    W3C Trace Context format (16-character lowercase hex string).

    Args:
        span_id_int: 64-bit integer representing a span ID

    Returns:
        A 16-character lowercase hexadecimal string
    """
    return format(span_id_int, "016x")

@staticmethod
def _format_otel_trace_id(trace_id_int: int) -> str:
    """Format an integer trace ID to a 32-character lowercase hex string.

    Internal method to convert an OpenTelemetry integer trace ID to the standard
    W3C Trace Context format (32-character lowercase hex string).

    Args:
        trace_id_int: 128-bit integer representing a trace ID

    Returns:
        A 32-character lowercase hexadecimal string
    """
    return format(trace_id_int, "032x")

# Typing overload: numeric values pair with the NUMERIC / BOOLEAN data types.
@overload
def create_score(
    self,
    *,
    name: str,
    value: float,
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    score_id: Optional[str] = None,
    data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
    environment: Optional[str] = None,
) -> None: ...

# Typing overload: string values pair with CATEGORICAL / TEXT / CORRECTION.
@overload
def create_score(
    self,
    *,
    name: str,
    value: str,
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    score_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    data_type: Optional[
        Literal["CATEGORICAL", "TEXT", "CORRECTION"]
    ] = "CATEGORICAL",
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
    environment: Optional[str] = None,
) -> None: ...
def create_score(
    self,
    *,
    name: str,
    value: Union[float, str],
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
    environment: Optional[str] = None,
) -> None:
    """Create a score for a specific trace or observation.

    This method creates a score for evaluating a Langfuse trace or observation. Scores can be
    used to track quality metrics, user feedback, or automated evaluations.

    The score is wrapped in a ``score-create`` event and handed to the client's
    resource manager; nothing is returned. If tracing is disabled the call is a
    no-op. Any exception raised while building or enqueueing the event is logged
    and not re-raised.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        session_id: ID of the Langfuse session to associate the score with
        dataset_run_id: ID of the Langfuse dataset run to associate the score with
        trace_id: ID of the Langfuse trace to associate the score with
        observation_id: Optional ID of the specific observation to score. Trace ID must be provided too;
            a warning is logged when it is missing.
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score
        timestamp: Optional timestamp for the score (defaults to the value of `_get_timestamp()`)
        environment: Optional environment override for this score. If omitted,
            the score uses the client-level environment from
            `Langfuse(environment=...)` or `LANGFUSE_TRACING_ENVIRONMENT`.
            Langfuse observation wrapper methods pass their resolved span
            environment here so scores created via `span.score()` or
            `span.score_trace()` stay grouped with the scored observation or
            trace, including request-scoped environments propagated with
            `propagate_attributes(environment=...)`.

    Example:
        ```python
        # Create a numeric score for accuracy
        langfuse.create_score(
            name="accuracy",
            value=0.92,
            trace_id="abcdef1234567890abcdef1234567890",
            data_type="NUMERIC",
            comment="High accuracy with minor irrelevant details"
        )

        # Create a categorical score for sentiment
        langfuse.create_score(
            name="sentiment",
            value="positive",
            trace_id="abcdef1234567890abcdef1234567890",
            observation_id="abcdef1234567890",
            data_type="CATEGORICAL"
        )
        ```
    """
    if not self._tracing_enabled:
        return

    # The documented contract requires a trace ID alongside an observation ID.
    # Surface the violation instead of silently enqueueing; the score is still
    # sent so existing best-effort behaviour is preserved.
    if observation_id and not trace_id:
        langfuse_logger.warning(
            f"Score: observation_id={observation_id} was provided without a trace_id for score name='{name}'. "
            "Observation-level scores are documented to require a trace_id."
        )

    score_id = score_id or self._create_observation_id()

    try:
        new_body = ScoreBody(
            id=score_id,
            sessionId=session_id,
            datasetRunId=dataset_run_id,
            traceId=trace_id,
            observationId=observation_id,
            name=name,
            value=value,
            dataType=data_type,  # type: ignore
            comment=comment,
            configId=config_id,
            # An explicit override wins; otherwise use the client-level environment.
            environment=environment or self._environment,
            metadata=metadata,
        )

        event = {
            "id": self.create_trace_id(),
            "type": "score-create",
            "timestamp": timestamp or _get_timestamp(),
            "body": new_body,
        }

        if self._resources is not None:
            # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
            # (scores without any trace ID are always forced into the sample).
            force_sample = (
                not self._is_valid_trace_id(trace_id) if trace_id else True
            )

            self._resources.add_score_task(
                event,
                force_sample=force_sample,
            )

    except Exception as e:
        langfuse_logger.exception(
            f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
        )


def _create_trace_tags_via_ingestion(
    self,
    *,
    trace_id: str,
    tags: List[str],
) -> None:
    """Private helper to enqueue trace tag updates via ingestion API events.

    Builds a ``trace-create`` event whose body carries only the trace ID and
    the tags, and passes it to the resource manager. The call is a no-op when
    tracing is disabled or when there are no tags to send. Exceptions raised
    while building or enqueueing the event are logged and not re-raised.

    Args:
        trace_id: ID of the trace whose tags should be updated.
        tags: Tags to attach to the trace.
    """
    if not self._tracing_enabled:
        return

    # Truthiness check covers both an empty list and an accidental None.
    if not tags:
        return

    try:
        new_body = TraceBody(
            id=trace_id,
            tags=tags,
        )

        event = {
            "id": self.create_trace_id(),
            "type": "trace-create",
            "timestamp": _get_timestamp(),
            "body": new_body,
        }

        if self._resources is not None:
            self._resources.add_trace_task(event)
    except Exception as e:
        langfuse_logger.exception(
            f"Error updating trace tags: Failed to process trace update event for trace_id={trace_id}. Error: {e}"
        )


# Typing-only overload: numeric values pair with NUMERIC/BOOLEAN data types.
@overload
def score_current_span(
    self,
    *,
    name: str,
    value: float,
    score_id: Optional[str] = None,
    data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...


# Typing-only overload: string values pair with CATEGORICAL/TEXT/CORRECTION data types.
@overload
def score_current_span(
    self,
    *,
    name: str,
    value: str,
    score_id: Optional[str] = None,
    data_type: Optional[
        Literal["CATEGORICAL", "TEXT", "CORRECTION"]
    ] = "CATEGORICAL",
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...
def score_current_span(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Create a score for the current active span.

    Looks up the span that is active in the current context, derives its trace
    and observation IDs, and forwards everything to `create_score`. This avoids
    having to know the IDs of the operation being scored. When no active span
    is returned, the method does nothing.

    If the active span has a `langfuse.environment` attribute, including one
    set by `propagate_attributes(environment=...)`, the score uses that
    environment. Otherwise it uses the client-level environment.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_generation(name="answer-query") as generation:
            # Generate answer
            response = generate_answer(...)
            generation.update(output=response)

            # Score the generation
            langfuse.score_current_span(
                name="relevance",
                value=0.85,
                data_type="NUMERIC",
                comment="Mostly relevant but contains some tangential information",
                metadata={"model": "gpt-4", "prompt_version": "v2"}
            )
        ```
    """
    active_span = self._get_current_otel_span()

    # Guard clause: without an active span there is nothing to score.
    if active_span is None:
        return

    trace_id = self._get_otel_trace_id(active_span)
    observation_id = self._get_otel_span_id(active_span)

    langfuse_logger.info(
        f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
    )

    # Environment recorded on the span, if any; create_score falls back to the
    # client-level environment when this is empty.
    span_environment = get_string_span_attribute(
        active_span, LangfuseOtelSpanAttributes.ENVIRONMENT
    )

    # The casts only steer the type checker towards one create_score overload;
    # they do not convert the values at runtime.
    self.create_score(
        trace_id=trace_id,
        observation_id=observation_id,
        name=name,
        value=cast(str, value),
        score_id=score_id,
        data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
        comment=comment,
        config_id=config_id,
        metadata=metadata,
        environment=span_environment,
    )


# Typing-only overload: numeric values pair with NUMERIC/BOOLEAN data types.
@overload
def score_current_trace(
    self,
    *,
    name: str,
    value: float,
    score_id: Optional[str] = None,
    data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...


# Typing-only overload: string values pair with CATEGORICAL/TEXT/CORRECTION data types.
@overload
def score_current_trace(
    self,
    *,
    name: str,
    value: str,
    score_id: Optional[str] = None,
    data_type: Optional[
        Literal["CATEGORICAL", "TEXT", "CORRECTION"]
    ] = "CATEGORICAL",
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...
2154 2155 def score_current_trace( 2156 self, 2157 *, 2158 name: str, 2159 value: Union[float, str], 2160 score_id: Optional[str] = None, 2161 data_type: Optional[ScoreDataType] = None, 2162 comment: Optional[str] = None, 2163 config_id: Optional[str] = None, 2164 metadata: Optional[Any] = None, 2165 ) -> None: 2166 """Create a score for the current trace. 2167 2168 This method scores the trace of the currently active span. Unlike score_current_span, 2169 this method associates the score with the entire trace rather than a specific span. 2170 It's useful for scoring overall performance or quality of the entire operation. 2171 If the active span has a `langfuse.environment` attribute, including one 2172 set by `propagate_attributes(environment=...)`, the score uses that 2173 environment. Otherwise it uses the client-level environment. 2174 2175 Args: 2176 name: Name of the score (e.g., "user_satisfaction", "overall_quality") 2177 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION) 2178 score_id: Optional custom ID for the score (auto-generated if not provided) 2179 data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION) 2180 comment: Optional comment or explanation for the score 2181 config_id: Optional ID of a score config defined in Langfuse 2182 metadata: Optional metadata to be attached to the score 2183 2184 Example: 2185 ```python 2186 with langfuse.start_as_current_observation(name="process-user-request") as span: 2187 # Process request 2188 result = process_complete_request() 2189 span.update(output=result) 2190 2191 # Score the overall trace 2192 langfuse.score_current_trace( 2193 name="overall_quality", 2194 value=0.95, 2195 data_type="NUMERIC", 2196 comment="High quality end-to-end response", 2197 metadata={"evaluator": "gpt-4", "criteria": "comprehensive"} 2198 ) 2199 ``` 2200 """ 2201 current_span = self._get_current_otel_span() 2202 2203 if current_span is not None: 2204 trace_id 
= self._get_otel_trace_id(current_span) 2205 2206 langfuse_logger.info( 2207 f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}" 2208 ) 2209 2210 self.create_score( 2211 trace_id=trace_id, 2212 name=name, 2213 value=cast(str, value), 2214 score_id=score_id, 2215 data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type), 2216 comment=comment, 2217 config_id=config_id, 2218 metadata=metadata, 2219 environment=get_string_span_attribute( 2220 current_span, LangfuseOtelSpanAttributes.ENVIRONMENT 2221 ), 2222 ) 2223 2224 def flush(self) -> None: 2225 """Force flush all pending spans and events to the Langfuse API. 2226 2227 This method manually flushes any pending spans, scores, and other events to the 2228 Langfuse API. It's useful in scenarios where you want to ensure all data is sent 2229 before proceeding, without waiting for the automatic flush interval. 2230 2231 Example: 2232 ```python 2233 # Record some spans and scores 2234 with langfuse.start_as_current_observation(name="operation") as span: 2235 # Do work... 2236 pass 2237 2238 # Ensure all data is sent to Langfuse before proceeding 2239 langfuse.flush() 2240 2241 # Continue with other work 2242 ``` 2243 """ 2244 if self._resources is not None: 2245 self._resources.flush() 2246 2247 def shutdown(self) -> None: 2248 """Shut down the Langfuse client and flush all pending data. 2249 2250 This method cleanly shuts down the Langfuse client, ensuring all pending data 2251 is flushed to the API and all background threads are properly terminated. 2252 2253 It's important to call this method when your application is shutting down to 2254 prevent data loss and resource leaks. For most applications, using the client 2255 as a context manager or relying on the automatic shutdown via atexit is sufficient. 
2256 2257 Example: 2258 ```python 2259 # Initialize Langfuse 2260 langfuse = Langfuse(public_key="...", secret_key="...") 2261 2262 # Use Langfuse throughout your application 2263 # ... 2264 2265 # When application is shutting down 2266 langfuse.shutdown() 2267 ``` 2268 """ 2269 if self._resources is not None: 2270 self._resources.shutdown() 2271 2272 def get_current_trace_id(self) -> Optional[str]: 2273 """Get the trace ID of the current active span. 2274 2275 This method retrieves the trace ID from the currently active span in the context. 2276 It can be used to get the trace ID for referencing in logs, external systems, 2277 or for creating related operations. 2278 2279 Returns: 2280 The current trace ID as a 32-character lowercase hexadecimal string, 2281 or None if there is no active span. 2282 2283 Example: 2284 ```python 2285 with langfuse.start_as_current_observation(name="process-request") as span: 2286 # Get the current trace ID for reference 2287 trace_id = langfuse.get_current_trace_id() 2288 2289 # Use it for external correlation 2290 log.info(f"Processing request with trace_id: {trace_id}") 2291 2292 # Or pass to another system 2293 external_system.process(data, trace_id=trace_id) 2294 ``` 2295 """ 2296 if not self._tracing_enabled: 2297 langfuse_logger.debug( 2298 "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode." 2299 ) 2300 return None 2301 2302 current_otel_span = self._get_current_otel_span() 2303 2304 return self._get_otel_trace_id(current_otel_span) if current_otel_span else None 2305 2306 def get_current_observation_id(self) -> Optional[str]: 2307 """Get the observation ID (span ID) of the current active span. 2308 2309 This method retrieves the observation ID from the currently active span in the context. 2310 It can be used to get the observation ID for referencing in logs, external systems, 2311 or for creating scores or other related operations. 
2312 2313 Returns: 2314 The current observation ID as a 16-character lowercase hexadecimal string, 2315 or None if there is no active span. 2316 2317 Example: 2318 ```python 2319 with langfuse.start_as_current_observation(name="process-user-query") as span: 2320 # Get the current observation ID 2321 observation_id = langfuse.get_current_observation_id() 2322 2323 # Store it for later reference 2324 cache.set(f"query_{query_id}_observation", observation_id) 2325 2326 # Process the query... 2327 ``` 2328 """ 2329 if not self._tracing_enabled: 2330 langfuse_logger.debug( 2331 "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode." 2332 ) 2333 return None 2334 2335 current_otel_span = self._get_current_otel_span() 2336 2337 return self._get_otel_span_id(current_otel_span) if current_otel_span else None 2338 2339 def _get_project_id(self) -> Optional[str]: 2340 """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for api keys.""" 2341 if not self._project_id: 2342 proj = self.api.projects.get() 2343 if not proj.data or not proj.data[0].id: 2344 return None 2345 2346 self._project_id = proj.data[0].id 2347 2348 return self._project_id 2349 2350 def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]: 2351 """Get the URL to view a trace in the Langfuse UI. 2352 2353 This method generates a URL that links directly to a trace in the Langfuse UI. 2354 It's useful for providing links in logs, notifications, or debugging tools. 2355 2356 Args: 2357 trace_id: Optional trace ID to generate a URL for. If not provided, 2358 the trace ID of the current active span will be used. 2359 2360 Returns: 2361 A URL string pointing to the trace in the Langfuse UI, 2362 or None if the project ID couldn't be retrieved or no trace ID is available. 
2363 2364 Example: 2365 ```python 2366 # Get URL for the current trace 2367 with langfuse.start_as_current_observation(name="process-request") as span: 2368 trace_url = langfuse.get_trace_url() 2369 log.info(f"Processing trace: {trace_url}") 2370 2371 # Get URL for a specific trace 2372 specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef") 2373 send_notification(f"Review needed for trace: {specific_trace_url}") 2374 ``` 2375 """ 2376 final_trace_id = trace_id or self.get_current_trace_id() 2377 if not final_trace_id: 2378 return None 2379 2380 project_id = self._get_project_id() 2381 2382 return ( 2383 f"{self._base_url}/project/{project_id}/traces/{final_trace_id}" 2384 if project_id and final_trace_id 2385 else None 2386 ) 2387 2388 def get_dataset( 2389 self, 2390 name: str, 2391 *, 2392 fetch_items_page_size: Optional[int] = 50, 2393 version: Optional[datetime] = None, 2394 ) -> "DatasetClient": 2395 """Fetch a dataset by its name. 2396 2397 Args: 2398 name: The name of the dataset to fetch. 2399 fetch_items_page_size: All items of the dataset will be fetched in chunks of this size. Defaults to 50. 2400 version: Retrieve dataset items as they existed at this specific point in time (UTC). 2401 If provided, returns the state of items at the specified UTC timestamp. 2402 If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC. 2403 2404 Returns: 2405 DatasetClient: The dataset with the given name. 
2406 """ 2407 try: 2408 langfuse_logger.debug(f"Getting datasets {name}") 2409 dataset = self.api.datasets.get(dataset_name=self._url_encode(name)) 2410 2411 dataset_items: List[DatasetItem] = [] 2412 page = 1 2413 2414 while True: 2415 new_items = self.api.dataset_items.list( 2416 dataset_name=self._url_encode(name, is_url_param=True), 2417 page=page, 2418 limit=fetch_items_page_size, 2419 version=version, 2420 ) 2421 dataset_items.extend( 2422 self._hydrate_dataset_item_media_references(item) 2423 for item in new_items.data 2424 ) 2425 2426 if new_items.meta.total_pages <= page: 2427 break 2428 2429 page += 1 2430 2431 return DatasetClient( 2432 dataset=dataset, 2433 items=dataset_items, 2434 version=version, 2435 langfuse_client=self, 2436 ) 2437 2438 except Error as e: 2439 handle_fern_exception(e) 2440 raise e 2441 2442 def get_dataset_run( 2443 self, *, dataset_name: str, run_name: str 2444 ) -> DatasetRunWithItems: 2445 """Fetch a dataset run by dataset name and run name. 2446 2447 Args: 2448 dataset_name (str): The name of the dataset. 2449 run_name (str): The name of the run. 2450 2451 Returns: 2452 DatasetRunWithItems: The dataset run with its items. 2453 """ 2454 try: 2455 return cast( 2456 DatasetRunWithItems, 2457 self.api.datasets.get_run( 2458 dataset_name=self._url_encode(dataset_name), 2459 run_name=self._url_encode(run_name), 2460 request_options=None, 2461 ), 2462 ) 2463 except Error as e: 2464 handle_fern_exception(e) 2465 raise e 2466 2467 def get_dataset_runs( 2468 self, 2469 *, 2470 dataset_name: str, 2471 page: Optional[int] = None, 2472 limit: Optional[int] = None, 2473 ) -> PaginatedDatasetRuns: 2474 """Fetch all runs for a dataset. 2475 2476 Args: 2477 dataset_name (str): The name of the dataset. 2478 page (Optional[int]): Page number, starts at 1. 2479 limit (Optional[int]): Limit of items per page. 2480 2481 Returns: 2482 PaginatedDatasetRuns: Paginated list of dataset runs. 
2483 """ 2484 try: 2485 return cast( 2486 PaginatedDatasetRuns, 2487 self.api.datasets.get_runs( 2488 dataset_name=self._url_encode(dataset_name), 2489 page=page, 2490 limit=limit, 2491 request_options=None, 2492 ), 2493 ) 2494 except Error as e: 2495 handle_fern_exception(e) 2496 raise e 2497 2498 def delete_dataset_run( 2499 self, *, dataset_name: str, run_name: str 2500 ) -> DeleteDatasetRunResponse: 2501 """Delete a dataset run and all its run items. This action is irreversible. 2502 2503 Args: 2504 dataset_name (str): The name of the dataset. 2505 run_name (str): The name of the run. 2506 2507 Returns: 2508 DeleteDatasetRunResponse: Confirmation of deletion. 2509 """ 2510 try: 2511 return cast( 2512 DeleteDatasetRunResponse, 2513 self.api.datasets.delete_run( 2514 dataset_name=self._url_encode(dataset_name), 2515 run_name=self._url_encode(run_name), 2516 request_options=None, 2517 ), 2518 ) 2519 except Error as e: 2520 handle_fern_exception(e) 2521 raise e 2522 2523 def run_experiment( 2524 self, 2525 *, 2526 name: str, 2527 run_name: Optional[str] = None, 2528 description: Optional[str] = None, 2529 data: ExperimentData, 2530 task: TaskFunction, 2531 evaluators: List[EvaluatorFunction] = [], 2532 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 2533 run_evaluators: List[RunEvaluatorFunction] = [], 2534 max_concurrency: int = 50, 2535 metadata: Optional[Dict[str, str]] = None, 2536 _dataset_version: Optional[datetime] = None, 2537 ) -> ExperimentResult: 2538 """Run an experiment on a dataset with automatic tracing and evaluation. 2539 2540 This method executes a task function on each item in the provided dataset, 2541 automatically traces all executions with Langfuse for observability, runs 2542 item-level and run-level evaluators on the outputs, and returns comprehensive 2543 results with evaluation metrics. 
2544 2545 The experiment system provides: 2546 - Automatic tracing of all task executions 2547 - Concurrent processing with configurable limits 2548 - Comprehensive error handling that isolates failures 2549 - Integration with Langfuse datasets for experiment tracking 2550 - Flexible evaluation framework supporting both sync and async evaluators 2551 2552 Args: 2553 name: Human-readable name for the experiment. Used for identification 2554 in the Langfuse UI. 2555 run_name: Optional exact name for the experiment run. If provided, this will be 2556 used as the exact dataset run name if the `data` contains Langfuse dataset items. 2557 If not provided, this will default to the experiment name appended with an ISO timestamp. 2558 description: Optional description explaining the experiment's purpose, 2559 methodology, or expected outcomes. 2560 data: Array of data items to process. Can be either: 2561 - List of dict-like items with 'input', 'expected_output', 'metadata' keys 2562 - List of Langfuse DatasetItem objects from dataset.items 2563 task: Function that processes each data item and returns output. 2564 Must accept 'item' as keyword argument and can return sync or async results. 2565 The task function signature should be: task(*, item, **kwargs) -> Any 2566 evaluators: List of functions to evaluate each item's output individually. 2567 Each evaluator receives input, output, expected_output, and metadata. 2568 Can return single Evaluation dict or list of Evaluation dicts. 2569 composite_evaluator: Optional function that creates composite scores from item-level evaluations. 2570 Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) 2571 plus the list of evaluations from item-level evaluators. Useful for weighted averages, 2572 pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics. 2573 run_evaluators: List of functions to evaluate the entire experiment run. 
2574 Each run evaluator receives all item_results and can compute aggregate metrics. 2575 Useful for calculating averages, distributions, or cross-item comparisons. 2576 max_concurrency: Maximum number of concurrent task executions (default: 50). 2577 Controls the number of items processed simultaneously. Adjust based on 2578 API rate limits and system resources. 2579 metadata: Optional metadata dictionary to attach to all experiment traces. 2580 This metadata will be included in every trace created during the experiment. 2581 If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too. 2582 2583 Returns: 2584 ExperimentResult containing: 2585 - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset. 2586 - item_results: List of results for each processed item with outputs and evaluations 2587 - run_evaluations: List of aggregate evaluation results for the entire run 2588 - experiment_id: Stable identifier for the experiment run across all items 2589 - dataset_run_id: ID of the dataset run (if using Langfuse datasets) 2590 - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable) 2591 2592 Raises: 2593 ValueError: If required parameters are missing or invalid 2594 Exception: If experiment setup fails (individual item failures are handled gracefully) 2595 2596 Examples: 2597 Basic experiment with local data: 2598 ```python 2599 def summarize_text(*, item, **kwargs): 2600 return f"Summary: {item['input'][:50]}..." 
2601 2602 def length_evaluator(*, input, output, expected_output=None, **kwargs): 2603 return { 2604 "name": "output_length", 2605 "value": len(output), 2606 "comment": f"Output contains {len(output)} characters" 2607 } 2608 2609 result = langfuse.run_experiment( 2610 name="Text Summarization Test", 2611 description="Evaluate summarization quality and length", 2612 data=[ 2613 {"input": "Long article text...", "expected_output": "Expected summary"}, 2614 {"input": "Another article...", "expected_output": "Another summary"} 2615 ], 2616 task=summarize_text, 2617 evaluators=[length_evaluator] 2618 ) 2619 2620 print(f"Processed {len(result.item_results)} items") 2621 for item_result in result.item_results: 2622 print(f"Input: {item_result.item['input']}") 2623 print(f"Output: {item_result.output}") 2624 print(f"Evaluations: {item_result.evaluations}") 2625 ``` 2626 2627 Advanced experiment with async task and multiple evaluators: 2628 ```python 2629 async def llm_task(*, item, **kwargs): 2630 # Simulate async LLM call 2631 response = await openai_client.chat.completions.create( 2632 model="gpt-4", 2633 messages=[{"role": "user", "content": item["input"]}] 2634 ) 2635 return response.choices[0].message.content 2636 2637 def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): 2638 if expected_output and expected_output.lower() in output.lower(): 2639 return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} 2640 return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} 2641 2642 def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): 2643 # Simulate toxicity check 2644 toxicity_score = check_toxicity(output) # Your toxicity checker 2645 return { 2646 "name": "toxicity", 2647 "value": toxicity_score, 2648 "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" 2649 } 2650 2651 def average_accuracy(*, item_results, **kwargs): 2652 accuracies = [ 2653 eval.value for result in item_results 
2654 for eval in result.evaluations 2655 if eval.name == "accuracy" 2656 ] 2657 return { 2658 "name": "average_accuracy", 2659 "value": sum(accuracies) / len(accuracies) if accuracies else 0, 2660 "comment": f"Average accuracy across {len(accuracies)} items" 2661 } 2662 2663 result = langfuse.run_experiment( 2664 name="LLM Safety and Accuracy Test", 2665 description="Evaluate model accuracy and safety across diverse prompts", 2666 data=test_dataset, # Your dataset items 2667 task=llm_task, 2668 evaluators=[accuracy_evaluator, toxicity_evaluator], 2669 run_evaluators=[average_accuracy], 2670 max_concurrency=5, # Limit concurrent API calls 2671 metadata={"model": "gpt-4", "temperature": 0.7} 2672 ) 2673 ``` 2674 2675 Using with Langfuse datasets: 2676 ```python 2677 # Get dataset from Langfuse 2678 dataset = langfuse.get_dataset("my-eval-dataset") 2679 2680 result = dataset.run_experiment( 2681 name="Production Model Evaluation", 2682 description="Monthly evaluation of production model performance", 2683 task=my_production_task, 2684 evaluators=[accuracy_evaluator, latency_evaluator] 2685 ) 2686 2687 # Results automatically linked to dataset in Langfuse UI 2688 print(f"View results: {result['dataset_run_url']}") 2689 ``` 2690 2691 Note: 2692 - Task and evaluator functions can be either synchronous or asynchronous 2693 - Individual item failures are logged but don't stop the experiment 2694 - All executions are automatically traced and visible in Langfuse UI 2695 - When using Langfuse datasets, results are automatically linked for easy comparison 2696 - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) 
2697 - Async execution is handled automatically with smart event loop detection 2698 """ 2699 return cast( 2700 ExperimentResult, 2701 run_async_safely( 2702 self._run_experiment_async( 2703 name=name, 2704 run_name=self._create_experiment_run_name( 2705 name=name, run_name=run_name 2706 ), 2707 description=description, 2708 data=data, 2709 task=task, 2710 evaluators=evaluators or [], 2711 composite_evaluator=composite_evaluator, 2712 run_evaluators=run_evaluators or [], 2713 max_concurrency=max_concurrency, 2714 metadata=metadata, 2715 dataset_version=_dataset_version, 2716 ), 2717 ), 2718 ) 2719 2720 async def _run_experiment_async( 2721 self, 2722 *, 2723 name: str, 2724 run_name: str, 2725 description: Optional[str], 2726 data: ExperimentData, 2727 task: TaskFunction, 2728 evaluators: List[EvaluatorFunction], 2729 composite_evaluator: Optional[CompositeEvaluatorFunction], 2730 run_evaluators: List[RunEvaluatorFunction], 2731 max_concurrency: int, 2732 metadata: Optional[Dict[str, Any]] = None, 2733 dataset_version: Optional[datetime] = None, 2734 ) -> ExperimentResult: 2735 langfuse_logger.debug( 2736 f"Starting experiment '{name}' run '{run_name}' with {len(data)} items" 2737 ) 2738 2739 shared_fallback_experiment_id = self._create_observation_id() 2740 2741 # Set up concurrency control 2742 semaphore = asyncio.Semaphore(max_concurrency) 2743 2744 # Process all items 2745 async def process_item(item: ExperimentItem) -> ExperimentItemResult: 2746 async with semaphore: 2747 return await self._process_experiment_item( 2748 item, 2749 task, 2750 evaluators, 2751 composite_evaluator, 2752 shared_fallback_experiment_id, 2753 name, 2754 run_name, 2755 description, 2756 metadata, 2757 dataset_version, 2758 ) 2759 2760 # Run all items concurrently 2761 tasks = [process_item(item) for item in data] 2762 item_results = await asyncio.gather(*tasks, return_exceptions=True) 2763 2764 # Filter out any exceptions and log errors 2765 valid_results: List[ExperimentItemResult] = 
[] 2766 for i, result in enumerate(item_results): 2767 if isinstance(result, Exception): 2768 langfuse_logger.error(f"Item {i} failed: {result}") 2769 elif isinstance(result, ExperimentItemResult): 2770 valid_results.append(result) # type: ignore 2771 2772 # Run experiment-level evaluators 2773 run_evaluations: List[Evaluation] = [] 2774 for run_evaluator in run_evaluators: 2775 try: 2776 evaluations = await _run_evaluator( 2777 run_evaluator, item_results=valid_results 2778 ) 2779 run_evaluations.extend(evaluations) 2780 except Exception as e: 2781 langfuse_logger.error(f"Run evaluator failed: {e}") 2782 2783 # Generate dataset run URL if applicable 2784 dataset_run_id = next( 2785 ( 2786 result.dataset_run_id 2787 for result in valid_results 2788 if result.dataset_run_id 2789 ), 2790 None, 2791 ) 2792 dataset_run_url = None 2793 if dataset_run_id and data: 2794 try: 2795 # Check if the first item has dataset_id (for DatasetItem objects) 2796 first_item = data[0] 2797 dataset_id = None 2798 2799 if hasattr(first_item, "dataset_id"): 2800 dataset_id = getattr(first_item, "dataset_id", None) 2801 2802 if dataset_id: 2803 project_id = self._get_project_id() 2804 2805 if project_id: 2806 dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}" 2807 2808 except Exception: 2809 pass # URL generation is optional 2810 2811 # Store run-level evaluations as scores 2812 for evaluation in run_evaluations: 2813 try: 2814 if dataset_run_id: 2815 self.create_score( 2816 dataset_run_id=dataset_run_id, 2817 name=evaluation.name or "<unknown>", 2818 value=evaluation.value, # type: ignore 2819 comment=evaluation.comment, 2820 metadata=evaluation.metadata, 2821 data_type=evaluation.data_type, # type: ignore 2822 config_id=evaluation.config_id, 2823 ) 2824 2825 except Exception as e: 2826 langfuse_logger.error(f"Failed to store run evaluation: {e}") 2827 2828 # Flush scores and traces 2829 self.flush() 2830 2831 return ExperimentResult( 
async def _process_experiment_item(
    self,
    item: ExperimentItem,
    task: Callable,
    evaluators: List[Callable],
    composite_evaluator: Optional[CompositeEvaluatorFunction],
    fallback_experiment_id: str,
    experiment_name: str,
    experiment_run_name: str,
    experiment_description: Optional[str],
    experiment_metadata: Optional[Dict[str, Any]] = None,
    dataset_version: Optional[datetime] = None,
) -> ExperimentItemResult:
    """Run the task and all evaluators for a single experiment item.

    The work happens inside an "experiment-item-run" observation. Items that
    expose ``id`` and ``dataset_id`` are linked to a dataset run first. The
    task output and every evaluation are recorded on the observation; each
    evaluation is also stored through ``create_score``.

    Args:
        item: The experiment item, either a dict or an object with attributes.
        task: Task invoked through ``_run_task`` with the item.
        evaluators: Item-level evaluators invoked through ``_run_evaluator``.
        composite_evaluator: Optional evaluator called with the collected
            evaluations; skipped when there are none.
        fallback_experiment_id: Experiment id used when no dataset run id
            was obtained.
        experiment_name: Experiment name stored in the observation metadata.
        experiment_run_name: Run name stored in the metadata and used for
            the dataset run item.
        experiment_description: Optional description of the run.
        experiment_metadata: Optional metadata merged into the observation
            metadata.
        dataset_version: Optional dataset version passed to the dataset run
            item creation.

    Returns:
        ExperimentItemResult with the item, task output, evaluations, trace
        id and dataset run id (None when no dataset run item was created).

    Raises:
        ValueError: If the item has no input.
        Exception: Any error raised while preparing or running the task is
            recorded on the observation and re-raised. Evaluator failures
            are only logged.
    """
    span_name = "experiment-item-run"

    with self.start_as_current_observation(name=span_name) as span:
        try:
            # Dict items expose their fields as keys, other items as attributes.
            input_data = (
                item.get("input")
                if isinstance(item, dict)
                else getattr(item, "input", None)
            )

            if input_data is None:
                raise ValueError("Experiment Item is missing input. Skipping item.")

            expected_output = (
                item.get("expected_output")
                if isinstance(item, dict)
                else getattr(item, "expected_output", None)
            )

            item_metadata = (
                item.get("metadata")
                if isinstance(item, dict)
                else getattr(item, "metadata", None)
            )

            # Experiment metadata may override the two name keys; item
            # metadata is merged last (below) and wins over both.
            final_observation_metadata = {
                "experiment_name": experiment_name,
                "experiment_run_name": experiment_run_name,
                **(experiment_metadata or {}),
            }

            trace_id = span.trace_id
            dataset_id = None
            dataset_item_id = None
            dataset_run_id = None

            # Link to dataset run if this is a dataset item
            if hasattr(item, "id") and hasattr(item, "dataset_id"):
                try:
                    # Use sync API to avoid event loop issues when run_async_safely
                    # creates multiple event loops across different threads
                    dataset_run_item = await asyncio.to_thread(
                        self.api.dataset_run_items.create,
                        run_name=experiment_run_name,
                        run_description=experiment_description,
                        metadata=experiment_metadata,
                        dataset_item_id=item.id,  # type: ignore
                        trace_id=trace_id,
                        observation_id=span.id,
                        dataset_version=dataset_version,
                    )

                    dataset_run_id = dataset_run_item.dataset_run_id

                except Exception as e:
                    # Linking is best effort; the item is still processed.
                    langfuse_logger.error(f"Failed to create dataset run item: {e}")

            if (
                not isinstance(item, dict)
                and hasattr(item, "dataset_id")
                and hasattr(item, "id")
            ):
                dataset_id = item.dataset_id
                dataset_item_id = item.id

                final_observation_metadata.update(
                    {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id}
                )

            if isinstance(item_metadata, dict):
                final_observation_metadata.update(item_metadata)

            experiment_id = dataset_run_id or fallback_experiment_id
            # Without a dataset item id, use the first 16 characters of the
            # hash of the serialized input as the item id.
            experiment_item_id = (
                dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16]
            )
            # Only attributes that have a value are set on the OTel span.
            span._otel_span.set_attributes(
                {
                    k: v
                    for k, v in {
                        LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT,
                        LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description,
                        LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize(
                            expected_output
                        ),
                    }.items()
                    if v is not None
                }
            )

            # NOTE(review): experiment_name receives the *run* name here —
            # confirm this is intended.
            propagated_experiment_attributes = PropagatedExperimentAttributes(
                experiment_id=experiment_id,
                experiment_name=experiment_run_name,
                experiment_metadata=_flatten_and_serialize_metadata_values(
                    experiment_metadata
                ),
                experiment_dataset_id=dataset_id,
                experiment_item_id=experiment_item_id,
                experiment_item_metadata=_flatten_and_serialize_metadata_values(
                    item_metadata if isinstance(item_metadata, dict) else None
                ),
                experiment_item_root_observation_id=span.id,
            )

            with _propagate_attributes(experiment=propagated_experiment_attributes):
                output = await _run_task(task, item)

            span.update(
                input=input_data,
                output=output,
                metadata=final_observation_metadata,
            )

        except Exception as e:
            # Record the failure on the observation, then let the caller see it.
            span.update(
                output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
            )
            raise e

        # Run evaluators
        evaluations = []

        for evaluator in evaluators:
            try:
                eval_metadata: Optional[Dict[str, Any]] = None

                if isinstance(item, dict):
                    eval_metadata = item.get("metadata")
                elif hasattr(item, "metadata"):
                    eval_metadata = item.metadata

                with _propagate_attributes(
                    experiment=propagated_experiment_attributes
                ):
                    eval_results = await _run_evaluator(
                        evaluator,
                        input=input_data,
                        output=output,
                        expected_output=expected_output,
                        metadata=eval_metadata,
                    )
                    evaluations.extend(eval_results)

                    # Store evaluations as scores
                    for evaluation in eval_results:
                        self.create_score(
                            trace_id=trace_id,
                            observation_id=span.id,
                            name=evaluation.name,
                            value=evaluation.value,  # type: ignore
                            comment=evaluation.comment,
                            metadata=evaluation.metadata,
                            config_id=evaluation.config_id,
                            data_type=evaluation.data_type,  # type: ignore
                        )

            except Exception as e:
                # One failing evaluator must not prevent the others from running.
                langfuse_logger.error(f"Evaluator failed: {e}")

        # Run composite evaluator if provided and we have evaluations
        if composite_evaluator and evaluations:
            try:
                composite_eval_metadata: Optional[Dict[str, Any]] = None
                if isinstance(item, dict):
                    composite_eval_metadata = item.get("metadata")
                elif hasattr(item, "metadata"):
                    composite_eval_metadata = item.metadata

                with _propagate_attributes(
                    experiment=propagated_experiment_attributes
                ):
                    result = composite_evaluator(
                        input=input_data,
                        output=output,
                        expected_output=expected_output,
                        metadata=composite_eval_metadata,
                        evaluations=evaluations,
                    )

                    # Handle async composite evaluators
                    if asyncio.iscoroutine(result):
                        result = await result

                    # Normalize to list
                    # NOTE(review): a dict result is accepted here, but the
                    # loop below reads attributes (.name, .value) — confirm
                    # dict results are supported.
                    composite_evals: List[Evaluation] = []
                    if isinstance(result, (dict, Evaluation)):
                        composite_evals = [result]  # type: ignore
                    elif isinstance(result, list):
                        composite_evals = result  # type: ignore

                    # Store composite evaluations as scores and add to evaluations list
                    for composite_evaluation in composite_evals:
                        self.create_score(
                            trace_id=trace_id,
                            observation_id=span.id,
                            name=composite_evaluation.name,
                            value=composite_evaluation.value,  # type: ignore
                            comment=composite_evaluation.comment,
                            metadata=composite_evaluation.metadata,
                            config_id=composite_evaluation.config_id,
                            data_type=composite_evaluation.data_type,  # type: ignore
                        )
                        evaluations.append(composite_evaluation)

            except Exception as e:
                langfuse_logger.error(f"Composite evaluator failed: {e}")

        return ExperimentItemResult(
            item=item,
            output=output,
            evaluations=evaluations,
            trace_id=trace_id,
            dataset_run_id=dataset_run_id,
        )
return ExperimentItemResult( 3063 item=item, 3064 output=output, 3065 evaluations=evaluations, 3066 trace_id=trace_id, 3067 dataset_run_id=dataset_run_id, 3068 ) 3069 3070 def _create_experiment_run_name( 3071 self, *, name: Optional[str] = None, run_name: Optional[str] = None 3072 ) -> str: 3073 if run_name: 3074 return run_name 3075 3076 iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z") 3077 3078 return f"{name} - {iso_timestamp}" 3079 3080 def run_batched_evaluation( 3081 self, 3082 *, 3083 scope: Literal["traces", "observations"], 3084 mapper: MapperFunction, 3085 filter: Optional[str] = None, 3086 fetch_batch_size: int = 50, 3087 fetch_trace_fields: Optional[str] = None, 3088 max_items: Optional[int] = None, 3089 max_retries: int = 3, 3090 evaluators: List[EvaluatorFunction], 3091 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 3092 max_concurrency: int = 5, 3093 metadata: Optional[Dict[str, Any]] = None, 3094 _add_observation_scores_to_trace: bool = False, 3095 _additional_trace_tags: Optional[List[str]] = None, 3096 resume_from: Optional[BatchEvaluationResumeToken] = None, 3097 verbose: bool = False, 3098 ) -> BatchEvaluationResult: 3099 """Fetch traces or observations and run evaluations on each item. 3100 3101 This method provides a powerful way to evaluate existing data in Langfuse at scale. 3102 It fetches items based on filters, transforms them using a mapper function, runs 3103 evaluators on each item, and creates scores that are linked back to the original 3104 entities. This is ideal for: 3105 3106 - Running evaluations on production traces after deployment 3107 - Backtesting new evaluation metrics on historical data 3108 - Batch scoring of observations for quality monitoring 3109 - Periodic evaluation runs on recent data 3110 3111 The method uses a streaming/pipeline approach to process items in batches, making 3112 it memory-efficient for large datasets. 
def run_batched_evaluation(
    self,
    *,
    scope: Literal["traces", "observations"],
    mapper: MapperFunction,
    filter: Optional[str] = None,
    fetch_batch_size: int = 50,
    fetch_trace_fields: Optional[str] = None,
    max_items: Optional[int] = None,
    max_retries: int = 3,
    evaluators: List[EvaluatorFunction],
    composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
    max_concurrency: int = 5,
    metadata: Optional[Dict[str, Any]] = None,
    _add_observation_scores_to_trace: bool = False,
    _additional_trace_tags: Optional[List[str]] = None,
    resume_from: Optional[BatchEvaluationResumeToken] = None,
    verbose: bool = False,
) -> BatchEvaluationResult:
    """Fetch traces or observations and run evaluations on each item.

    Items matching ``filter`` are fetched in batches, converted to evaluator
    inputs by ``mapper``, evaluated, and the resulting scores are linked back
    to the original entities. Typical uses:

    - Running evaluations on production traces after deployment
    - Backtesting new evaluation metrics on historical data
    - Batch scoring of observations for quality monitoring
    - Periodic evaluation runs on recent data

    Processing is streamed batch by batch, which keeps memory usage bounded
    for large datasets. The work is delegated to ``BatchEvaluationRunner``
    and executed through ``run_async_safely``.

    Args:
        scope: The type of items to evaluate. Must be one of:
            - "traces": Evaluate complete traces with all their observations
            - "observations": Evaluate individual observations (spans, generations, events)
        mapper: Function that transforms an API trace/observation object into
            an EvaluatorInputs instance (input, output, expected_output,
            metadata). Can be sync or async.
        evaluators: Evaluation functions to run on each item. Each receives
            the mapped inputs and returns Evaluation object(s). Evaluator
            failures are logged but don't stop the batch evaluation.
        filter: Optional JSON filter string (same format as the Langfuse API).
            Examples:
            - '{"tags": ["production"]}'
            - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
            Default: None (fetches all items).
        fetch_batch_size: Number of items to fetch per API call and hold in
            memory. Larger values may be faster but use more memory.
            Default: 50.
        fetch_trace_fields: Comma-separated list of field groups to include
            when fetching traces: 'core' (always included), 'io' (input,
            output, metadata), 'scores', 'observations', 'metrics'. If not
            specified, all fields are returned. Example: 'core,scores,metrics'.
            Excluded 'observations' or 'scores' return empty arrays; excluded
            'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant
            if scope is 'traces'.
        max_items: Maximum total number of items to process. None processes
            all items matching the filter. Default: None.
        max_concurrency: Maximum number of items to evaluate concurrently.
            Default: 5.
        composite_evaluator: Optional function that creates a composite score
            from item-level evaluations. Receives the original item and its
            evaluations and returns a single Evaluation. Default: None.
        metadata: Optional metadata dict to add to all created scores.
            Default: None.
        max_retries: Maximum number of retry attempts for failed batch
            fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
        verbose: If True, logs progress information to console.
            Default: False.
        resume_from: Optional resume token from a previous incomplete run.
            Default: None.

    Returns:
        BatchEvaluationResult containing:
            - total_items_fetched: Number of items fetched from API
            - total_items_processed: Number of items successfully evaluated
            - total_items_failed: Number of items that failed evaluation
            - total_scores_created: Scores created by item-level evaluators
            - total_composite_scores_created: Scores created by composite evaluator
            - total_evaluations_failed: Individual evaluator failures
            - evaluator_stats: Per-evaluator statistics (success rate, scores created)
            - resume_token: Token for resuming if incomplete (None if completed)
            - completed: True if all items processed
            - duration_seconds: Total execution time
            - failed_item_ids: IDs of items that failed
            - error_summary: Error types and counts
            - has_more_items: True if max_items reached but more exist

    Raises:
        ValueError: If invalid scope is provided.

    Examples:
        Basic trace evaluation:
        ```python
        from langfuse import Langfuse, EvaluatorInputs, Evaluation

        client = Langfuse()

        def trace_mapper(trace):
            return EvaluatorInputs(
                input=trace.input,
                output=trace.output,
                expected_output=None,
                metadata={"trace_id": trace.id}
            )

        def length_evaluator(*, input, output, expected_output, metadata):
            return Evaluation(
                name="output_length",
                value=len(output) if output else 0
            )

        result = client.run_batched_evaluation(
            scope="traces",
            mapper=trace_mapper,
            evaluators=[length_evaluator],
            filter='{"tags": ["production"]}',
            max_items=1000,
            verbose=True
        )

        print(f"Processed {result.total_items_processed} traces")
        print(f"Created {result.total_scores_created} scores")
        ```

        Evaluation with composite scorer:
        ```python
        def accuracy_evaluator(*, input, output, expected_output, metadata):
            return Evaluation(name="accuracy", value=0.85)

        def relevance_evaluator(*, input, output, expected_output, metadata):
            return Evaluation(name="relevance", value=0.92)

        def composite_evaluator(*, item, evaluations):
            # Weighted average of evaluations
            weights = {"accuracy": 0.6, "relevance": 0.4}
            total = sum(
                e.value * weights.get(e.name, 0)
                for e in evaluations
                if isinstance(e.value, (int, float))
            )
            return Evaluation(
                name="composite_score",
                value=total,
                comment=f"Weighted average of {len(evaluations)} metrics"
            )

        result = client.run_batched_evaluation(
            scope="traces",
            mapper=trace_mapper,
            evaluators=[accuracy_evaluator, relevance_evaluator],
            composite_evaluator=composite_evaluator,
            filter='{"user_id": "important_user"}',
            verbose=True
        )
        ```

        Handling incomplete runs with resume:
        ```python
        result = client.run_batched_evaluation(
            scope="observations",
            mapper=obs_mapper,
            evaluators=[my_evaluator],
            max_items=10000,
            verbose=True
        )

        if not result.completed and result.resume_token:
            print(f"Processed {result.resume_token.items_processed} items before interruption")

            # Resume from where it left off
            result = client.run_batched_evaluation(
                scope="observations",
                mapper=obs_mapper,
                evaluators=[my_evaluator],
                resume_from=result.resume_token,
                verbose=True
            )

        print(f"Total items processed: {result.total_items_processed}")
        ```

        Monitoring evaluator performance:
        ```python
        result = client.run_batched_evaluation(...)

        for stats in result.evaluator_stats:
            success_rate = stats.successful_runs / stats.total_runs
            print(f"{stats.name}:")
            print(f"  Success rate: {success_rate:.1%}")
            print(f"  Scores created: {stats.total_scores_created}")

            if stats.failed_runs > 0:
                print(f"  ⚠️ Failed {stats.failed_runs} times")
        ```

    Note:
        - Evaluator failures are logged but don't stop the batch evaluation
        - Individual item failures are tracked but don't stop processing
        - Fetch failures are retried with exponential backoff
        - All scores are automatically flushed to Langfuse at the end
        - The resume mechanism uses timestamp-based filtering to avoid duplicates
    """
    batch_runner = BatchEvaluationRunner(self)

    # Build the coroutine first, then drive it to completion synchronously.
    evaluation_run = batch_runner.run_async(
        scope=scope,
        mapper=mapper,
        evaluators=evaluators,
        filter=filter,
        fetch_batch_size=fetch_batch_size,
        fetch_trace_fields=fetch_trace_fields,
        max_items=max_items,
        max_concurrency=max_concurrency,
        composite_evaluator=composite_evaluator,
        metadata=metadata,
        _add_observation_scores_to_trace=_add_observation_scores_to_trace,
        _additional_trace_tags=_additional_trace_tags,
        max_retries=max_retries,
        verbose=verbose,
        resume_from=resume_from,
    )
    outcome = run_async_safely(evaluation_run)

    return cast(BatchEvaluationResult, outcome)
def auth_check(self) -> bool:
    """Check if the provided credentials (public and secret key) are valid.

    Returns:
        True when at least one project is found for the credentials. False
        when an AttributeError occurs during the check, which is reported as
        the client not being properly initialized.

    Raises:
        Exception: If no projects were found for the provided credentials.
        Error: API errors are passed to ``handle_fern_exception`` and
            re-raised.

    Note:
        This method is blocking. It is discouraged to use it in production code.
    """
    try:
        projects = self.api.projects.get()
        project_count = len(projects.data)

        # Decide before logging, so a failed check never logs "successful".
        if project_count == 0:
            raise Exception(
                "Auth check failed, no project found for the keys provided."
            )

        langfuse_logger.debug(
            f"Auth check successful, found {project_count} projects"
        )
        return True

    except AttributeError as e:
        langfuse_logger.warning(
            f"Auth check failed: Client not properly initialized. Error: {e}"
        )
        return False

    except Error as e:
        handle_fern_exception(e)
        raise


def create_dataset(
    self,
    *,
    name: str,
    description: Optional[str] = None,
    metadata: Optional[Any] = None,
    input_schema: Optional[Any] = None,
    expected_output_schema: Optional[Any] = None,
) -> Dataset:
    """Create a dataset with the given name on Langfuse.

    Args:
        name: Name of the dataset to create.
        description: Description of the dataset. Defaults to None.
        metadata: Additional metadata. Defaults to None.
        input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
        expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.

    Returns:
        Dataset: The created dataset as returned by the Langfuse API.

    Raises:
        Error: API errors are passed to ``handle_fern_exception`` and
            re-raised.
    """
    try:
        langfuse_logger.debug(f"Creating dataset {name}")

        result = self.api.datasets.create(
            name=name,
            description=description,
            metadata=metadata,
            input_schema=input_schema,
            expected_output_schema=expected_output_schema,
        )

        return cast(Dataset, result)

    except Error as e:
        handle_fern_exception(e)
        raise
def create_dataset_item(
    self,
    *,
    dataset_name: str,
    input: Optional[Any] = None,
    expected_output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    source_trace_id: Optional[str] = None,
    source_observation_id: Optional[str] = None,
    status: Optional[DatasetStatus] = None,
    id: Optional[str] = None,
) -> DatasetItem:
    """Create a dataset item, uploading any media it contains.

    Upserts if an item with id already exists.

    Args:
        dataset_name: Name of the dataset in which the dataset item should be created.
        input: Input data. Defaults to None. Can contain any dict, list or scalar.
        expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
        metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
        source_trace_id: Id of the source trace. Defaults to None.
        source_observation_id: Id of the source observation. Defaults to None.
        status: Status of the dataset item. Defaults to ACTIVE for newly created items.
        id: Id of the dataset item. Defaults to None, in which case a UUID is generated. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.

    Returns:
        DatasetItem: The created dataset item as returned by the Langfuse API.

    Raises:
        Error: API errors are passed to ``handle_fern_exception`` and
            re-raised.

    Example:
        ```python
        from langfuse import Langfuse

        langfuse = Langfuse()

        # Uploading items to the Langfuse dataset named "capital_cities"
        langfuse.create_dataset_item(
            dataset_name="capital_cities",
            input={"input": {"country": "Italy"}},
            expected_output={"expected_output": "Rome"},
            metadata={"foo": "bar"}
        )
        ```
    """
    try:
        langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")

        # Media uploads reference the item they belong to, and the item need
        # not exist yet, so the id is fixed before anything else happens.
        item_id = str(uuid.uuid4()) if id is None else id

        # One pass per field: media objects become reference strings and the
        # media still to upload is collected, deduplicated by media id.
        pending_media: Dict[str, Tuple[LangfuseMedia, str]] = {}
        field_inputs = (
            ("input", input, DatasetItemMediaReferenceField.INPUT.value),
            (
                "expected_output",
                expected_output,
                DatasetItemMediaReferenceField.EXPECTED_OUTPUT.value,
            ),
            ("metadata", metadata, DatasetItemMediaReferenceField.METADATA.value),
        )
        processed_fields: Dict[str, Any] = {}
        for attr_name, raw_value, media_field in field_inputs:
            processed_fields[attr_name] = self._process_dataset_item_media(
                data=raw_value,
                pending_media=pending_media,
                field=media_field,
            )

        # The upload needs the dataset id while the create API only takes the
        # name; resolve it only when there is media to upload.
        if pending_media:
            assert self._resources is not None
            dataset_id = self.api.datasets.get(self._url_encode(dataset_name)).id
            media_manager_owner = self._resources
            for media, media_field in pending_media.values():
                media_manager_owner._media_manager._upload_media_sync(
                    media=media,
                    dataset_id=dataset_id,
                    dataset_item_id=item_id,
                    field=media_field,
                )

        created_item = self.api.dataset_items.create(
            dataset_name=dataset_name,
            input=processed_fields["input"],
            expected_output=processed_fields["expected_output"],
            metadata=processed_fields["metadata"],
            source_trace_id=source_trace_id,
            source_observation_id=source_observation_id,
            status=status,
            id=item_id,
        )

        return cast(DatasetItem, created_item)
    except Error as e:
        handle_fern_exception(e)
        raise e
3497 """ 3498 if self._resources is None: 3499 return data 3500 3501 max_levels = 10 3502 3503 def _process_data_recursively( 3504 data: Any, level: int, ancestor_container_ids: set[int] 3505 ) -> Any: 3506 if isinstance(data, LangfuseMedia): 3507 reference_string = data._reference_string 3508 media_id = data._media_id 3509 if reference_string is None or media_id is None: 3510 raise ValueError( 3511 "Cannot create dataset item with invalid LangfuseMedia." 3512 ) 3513 # First field a media appears in wins; later duplicates dedupe. 3514 pending_media.setdefault(media_id, (data, field)) 3515 return reference_string 3516 3517 if isinstance(data, LangfuseMediaReference): 3518 return data.reference_string if data.reference_string else data 3519 3520 # Tuples are intentionally excluded: namedtuple subclasses can't be 3521 # rebuilt from an iterable, so media inside them is left untouched. 3522 if not isinstance(data, (list, set, frozenset, dict)): 3523 return data 3524 3525 # Container ids only protect against recursive cycles. 3526 data_id = id(data) 3527 if data_id in ancestor_container_ids or level > max_levels: 3528 return data 3529 3530 next_ancestor_container_ids = ancestor_container_ids | {data_id} 3531 3532 if isinstance(data, (list, set, frozenset)): 3533 processed = ( 3534 _process_data_recursively( 3535 item, level + 1, next_ancestor_container_ids 3536 ) 3537 for item in data 3538 ) 3539 return type(data)(processed) 3540 3541 return { 3542 key: _process_data_recursively( 3543 value, level + 1, next_ancestor_container_ids 3544 ) 3545 for key, value in data.items() 3546 } 3547 3548 return _process_data_recursively(data, 1, set()) 3549 3550 def _hydrate_dataset_item_media_references(self, item: DatasetItem) -> DatasetItem: 3551 media_references = item.media_references or [] 3552 if not media_references: 3553 return item 3554 3555 # Map the API enum member to the snake_case model attribute so this keeps 3556 # working regardless of the enum's wire value (e.g. 
"expectedOutput"). 3557 attr_by_field = { 3558 DatasetItemMediaReferenceField.INPUT: "input", 3559 DatasetItemMediaReferenceField.EXPECTED_OUTPUT: "expected_output", 3560 DatasetItemMediaReferenceField.METADATA: "metadata", 3561 } 3562 hydrated_fields = { 3563 "input": item.input, 3564 "expected_output": item.expected_output, 3565 "metadata": item.metadata, 3566 } 3567 3568 for media_reference in media_references: 3569 media = media_reference.media 3570 field = attr_by_field.get(media_reference.field) 3571 if field is None: 3572 continue 3573 3574 replacement = LangfuseMediaReference( 3575 media_id=media.media_id, 3576 content_type=media.content_type, 3577 url=media.url, 3578 url_expiry=media.url_expiry, 3579 content_length=media.content_length, 3580 reference_string=media_reference.reference_string, 3581 ) 3582 hydrated_fields[field] = self._replace_json_path_value( 3583 value=hydrated_fields[field], 3584 path=media_reference.json_path, 3585 replacement=replacement, 3586 ) 3587 3588 return item.model_copy( 3589 update={ 3590 "input": hydrated_fields["input"], 3591 "expected_output": hydrated_fields["expected_output"], 3592 "metadata": hydrated_fields["metadata"], 3593 } 3594 ) 3595 3596 def _replace_json_path_value( 3597 self, *, value: Any, path: str, replacement: LangfuseMediaReference 3598 ) -> Any: 3599 try: 3600 return json_path.set_value_at_path(value, path, replacement) 3601 except Exception as e: 3602 langfuse_logger.warning( 3603 f"Failed to hydrate dataset media reference at JSONPath {path}", 3604 exc_info=e, 3605 ) 3606 3607 return value 3608 3609 def resolve_media_references( 3610 self, 3611 *, 3612 obj: Any, 3613 resolve_with: Literal["base64_data_uri"], 3614 max_depth: int = 10, 3615 content_fetch_timeout_seconds: int = 5, 3616 ) -> Any: 3617 """Replace media reference strings in an object with base64 data URIs. 
def resolve_media_references(
    self,
    *,
    obj: Any,
    resolve_with: Literal["base64_data_uri"],
    max_depth: int = 10,
    content_fetch_timeout_seconds: int = 5,
) -> Any:
    """Replace media reference strings in an object with base64 data URIs.

    This method recursively traverses an object (up to max_depth) looking for media reference strings
    in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
    this client and replaces the reference string with a base64 data URI.

    If fetching media content fails for a reference string, a warning is logged and the reference
    string is left unchanged.

    The call is forwarded to ``LangfuseMedia.resolve_media_references`` with
    this client as ``langfuse_client``.

    Args:
        obj: The object to process. Can be a primitive value, array, or nested object.
            If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
        resolve_with: The representation of the media content to replace the media reference string with.
            Currently only "base64_data_uri" is supported.
        max_depth: The maximum depth to traverse the object. Default is 10.
        content_fetch_timeout_seconds: The timeout in seconds for fetching media content. Default is 5.

    Returns:
        A deep copy of the input object with all media references replaced with base64 data URIs where possible.
        If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

    Example:
        obj = {
            "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
            "nested": {
                "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
            }
        }

        # Synchronous call with keyword-only arguments (no ``await``).
        result = langfuse_client.resolve_media_references(
            obj=obj, resolve_with="base64_data_uri"
        )

        # Result:
        # {
        #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
        #     "nested": {
        #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
        #     }
        # }
    """
    return LangfuseMedia.resolve_media_references(
        langfuse_client=self,
        obj=obj,
        resolve_with=resolve_with,
        max_depth=max_depth,
        content_fetch_timeout_seconds=content_fetch_timeout_seconds,
    )
@overload
def get_prompt(
    self,
    name: str,
    *,
    version: Optional[int] = None,
    label: Optional[str] = None,
    type: Literal["chat"],
    cache_ttl_seconds: Optional[int] = None,
    fallback: Optional[List[ChatMessageDict]] = None,
    max_retries: Optional[int] = None,
    fetch_timeout_seconds: Optional[int] = None,
) -> ChatPromptClient: ...


@overload
def get_prompt(
    self,
    name: str,
    *,
    version: Optional[int] = None,
    label: Optional[str] = None,
    type: Literal["text"] = "text",
    cache_ttl_seconds: Optional[int] = None,
    fallback: Optional[str] = None,
    max_retries: Optional[int] = None,
    fetch_timeout_seconds: Optional[int] = None,
) -> TextPromptClient: ...


def get_prompt(
    self,
    name: str,
    *,
    version: Optional[int] = None,
    label: Optional[str] = None,
    type: Literal["chat", "text"] = "text",
    cache_ttl_seconds: Optional[int] = None,
    fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
    max_retries: Optional[int] = None,
    fetch_timeout_seconds: Optional[int] = None,
) -> PromptClient:
    """Get a prompt.

    The prompt is looked up in the local cache first. When it is missing, or
    caching is disabled with ``cache_ttl_seconds=0``, it is fetched from the
    server; if that fetch fails and a ``fallback`` was given, a fallback
    prompt client is returned instead. When the cached prompt has expired, the
    stale prompt is returned immediately and a refresh is scheduled in the
    background.

    Args:
        name (str): The name of the prompt to retrieve.

    Keyword Args:
        version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
        label (Optional[str]): The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
        cache_ttl_seconds (Optional[int]): Time-to-live in seconds for caching the prompt. Must be specified as a
            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
        type (Literal["chat", "text"]): The type of the prompt to retrieve. Defaults to "text".
        fallback (Union[Optional[List[ChatMessageDict]], Optional[str]]): The prompt to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
        max_retries (Optional[int]): The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4.
        fetch_timeout_seconds (Optional[int]): The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.

    Returns:
        The prompt retrieved from the cache or fetched from the server:
        - TextPromptClient, if type argument is 'text'.
        - ChatPromptClient, if type argument is 'chat'.

    Raises:
        Error: If the SDK is not correctly initialized.
        ValueError: If both ``version`` and ``label`` are given, or ``name``
            is empty.
        Exception: Propagates any exception raised while fetching a prompt
            that is not cached, unless a ``fallback`` is provided.
    """
    if self._resources is None:
        raise Error(
            "SDK is not correctly initialized. Check the init logs for more details."
        )
    if version is not None and label is not None:
        raise ValueError("Cannot specify both version and label at the same time.")

    if not name:
        raise ValueError("Prompt name cannot be empty.")

    cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
    bounded_max_retries = self._get_bounded_max_retries(
        max_retries, default_max_retries=2, max_retries_upper_bound=4
    )

    langfuse_logger.debug(f"Getting prompt '{cache_key}'")
    cached_prompt = self._resources.prompt_cache.get(cache_key)

    if cached_prompt is None or cache_ttl_seconds == 0:
        langfuse_logger.debug(
            f"Prompt '{cache_key}' not found in cache or caching disabled."
        )
        try:
            return self._fetch_prompt_and_update_cache(
                name,
                version=version,
                label=label,
                ttl_seconds=cache_ttl_seconds,
                max_retries=bounded_max_retries,
                fetch_timeout_seconds=fetch_timeout_seconds,
            )
        except Exception as e:
            if fallback:
                langfuse_logger.warning(
                    f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
                )

                fallback_client_args: Dict[str, Any] = {
                    "name": name,
                    "prompt": fallback,
                    "type": type,
                    "version": version or 0,
                    "config": {},
                    "labels": [label] if label else [],
                    "tags": [],
                }

                if type == "text":
                    return TextPromptClient(
                        prompt=Prompt_Text(**fallback_client_args),
                        is_fallback=True,
                    )

                if type == "chat":
                    return ChatPromptClient(
                        prompt=Prompt_Chat(**fallback_client_args),
                        is_fallback=True,
                    )

            # No usable fallback: surface the original fetch error.
            raise

    if cached_prompt.is_expired():
        langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
        try:
            # refresh prompt in background thread, refresh_prompt deduplicates tasks
            langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")

            def refresh_task() -> None:
                self._fetch_prompt_and_update_cache(
                    name,
                    version=version,
                    label=label,
                    ttl_seconds=cache_ttl_seconds,
                    max_retries=bounded_max_retries,
                    fetch_timeout_seconds=fetch_timeout_seconds,
                )

            self._resources.prompt_cache.add_refresh_prompt_task_if_current(
                cache_key,
                cached_prompt,
                refresh_task,
            )
            langfuse_logger.debug(
                f"Returning stale prompt '{cache_key}' from cache."
            )
            # return stale prompt
            return cached_prompt.value

        except Exception as e:
            langfuse_logger.warning(
                f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
            )
            # creation of refresh prompt task failed, return stale prompt
            return cached_prompt.value

    return cached_prompt.value
self._fetch_prompt_and_update_cache( 3802 name, 3803 version=version, 3804 label=label, 3805 ttl_seconds=cache_ttl_seconds, 3806 max_retries=bounded_max_retries, 3807 fetch_timeout_seconds=fetch_timeout_seconds, 3808 ) 3809 3810 self._resources.prompt_cache.add_refresh_prompt_task_if_current( 3811 cache_key, 3812 cached_prompt, 3813 refresh_task, 3814 ) 3815 langfuse_logger.debug( 3816 f"Returning stale prompt '{cache_key}' from cache." 3817 ) 3818 # return stale prompt 3819 return cached_prompt.value 3820 3821 except Exception as e: 3822 langfuse_logger.warning( 3823 f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}" 3824 ) 3825 # creation of refresh prompt task failed, return stale prompt 3826 return cached_prompt.value 3827 3828 return cached_prompt.value 3829 3830 def _fetch_prompt_and_update_cache( 3831 self, 3832 name: str, 3833 *, 3834 version: Optional[int] = None, 3835 label: Optional[str] = None, 3836 ttl_seconds: Optional[int] = None, 3837 max_retries: int, 3838 fetch_timeout_seconds: Optional[int], 3839 ) -> PromptClient: 3840 cache_key = PromptCache.generate_cache_key(name, version=version, label=label) 3841 langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...") 3842 3843 try: 3844 3845 @backoff.on_exception( 3846 backoff.constant, Exception, max_tries=max_retries + 1, logger=None 3847 ) 3848 def fetch_prompts() -> Any: 3849 return self.api.prompts.get( 3850 self._url_encode(name), 3851 version=version, 3852 label=label, 3853 request_options={ 3854 "timeout_in_seconds": fetch_timeout_seconds, 3855 } 3856 if fetch_timeout_seconds is not None 3857 else None, 3858 ) 3859 3860 prompt_response = fetch_prompts() 3861 3862 prompt: PromptClient 3863 if prompt_response.type == "chat": 3864 prompt = ChatPromptClient(prompt_response) 3865 else: 3866 prompt = TextPromptClient(prompt_response) 3867 3868 if self._resources is not None: 3869 self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds) 3870 
3871 return prompt 3872 3873 except NotFoundError as not_found_error: 3874 langfuse_logger.warning( 3875 f"Prompt '{cache_key}' not found during refresh, evicting from cache." 3876 ) 3877 if self._resources is not None: 3878 self._resources.prompt_cache.delete(cache_key) 3879 raise not_found_error 3880 3881 except Exception as e: 3882 langfuse_logger.error( 3883 f"Error while fetching prompt '{cache_key}': {str(e)}" 3884 ) 3885 raise e 3886 3887 def _get_bounded_max_retries( 3888 self, 3889 max_retries: Optional[int], 3890 *, 3891 default_max_retries: int = 2, 3892 max_retries_upper_bound: int = 4, 3893 ) -> int: 3894 if max_retries is None: 3895 return default_max_retries 3896 3897 bounded_max_retries = min( 3898 max(max_retries, 0), 3899 max_retries_upper_bound, 3900 ) 3901 3902 return bounded_max_retries 3903 3904 @overload 3905 def create_prompt( 3906 self, 3907 *, 3908 name: str, 3909 prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]], 3910 labels: List[str] = [], 3911 tags: Optional[List[str]] = None, 3912 type: Optional[Literal["chat"]], 3913 config: Optional[Any] = None, 3914 commit_message: Optional[str] = None, 3915 ) -> ChatPromptClient: ... 3916 3917 @overload 3918 def create_prompt( 3919 self, 3920 *, 3921 name: str, 3922 prompt: str, 3923 labels: List[str] = [], 3924 tags: Optional[List[str]] = None, 3925 type: Optional[Literal["text"]] = "text", 3926 config: Optional[Any] = None, 3927 commit_message: Optional[str] = None, 3928 ) -> TextPromptClient: ... 3929 3930 def create_prompt( 3931 self, 3932 *, 3933 name: str, 3934 prompt: Union[ 3935 str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]] 3936 ], 3937 labels: List[str] = [], 3938 tags: Optional[List[str]] = None, 3939 type: Optional[Literal["chat", "text"]] = "text", 3940 config: Optional[Any] = None, 3941 commit_message: Optional[str] = None, 3942 ) -> PromptClient: 3943 """Create a new prompt in Langfuse. 
3944 3945 Keyword Args: 3946 name : The name of the prompt to be created. 3947 prompt : The content of the prompt to be created. 3948 is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead. 3949 labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label. 3950 tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt. 3951 config: Additional structured data to be saved with the prompt. Defaults to None. 3952 type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text". 3953 commit_message: Optional string describing the change. 3954 3955 Returns: 3956 TextPromptClient: The prompt if type argument is 'text'. 3957 ChatPromptClient: The prompt if type argument is 'chat'. 3958 """ 3959 try: 3960 langfuse_logger.debug(f"Creating prompt {name=}, {labels=}") 3961 3962 if type == "chat": 3963 if not isinstance(prompt, list): 3964 raise ValueError( 3965 "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes." 
3966 ) 3967 request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = ( 3968 CreateChatPromptRequest( 3969 name=name, 3970 prompt=cast(Any, prompt), 3971 labels=labels, 3972 tags=tags, 3973 config=config or {}, 3974 commit_message=commit_message, 3975 type=CreateChatPromptType.CHAT, 3976 ) 3977 ) 3978 server_prompt = self.api.prompts.create(request=request) 3979 3980 if self._resources is not None: 3981 self._resources.prompt_cache.invalidate(name) 3982 3983 return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt)) 3984 3985 if not isinstance(prompt, str): 3986 raise ValueError("For 'text' type, 'prompt' must be a string.") 3987 3988 request = CreateTextPromptRequest( 3989 name=name, 3990 prompt=prompt, 3991 labels=labels, 3992 tags=tags, 3993 config=config or {}, 3994 commit_message=commit_message, 3995 ) 3996 3997 server_prompt = self.api.prompts.create(request=request) 3998 3999 if self._resources is not None: 4000 self._resources.prompt_cache.invalidate(name) 4001 4002 return TextPromptClient(prompt=cast(Prompt_Text, server_prompt)) 4003 4004 except Error as e: 4005 handle_fern_exception(e) 4006 raise e 4007 4008 def update_prompt( 4009 self, 4010 *, 4011 name: str, 4012 version: int, 4013 new_labels: List[str] = [], 4014 ) -> Any: 4015 """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name. 4016 4017 Args: 4018 name (str): The name of the prompt to update. 4019 version (int): The version number of the prompt to update. 4020 new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to []. 4021 4022 Returns: 4023 Prompt: The updated prompt from the Langfuse API. 
4024 4025 """ 4026 updated_prompt = self.api.prompt_version.update( 4027 name=self._url_encode(name), 4028 version=version, 4029 new_labels=new_labels, 4030 ) 4031 4032 if self._resources is not None: 4033 self._resources.prompt_cache.invalidate(name) 4034 4035 return updated_prompt 4036 4037 def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str: 4038 # httpx ≥ 0.28 does its own WHATWG-compliant quoting (eg. encodes bare 4039 # “%”, “?”, “#”, “|”, … in query/path parts). Re-quoting here would 4040 # double-encode, so we skip when the value is about to be sent straight 4041 # to httpx (`is_url_param=True`) and the installed version is ≥ 0.28. 4042 if is_url_param and Version(httpx.__version__) >= Version("0.28.0"): 4043 return url 4044 4045 # urllib.parse.quote does not escape slashes "/" by default; we need to add safe="" to force escaping 4046 # we need add safe="" to force escaping of slashes 4047 # This is necessary for prompts in prompt folders 4048 return urllib.parse.quote(url, safe="") 4049 4050 def clear_prompt_cache(self) -> None: 4051 """Clear the entire prompt cache, removing all cached prompts. 4052 4053 This method is useful when you want to force a complete refresh of all 4054 cached prompts, for example after major updates or when you need to 4055 ensure the latest versions are fetched from the server. 4056 """ 4057 if self._resources is not None: 4058 self._resources.prompt_cache.clear()
Main client for Langfuse tracing and platform features.
This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.
The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.
Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.
Attributes:
- api: Synchronous API client for Langfuse backend communication
- async_api: Asynchronous API client for Langfuse backend communication
- _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
- public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
- secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
- base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
- host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
- timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
- httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
Fork safety:
`httpx.Client` is thread-safe but not process-safe. When using `fork()`-based servers (e.g. Gunicorn with `--preload`), the SDK automatically recreates its internally-managed HTTP client in child processes after fork. A custom `httpx_client` is intentionally left as-is (the fork-inherited copy is reused), so you retain the opportunity to handle process-safety yourself — for example by registering your own `os.register_at_fork(after_in_child=...)` handler to close and reopen connections on the custom client.
- debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
- tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
- flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
- flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
- environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
- release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
- media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
- sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
- mask (Optional[MaskFunction]): Function to mask sensitive data synchronously when Langfuse SDK attributes are created. This applies only to data set through Langfuse SDK APIs such as `start_observation()`, `update()`, and `set_trace_io()`.
- mask_otel_spans (Optional[MaskOtelSpansFunction]): Synchronous export-stage hook for masking raw OpenTelemetry span attributes before this Langfuse client sends them to Langfuse. Use this for spans created by third-party OpenTelemetry instrumentations, or when you need to inspect final span attributes after export filtering and Langfuse media handling. It does not modify spans already exported through other OpenTelemetry exporters.
The hook receives one OpenTelemetry export batch. A batch is not guaranteed to contain a complete trace, request, or Langfuse observation tree. The hook usually runs on the OpenTelemetry batch span processor worker thread; during `flush()` and shutdown it may run on the caller thread. Keep it synchronous, deterministic, and fast.

Return `None` to leave the batch unchanged. Return `MaskOtelSpansResult` with `OtelSpanPatch` values to delete or replace attributes on selected spans. If the hook raises or returns an invalid batch result, Langfuse drops the whole export batch. If one returned span patch is invalid, Langfuse drops only that span from the Langfuse export.

Example:
```python
from typing import Optional

from langfuse import Langfuse
from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)


def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}

    for identifier, span in params.spans.items():
        if "gen_ai.prompt.0.content" in span.attributes:
            patches[identifier] = OtelSpanPatch(
                delete_attributes=("gen_ai.prompt.0.content",),
                set_attributes={"masking.applied": True},
            )

    return MaskOtelSpansResult(span_patches=patches)


langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
```

- blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:

```python
from langfuse.span_filter import is_default_export_span

blocked = {"sqlite", "requests"}

should_export_span = lambda span: (
    is_default_export_span(span)
    and (
        span.instrumentation_scope is None
        or span.instrumentation_scope.name not in blocked
    )
)
```

- should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
- additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If `span_exporter` is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
- tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
- id_generator (Optional[IdGenerator]): OpenTelemetry ID generator to use when Langfuse creates its own TracerProvider. If omitted, the OpenTelemetry SDK default is used. If `tracer_provider` is provided, or an OpenTelemetry TracerProvider is already registered globally, configure the ID generator on that provider instead.
- span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire `base_url`, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include `x-langfuse-ingestion-version=4` on the exporter to enable real time processing of exported spans.
Example:
```python
from langfuse.otel import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    host="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_observation(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
```
def __init__(
    self,
    *,
    public_key: Optional[str] = None,
    secret_key: Optional[str] = None,
    base_url: Optional[str] = None,
    host: Optional[str] = None,
    timeout: Optional[int] = None,
    httpx_client: Optional[httpx.Client] = None,
    debug: bool = False,
    tracing_enabled: Optional[bool] = True,
    flush_at: Optional[int] = None,
    flush_interval: Optional[float] = None,
    environment: Optional[str] = None,
    release: Optional[str] = None,
    media_upload_thread_count: Optional[int] = None,
    sample_rate: Optional[float] = None,
    mask: Optional[MaskFunction] = None,
    mask_otel_spans: Optional[MaskOtelSpansFunction] = None,
    blocked_instrumentation_scopes: Optional[List[str]] = None,
    should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
    additional_headers: Optional[Dict[str, str]] = None,
    tracer_provider: Optional[TracerProvider] = None,
    id_generator: Optional[IdGenerator] = None,
    span_exporter: Optional[SpanExporter] = None,
):
    """Resolve configuration and set up the resource manager and tracer.

    Explicit arguments take precedence over environment variables. The base
    URL is resolved in this order: `base_url`, the LANGFUSE_BASE_URL
    environment variable, `host`, the LANGFUSE_HOST environment variable,
    and finally "https://cloud.langfuse.com".

    If no public key or no secret key can be resolved, a warning is logged,
    a no-op tracer is installed and initialization stops early; the resource
    manager is not created by this method in that case.

    Raises:
        ValueError: If the resolved sample rate is outside [0.0, 1.0].
    """
    self._base_url = (
        base_url
        or os.environ.get(LANGFUSE_BASE_URL)
        or host
        or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
    )
    self._environment = environment or cast(
        str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
    )
    self._release = (
        release
        or os.environ.get(LANGFUSE_RELEASE, None)
        or get_common_release_envs()
    )
    self._project_id: Optional[str] = None

    # Only fall back to the environment when the argument was omitted. A
    # plain `or` would treat an explicit 0.0 (sample nothing) as "not set"
    # and silently replace it with the environment value or 1.0.
    if sample_rate is None:
        sample_rate = float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
    if not 0.0 <= sample_rate <= 1.0:
        raise ValueError(
            f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
        )

    timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))

    # Tracing is on only if neither the argument nor the environment
    # variable disables it.
    self._tracing_enabled = (
        tracing_enabled
        and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
    )
    if not self._tracing_enabled:
        langfuse_logger.info(
            "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
        )

    debug = (
        debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
    )
    if debug:
        logging.basicConfig(
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )
        langfuse_logger.setLevel(logging.DEBUG)

    public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
    if public_key is None:
        langfuse_logger.warning(
            "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
            "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
        )
        self._otel_tracer = otel_trace_api.NoOpTracer()
        return

    secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
    if secret_key is None:
        langfuse_logger.warning(
            "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
            "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
        )
        self._otel_tracer = otel_trace_api.NoOpTracer()
        return

    if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
        langfuse_logger.warning(
            "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
        )

    if blocked_instrumentation_scopes is not None:
        warnings.warn(
            "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
            "Use `should_export_span` instead. Example: "
            "from langfuse.span_filter import is_default_export_span; "
            'blocked={"scope"}; should_export_span=lambda span: '
            "is_default_export_span(span) and (span.instrumentation_scope is None or "
            "span.instrumentation_scope.name not in blocked).",
            DeprecationWarning,
            stacklevel=2,
        )

    # Initialize api and tracer if requirements are met
    self._resources = LangfuseResourceManager(
        public_key=public_key,
        secret_key=secret_key,
        base_url=self._base_url,
        timeout=timeout,
        environment=self._environment,
        release=release,
        flush_at=flush_at,
        flush_interval=flush_interval,
        httpx_client=httpx_client,
        media_upload_thread_count=media_upload_thread_count,
        sample_rate=sample_rate,
        mask=mask,
        mask_otel_spans=mask_otel_spans,
        tracing_enabled=self._tracing_enabled,
        blocked_instrumentation_scopes=blocked_instrumentation_scopes,
        should_export_span=should_export_span,
        additional_headers=additional_headers,
        tracer_provider=tracer_provider,
        id_generator=id_generator,
        span_exporter=span_exporter,
    )
    self._mask = self._resources.mask

    # Fall back to a no-op tracer when tracing is disabled or the resource
    # manager did not provide a tracer.
    self._otel_tracer = (
        self._resources.tracer
        if self._tracing_enabled and self._resources.tracer is not None
        else otel_trace_api.NoOpTracer()
    )
def start_observation(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    as_type: ObservationTypeLiteralNoEvent = "span",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> Union[
    LangfuseSpan,
    LangfuseGeneration,
    LangfuseAgent,
    LangfuseTool,
    LangfuseChain,
    LangfuseRetriever,
    LangfuseEvaluator,
    LangfuseEmbedding,
    LangfuseGuardrail,
]:
    """Start an observation without making it the current span.

    The observation is created from a freshly started OpenTelemetry span but
    is not installed as the current span in the context; use
    start_as_current_observation() for that.

    When `trace_context` carries a truthy `trace_id`, the span is started
    under a remote parent span built from `trace_id` / `parent_span_id` and
    is flagged with the AS_ROOT attribute. Otherwise the span is started in
    whatever context is currently active.

    Args:
        trace_context: Optional context for connecting to an existing trace
        name: Name of the observation
        as_type: Type of observation to create (defaults to "span")
        input: Input data for the operation
        output: Output data from the operation
        metadata: Additional metadata to associate with the observation
        version: Version identifier for the code or component
        level: Importance level of the observation
        status_message: Optional status message for the observation
        completion_start_time: When the model started generating (for generation types)
        model: Name/identifier of the AI model used (for generation types)
        model_parameters: Parameters used for the model (for generation types)
        usage_details: Token usage information (for generation types)
        cost_details: Cost information (for generation types)
        prompt: Associated prompt template (for generation types)

    Returns:
        An observation object of the appropriate type that must be ended with .end()
    """
    # Everything forwarded to the observation factory besides the span
    # itself; identical for both code paths below.
    observation_kwargs: Dict[str, Any] = {
        "as_type": as_type,
        "input": input,
        "output": output,
        "metadata": metadata,
        "version": version,
        "level": level,
        "status_message": status_message,
        "completion_start_time": completion_start_time,
        "model": model,
        "model_parameters": model_parameters,
        "usage_details": usage_details,
        "cost_details": cost_details,
        "prompt": prompt,
    }

    if trace_context:
        linked_trace_id = trace_context.get("trace_id", None)
        linked_parent_span_id = trace_context.get("parent_span_id", None)

        if linked_trace_id:
            remote_parent = self._create_remote_parent_span(
                trace_id=linked_trace_id, parent_span_id=linked_parent_span_id
            )

            # Start the span, and build the observation, while the remote
            # parent is the active span.
            with otel_trace_api.use_span(
                cast(otel_trace_api.Span, remote_parent)
            ):
                rooted_span = self._otel_tracer.start_span(name=name)
                rooted_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                return self._create_observation_from_otel_span(
                    otel_span=rooted_span, **observation_kwargs
                )

    # No usable trace context: start the span in the ambient context.
    ambient_span = self._otel_tracer.start_span(name=name)

    return self._create_observation_from_otel_span(
        otel_span=ambient_span, **observation_kwargs
    )
Create a new observation of the specified type.
This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation
- status_message: Optional status message for the observation
- completion_start_time: When the model started generating (for generation types)
- model: Name/identifier of the AI model used (for generation types)
- model_parameters: Parameters used for the model (for generation types)
- usage_details: Token usage information (for generation types)
- cost_details: Cost information (for generation types)
- prompt: Associated prompt template (for generation types)
Returns:
An observation object of the appropriate type that must be ended with .end()
931 def start_as_current_observation( 932 self, 933 *, 934 trace_context: Optional[TraceContext] = None, 935 name: str, 936 as_type: ObservationTypeLiteralNoEvent = "span", 937 input: Optional[Any] = None, 938 output: Optional[Any] = None, 939 metadata: Optional[Any] = None, 940 version: Optional[str] = None, 941 level: Optional[SpanLevel] = None, 942 status_message: Optional[str] = None, 943 completion_start_time: Optional[datetime] = None, 944 model: Optional[str] = None, 945 model_parameters: Optional[Dict[str, MapValue]] = None, 946 usage_details: Optional[Dict[str, int]] = None, 947 cost_details: Optional[Dict[str, float]] = None, 948 prompt: Optional[PromptClient] = None, 949 end_on_exit: Optional[bool] = None, 950 ) -> Union[ 951 _AgnosticContextManager[LangfuseGeneration], 952 _AgnosticContextManager[LangfuseSpan], 953 _AgnosticContextManager[LangfuseAgent], 954 _AgnosticContextManager[LangfuseTool], 955 _AgnosticContextManager[LangfuseChain], 956 _AgnosticContextManager[LangfuseRetriever], 957 _AgnosticContextManager[LangfuseEvaluator], 958 _AgnosticContextManager[LangfuseEmbedding], 959 _AgnosticContextManager[LangfuseGuardrail], 960 ]: 961 """Create a new observation and set it as the current span in a context manager. 962 963 This method creates a new observation of the specified type and sets it as the 964 current span within a context manager. Use this method with a 'with' statement to 965 automatically handle the observation lifecycle within a code block. 966 967 The created observation will be the child of the current span in the context. 
968 969 Args: 970 trace_context: Optional context for connecting to an existing trace 971 name: Name of the observation (e.g., function or operation name) 972 as_type: Type of observation to create (defaults to "span") 973 input: Input data for the operation (can be any JSON-serializable object) 974 output: Output data from the operation (can be any JSON-serializable object) 975 metadata: Additional metadata to associate with the observation 976 version: Version identifier for the code or component 977 level: Importance level of the observation (info, warning, error) 978 status_message: Optional status message for the observation 979 end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks. 980 981 The following parameters are available when as_type is: "generation" or "embedding". 982 completion_start_time: When the model started generating the response 983 model: Name/identifier of the AI model used (e.g., "gpt-4") 984 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 985 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 986 cost_details: Cost information for the model call 987 prompt: Associated prompt template from Langfuse prompt management 988 989 Returns: 990 A context manager that yields the appropriate observation type based on as_type 991 992 Example: 993 ```python 994 # Create a span 995 with langfuse.start_as_current_observation(name="process-query", as_type="span") as span: 996 # Do work 997 result = process_data() 998 span.update(output=result) 999 1000 # Create a child span automatically 1001 with span.start_as_current_observation(name="sub-operation") as child_span: 1002 # Do sub-operation work 1003 child_span.update(output="sub-result") 1004 1005 # Create a tool observation 1006 with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool: 1007 # Do tool work 
1008 results = search_web(query) 1009 tool.update(output=results) 1010 1011 # Create a generation observation 1012 with langfuse.start_as_current_observation( 1013 name="answer-generation", 1014 as_type="generation", 1015 model="gpt-4" 1016 ) as generation: 1017 # Generate answer 1018 response = llm.generate(...) 1019 generation.update(output=response) 1020 ``` 1021 """ 1022 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 1023 if trace_context: 1024 trace_id = trace_context.get("trace_id", None) 1025 parent_span_id = trace_context.get("parent_span_id", None) 1026 1027 if trace_id: 1028 remote_parent_span = self._create_remote_parent_span( 1029 trace_id=trace_id, parent_span_id=parent_span_id 1030 ) 1031 1032 return cast( 1033 Union[ 1034 _AgnosticContextManager[LangfuseGeneration], 1035 _AgnosticContextManager[LangfuseEmbedding], 1036 ], 1037 self._create_span_with_parent_context( 1038 as_type=as_type, 1039 name=name, 1040 remote_parent_span=remote_parent_span, 1041 parent=None, 1042 end_on_exit=end_on_exit, 1043 input=input, 1044 output=output, 1045 metadata=metadata, 1046 version=version, 1047 level=level, 1048 status_message=status_message, 1049 completion_start_time=completion_start_time, 1050 model=model, 1051 model_parameters=model_parameters, 1052 usage_details=usage_details, 1053 cost_details=cost_details, 1054 prompt=prompt, 1055 ), 1056 ) 1057 1058 return cast( 1059 Union[ 1060 _AgnosticContextManager[LangfuseGeneration], 1061 _AgnosticContextManager[LangfuseEmbedding], 1062 ], 1063 self._start_as_current_otel_span_with_processed_media( 1064 as_type=as_type, 1065 name=name, 1066 end_on_exit=end_on_exit, 1067 input=input, 1068 output=output, 1069 metadata=metadata, 1070 version=version, 1071 level=level, 1072 status_message=status_message, 1073 completion_start_time=completion_start_time, 1074 model=model, 1075 model_parameters=model_parameters, 1076 usage_details=usage_details, 1077 cost_details=cost_details, 1078 prompt=prompt, 
1079 ), 1080 ) 1081 1082 if as_type in get_observation_types_list(ObservationTypeSpanLike): 1083 if trace_context: 1084 trace_id = trace_context.get("trace_id", None) 1085 parent_span_id = trace_context.get("parent_span_id", None) 1086 1087 if trace_id: 1088 remote_parent_span = self._create_remote_parent_span( 1089 trace_id=trace_id, parent_span_id=parent_span_id 1090 ) 1091 1092 return cast( 1093 Union[ 1094 _AgnosticContextManager[LangfuseSpan], 1095 _AgnosticContextManager[LangfuseAgent], 1096 _AgnosticContextManager[LangfuseTool], 1097 _AgnosticContextManager[LangfuseChain], 1098 _AgnosticContextManager[LangfuseRetriever], 1099 _AgnosticContextManager[LangfuseEvaluator], 1100 _AgnosticContextManager[LangfuseGuardrail], 1101 ], 1102 self._create_span_with_parent_context( 1103 as_type=as_type, 1104 name=name, 1105 remote_parent_span=remote_parent_span, 1106 parent=None, 1107 end_on_exit=end_on_exit, 1108 input=input, 1109 output=output, 1110 metadata=metadata, 1111 version=version, 1112 level=level, 1113 status_message=status_message, 1114 ), 1115 ) 1116 1117 return cast( 1118 Union[ 1119 _AgnosticContextManager[LangfuseSpan], 1120 _AgnosticContextManager[LangfuseAgent], 1121 _AgnosticContextManager[LangfuseTool], 1122 _AgnosticContextManager[LangfuseChain], 1123 _AgnosticContextManager[LangfuseRetriever], 1124 _AgnosticContextManager[LangfuseEvaluator], 1125 _AgnosticContextManager[LangfuseGuardrail], 1126 ], 1127 self._start_as_current_otel_span_with_processed_media( 1128 as_type=as_type, 1129 name=name, 1130 end_on_exit=end_on_exit, 1131 input=input, 1132 output=output, 1133 metadata=metadata, 1134 version=version, 1135 level=level, 1136 status_message=status_message, 1137 ), 1138 ) 1139 1140 # This should never be reached since all valid types are handled above 1141 langfuse_logger.warning( 1142 f"Unknown observation type: {as_type}, falling back to span" 1143 ) 1144 return self._start_as_current_otel_span_with_processed_media( 1145 as_type="span", 1146 
name=name, 1147 end_on_exit=end_on_exit, 1148 input=input, 1149 output=output, 1150 metadata=metadata, 1151 version=version, 1152 level=level, 1153 status_message=status_message, 1154 )
Create a new observation and set it as the current span in a context manager.
This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.
The created observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation (e.g., function or operation name)
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation (info, warning, error)
- status_message: Optional status message for the observation
- end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
The following parameters apply when as_type is "generation" or "embedding":
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Returns:
A context manager that yields the appropriate observation type based on as_type
Example:
```python
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_observation(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
```
def update_current_generation(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> None:
    """Attach new information to the generation that is active in the current context.

    Wraps the currently active OTel span in a ``LangfuseGeneration`` and forwards
    the given fields to its ``update`` method. Handy for recording output, token
    usage or cost once the model call has finished. Does nothing when tracing is
    disabled or when no span is active.

    Args:
        name: New name for the generation; applied only when truthy
        input: Updated input data for the model
        output: Output from the model (e.g., completions)
        metadata: Additional metadata to associate with the generation
        version: Version identifier for the model or component
        level: Importance level of the generation (info, warning, error)
        status_message: Optional status message for the generation
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management

    Example:
        ```python
        with langfuse.start_as_current_generation(name="answer-query") as generation:
            response = llm.generate(...)

            # Record what was not known when the generation was started
            langfuse.update_current_generation(
                output=response.text,
                usage_details={
                    "prompt_tokens": response.usage.prompt_tokens,
                    "completion_tokens": response.usage.completion_tokens
                }
            )
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None:
        # Nothing to update outside of an active span.
        return

    wrapped = LangfuseGeneration(otel_span=active_span, langfuse_client=self)

    # The name lives on the OTel span itself, not in the update payload.
    if name:
        active_span.update_name(name)

    changes = {
        "input": input,
        "output": output,
        "metadata": metadata,
        "version": version,
        "level": level,
        "status_message": status_message,
        "completion_start_time": completion_start_time,
        "model": model,
        "model_parameters": model_parameters,
        "usage_details": usage_details,
        "cost_details": cost_details,
        "prompt": prompt,
    }
    wrapped.update(**changes)
Update the current active generation span with new information.
This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.
Arguments:
- name: The generation name
- input: Updated input data for the model
- output: Output from the model (e.g., completions)
- metadata: Additional metadata to associate with the generation
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Example:
```python
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
```
def update_current_span(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> None:
    """Attach new information to the span that is active in the current context.

    Looks up the wrapper class matching the active span's observation type,
    wraps the span with the client's environment and release, and forwards the
    given fields to the wrapper's ``update`` method. Does nothing when tracing
    is disabled or when no span is active.

    Args:
        name: New name for the span; applied only when truthy
        input: Updated input data for the operation
        output: Output data from the operation
        metadata: Additional metadata to associate with the span
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-data") as span:
            result = process_first_part()

            # Record an intermediate result
            langfuse.update_current_span(metadata={"intermediate_result": result})

            final_result = process_second_part(result)

            # Record the final output
            langfuse.update_current_span(output=final_result)
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None:
        # Nothing to update outside of an active span.
        return

    # Pick the wrapper class that matches the span's recorded observation type.
    observation_type = self._get_observation_type_from_otel_span(active_span)
    wrapper_cls = self._get_span_class(observation_type)
    wrapped = wrapper_cls(
        otel_span=active_span,
        langfuse_client=self,
        environment=self._environment,
        release=self._release,
    )

    # The name lives on the OTel span itself, not in the update payload.
    if name:
        active_span.update_name(name)

    changes = {
        "input": input,
        "output": output,
        "metadata": metadata,
        "version": version,
        "level": level,
        "status_message": status_message,
    }
    wrapped.update(**changes)
Update the current active span with new information.
This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.
Arguments:
- name: The span name
- input: Updated input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Example:
```python
with langfuse.start_as_current_observation(name="process-data") as span:
    # Initial processing
    result = process_first_part()

    # Update with intermediate results
    langfuse.update_current_span(metadata={"intermediate_result": result})

    # Continue processing
    final_result = process_second_part(result)

    # Final update
    langfuse.update_current_span(output=final_result)
```
@deprecated(
    "Trace-level input/output is deprecated. "
    "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
    "This method will be removed in a future major version."
)
def set_current_trace_io(
    self,
    *,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
) -> None:
    """Set trace-level input and output on the trace of the currently active span.

    Deprecated: this legacy method exists for backward compatibility with
    Langfuse platform features that still rely on trace-level input/output
    (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future
    major version. For other trace attributes (user_id, session_id, metadata,
    tags, version), use :meth:`propagate_attributes` instead.

    Does nothing when tracing is disabled, when no span is active, or when the
    active span is not recording.

    Args:
        input: Input data to associate with the trace.
        output: Output data to associate with the trace.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None or not active_span.is_recording():
        return

    # Pick the wrapper class that matches the span's recorded observation type.
    observation_type = self._get_observation_type_from_otel_span(active_span)
    wrapper_cls = self._get_span_class(observation_type)
    wrapped = wrapper_cls(
        otel_span=active_span,
        langfuse_client=self,
        environment=self._environment,
        release=self._release,
    )

    wrapped.set_trace_io(input=input, output=output)
Set trace-level input and output for the current span's trace.
Deprecated: This is a legacy method for backward compatibility with Langfuse platform features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future major version.
For setting other trace attributes (user_id, session_id, metadata, tags, version),
use propagate_attributes() instead.
Arguments:
- input: Input data to associate with the trace.
- output: Output data to associate with the trace.
def set_current_trace_as_public(self) -> None:
    """Publish the trace of the currently active span so it is viewable via its URL.

    Once a trace is published, anyone holding the trace link can view the whole
    trace without being logged in to Langfuse. This cannot be undone
    programmatically.

    This is a convenience for code running inside a traced function that has
    no direct handle on the span object. Does nothing when tracing is disabled,
    when no span is active, or when the active span is not recording.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None or not active_span.is_recording():
        return

    # Pick the wrapper class that matches the span's recorded observation type.
    observation_type = self._get_observation_type_from_otel_span(active_span)
    wrapper_cls = self._get_span_class(observation_type)
    # NOTE(review): unlike update_current_span / set_current_trace_io, no
    # `release` is passed here - confirm whether that is intentional.
    wrapped = wrapper_cls(
        otel_span=active_span,
        langfuse_client=self,
        environment=self._environment,
    )

    wrapped.set_trace_as_public()
Make the current trace publicly accessible via its URL.
When a trace is published, anyone with the trace link can view the full trace without needing to be logged in to Langfuse. This action cannot be undone programmatically - once published, the entire trace becomes public.
This is a convenience method that publishes the trace from the currently active span context. Use this when you want to make a trace public from within a traced function without needing direct access to the span object.
def create_event(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> LangfuseEvent:
    """Create a new Langfuse observation of type 'EVENT'.

    The event's OTel span is started and ended with the same timestamp. When
    ``trace_context`` carries a ``trace_id``, the event is started under a
    remote parent span built from that context and flagged as root; otherwise
    it becomes a child of the current span in the context.

    Args:
        trace_context: Optional context for connecting to an existing trace
        name: Name of the event (e.g., function or operation name)
        input: Input data for the operation (can be any JSON-serializable object)
        output: Output data from the operation (can be any JSON-serializable object)
        metadata: Additional metadata to associate with the event
        version: Version identifier for the code or component
        level: Importance level of the event (info, warning, error)
        status_message: Optional status message for the event

    Returns:
        The Langfuse Event object

    Example:
        ```python
        event = langfuse.create_event(name="process-event")
        ```
    """
    timestamp = time_ns()

    def _finish(event_span: Any) -> LangfuseEvent:
        # Wrap the started OTel span and end it at its own start time.
        event = LangfuseEvent(
            otel_span=event_span,
            langfuse_client=self,
            environment=self._environment,
            release=self._release,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
        )
        return cast(LangfuseEvent, event.end(end_time=timestamp))

    if trace_context:
        trace_id = trace_context.get("trace_id", None)
        parent_span_id = trace_context.get("parent_span_id", None)

        if trace_id:
            remote_parent = self._create_remote_parent_span(
                trace_id=trace_id, parent_span_id=parent_span_id
            )

            # Start (and wrap) the event while the remote parent is current.
            with otel_trace_api.use_span(cast(otel_trace_api.Span, remote_parent)):
                rooted_span = self._otel_tracer.start_span(
                    name=name, start_time=timestamp
                )
                rooted_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                return _finish(rooted_span)

    # No usable trace context: parent is whatever span is current.
    return _finish(self._otel_tracer.start_span(name=name, start_time=timestamp))
Create a new Langfuse observation of type 'EVENT'.
The created Langfuse Event observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the span (e.g., function or operation name)
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Returns:
The Langfuse Event object
Example:
event = langfuse.create_event(name="process-event")
1758 @staticmethod 1759 def create_trace_id(*, seed: Optional[str] = None) -> str: 1760 """Create a unique trace ID for use with Langfuse. 1761 1762 This method generates a unique trace ID for use with various Langfuse APIs. 1763 It can either generate a random ID or create a deterministic ID based on 1764 a seed string. 1765 1766 Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. 1767 This method ensures the generated ID meets this requirement. If you need to 1768 correlate an external ID with a Langfuse trace ID, use the external ID as the 1769 seed to get a valid, deterministic Langfuse trace ID. 1770 1771 Args: 1772 seed: Optional string to use as a seed for deterministic ID generation. 1773 If provided, the same seed will always produce the same ID. 1774 If not provided, a random ID will be generated. 1775 1776 Returns: 1777 A 32-character lowercase hexadecimal string representing the Langfuse trace ID. 1778 1779 Example: 1780 ```python 1781 # Generate a random trace ID 1782 trace_id = langfuse.create_trace_id() 1783 1784 # Generate a deterministic ID based on a seed 1785 session_trace_id = langfuse.create_trace_id(seed="session-456") 1786 1787 # Correlate an external ID with a Langfuse trace ID 1788 external_id = "external-system-123456" 1789 correlated_trace_id = langfuse.create_trace_id(seed=external_id) 1790 1791 # Use the ID with trace context 1792 with langfuse.start_as_current_observation( 1793 name="process-request", 1794 trace_context={"trace_id": trace_id} 1795 ) as span: 1796 # Operation will be part of the specific trace 1797 pass 1798 ``` 1799 """ 1800 if not seed: 1801 trace_id_int = RandomIdGenerator().generate_trace_id() 1802 1803 return Langfuse._format_otel_trace_id(trace_id_int) 1804 1805 return sha256(seed.encode("utf-8")).digest()[:16].hex()
Create a unique trace ID for use with Langfuse.
This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.
Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.
Arguments:
- seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:
A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
Example:
```python
# Generate a random trace ID
trace_id = langfuse.create_trace_id()

# Generate a deterministic ID based on a seed
session_trace_id = langfuse.create_trace_id(seed="session-456")

# Correlate an external ID with a Langfuse trace ID
external_id = "external-system-123456"
correlated_trace_id = langfuse.create_trace_id(seed=external_id)

# Use the ID with trace context
with langfuse.start_as_current_observation(
    name="process-request",
    trace_context={"trace_id": trace_id}
) as span:
    # Operation will be part of the specific trace
    pass
```
def create_score(
    self,
    *,
    name: str,
    value: Union[float, str],
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
    environment: Optional[str] = None,
) -> None:
    """Create a score for a specific trace or observation.

    Builds a ``score-create`` event and hands it to the client's resources for
    delivery. Scores can track quality metrics, user feedback, or automated
    evaluations. Returns immediately when tracing is disabled; failures while
    building or queueing the event are logged rather than raised.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        session_id: ID of the Langfuse session to associate the score with
        dataset_run_id: ID of the Langfuse dataset run to associate the score with
        trace_id: ID of the Langfuse trace to associate the score with
        observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score
        timestamp: Optional timestamp for the score (defaults to current UTC time)
        environment: Optional environment override for this score. If omitted,
            the score uses the client-level environment from
            `Langfuse(environment=...)` or `LANGFUSE_TRACING_ENVIRONMENT`.
            Langfuse observation wrapper methods pass their resolved span
            environment here so scores created via `span.score()` or
            `span.score_trace()` stay grouped with the scored observation or
            trace, including request-scoped environments propagated with
            `propagate_attributes(environment=...)`.

    Example:
        ```python
        # Numeric score for accuracy
        langfuse.create_score(
            name="accuracy",
            value=0.92,
            trace_id="abcdef1234567890abcdef1234567890",
            data_type="NUMERIC",
            comment="High accuracy with minor irrelevant details"
        )

        # Categorical score for sentiment
        langfuse.create_score(
            name="sentiment",
            value="positive",
            trace_id="abcdef1234567890abcdef1234567890",
            observation_id="abcdef1234567890",
            data_type="CATEGORICAL"
        )
        ```
    """
    if not self._tracing_enabled:
        return

    score_id = score_id or self._create_observation_id()

    try:
        score_body = ScoreBody(
            id=score_id,
            sessionId=session_id,
            datasetRunId=dataset_run_id,
            traceId=trace_id,
            observationId=observation_id,
            name=name,
            value=value,
            dataType=data_type,  # type: ignore
            comment=comment,
            configId=config_id,
            environment=environment or self._environment,
            metadata=metadata,
        )

        score_event = {
            "id": self.create_trace_id(),
            "type": "score-create",
            "timestamp": timestamp or _get_timestamp(),
            "body": score_body,
        }

        if self._resources is not None:
            # Scores for legacy trace IDs (not 32 hex chars), or without any
            # trace ID, are always forced into the sample.
            if trace_id:
                force_sample = not self._is_valid_trace_id(trace_id)
            else:
                force_sample = True

            self._resources.add_score_task(score_event, force_sample=force_sample)

    except Exception as e:
        langfuse_logger.exception(
            f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
        )
Create a score for a specific trace or observation.
This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
- session_id: ID of the Langfuse session to associate the score with
- dataset_run_id: ID of the Langfuse dataset run to associate the score with
- trace_id: ID of the Langfuse trace to associate the score with
- observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
- timestamp: Optional timestamp for the score (defaults to current UTC time)
- environment: Optional environment override for this score. If omitted,
the score uses the client-level environment from
`Langfuse(environment=...)` or `LANGFUSE_TRACING_ENVIRONMENT`. Langfuse observation wrapper methods pass their resolved span environment here so scores created via `span.score()` or `span.score_trace()` stay grouped with the scored observation or trace, including request-scoped environments propagated with `propagate_attributes(environment=...)`.
Example:
```python
# Create a numeric score for accuracy
langfuse.create_score(
    name="accuracy",
    value=0.92,
    trace_id="abcdef1234567890abcdef1234567890",
    data_type="NUMERIC",
    comment="High accuracy with minor irrelevant details"
)

# Create a categorical score for sentiment
langfuse.create_score(
    name="sentiment",
    value="positive",
    trace_id="abcdef1234567890abcdef1234567890",
    observation_id="abcdef1234567890",
    data_type="CATEGORICAL"
)
```
def score_current_span(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Create a score for the span that is active in the current context.

    Reads the trace ID and span ID from the active span and delegates to
    ``create_score``, so the caller does not need to know either ID. The
    span's `langfuse.environment` attribute, including one set by
    `propagate_attributes(environment=...)`, is passed along as the score
    environment; without it the client-level environment is used. Does nothing
    when no span is active.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_generation(name="answer-query") as generation:
            response = generate_answer(...)
            generation.update(output=response)

            # Score the generation that is currently active
            langfuse.score_current_span(
                name="relevance",
                value=0.85,
                data_type="NUMERIC",
                comment="Mostly relevant but contains some tangential information",
                metadata={"model": "gpt-4", "prompt_version": "v2"}
            )
        ```
    """
    active_span = self._get_current_otel_span()
    if active_span is None:
        # Nothing to score outside of an active span.
        return

    trace_id = self._get_otel_trace_id(active_span)
    observation_id = self._get_otel_span_id(active_span)

    langfuse_logger.info(
        f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
    )

    span_environment = get_string_span_attribute(
        active_span, LangfuseOtelSpanAttributes.ENVIRONMENT
    )

    # The casts only narrow the static types for create_score; the runtime
    # values are passed through unchanged.
    self.create_score(
        trace_id=trace_id,
        observation_id=observation_id,
        name=name,
        value=cast(str, value),
        score_id=score_id,
        data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
        comment=comment,
        config_id=config_id,
        metadata=metadata,
        environment=span_environment,
    )
Create a score for the current active span.
This method scores the currently active span in the context. It's a convenient
way to score the current operation without needing to know its trace and span IDs.
If the active span has a langfuse.environment attribute, including one
set by propagate_attributes(environment=...), the score uses that
environment. Otherwise it uses the client-level environment.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation: # Generate answer response = generate_answer(...) generation.update(output=response) # Score the generation langfuse.score_current_span( name="relevance", value=0.85, data_type="NUMERIC", comment="Mostly relevant but contains some tangential information", metadata={"model": "gpt-4", "prompt_version": "v2"} )
def score_current_trace(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Attach a score to the trace that owns the currently active span.

    In contrast to `score_current_span`, no observation ID is passed along, so
    the score is associated with the whole trace instead of a single span.
    This suits overall quality or performance ratings of an operation. When
    the active span carries a `langfuse.environment` attribute (including one
    set through `propagate_attributes(environment=...)`), the score is created
    with that environment; otherwise the client-level environment applies.

    If there is no active span, the method returns without creating a score.

    Args:
        name: Name of the score (e.g., "user_satisfaction", "overall_quality")
        value: Score value (numeric for NUMERIC/BOOLEAN types, string for CATEGORICAL/TEXT/CORRECTION)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-user-request") as span:
            # Process request
            result = process_complete_request()
            span.update(output=result)

            # Score the overall trace
            langfuse.score_current_trace(
                name="overall_quality",
                value=0.95,
                data_type="NUMERIC",
                comment="High quality end-to-end response",
                metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
            )
        ```
    """
    active_span = self._get_current_otel_span()
    if active_span is None:
        # Without an active span there is no trace to attach the score to.
        return

    trace_id = self._get_otel_trace_id(active_span)

    langfuse_logger.info(
        f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
    )

    # Resolved after logging so the log line is emitted before the span
    # attribute is read.
    span_environment = get_string_span_attribute(
        active_span, LangfuseOtelSpanAttributes.ENVIRONMENT
    )

    # The cast() calls only narrow types for the static checker (presumably to
    # satisfy create_score's typed signature); values pass through unchanged.
    self.create_score(
        trace_id=trace_id,
        name=name,
        value=cast(str, value),
        score_id=score_id,
        data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
        comment=comment,
        config_id=config_id,
        metadata=metadata,
        environment=span_environment,
    )
Create a score for the current trace.
This method scores the trace of the currently active span. Unlike score_current_span,
this method associates the score with the entire trace rather than a specific span.
It's useful for scoring overall performance or quality of the entire operation.
If the active span has a langfuse.environment attribute, including one
set by propagate_attributes(environment=...), the score uses that
environment. Otherwise it uses the client-level environment.
Arguments:
- name: Name of the score (e.g., "user_satisfaction", "overall_quality")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_observation(name="process-user-request") as span: # Process request result = process_complete_request() span.update(output=result) # Score the overall trace langfuse.score_current_trace( name="overall_quality", value=0.95, data_type="NUMERIC", comment="High quality end-to-end response", metadata={"evaluator": "gpt-4", "criteria": "comprehensive"} )
def flush(self) -> None:
    """Send all pending spans, scores, and other events to the Langfuse API now.

    Use this when data must be delivered before the program continues, rather
    than waiting for the automatic flush interval. The call is a no-op when
    the client holds no resources (`self._resources` is None).

    Example:
        ```python
        # Record some spans and scores
        with langfuse.start_as_current_observation(name="operation") as span:
            # Do work...
            pass

        # Ensure all data is sent to Langfuse before proceeding
        langfuse.flush()

        # Continue with other work
        ```
    """
    resources = self._resources
    if resources is None:
        return

    resources.flush()
Force flush all pending spans and events to the Langfuse API.
This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.
Example:
# Record some spans and scores with langfuse.start_as_current_observation(name="operation") as span: # Do work... pass # Ensure all data is sent to Langfuse before proceeding langfuse.flush() # Continue with other work
def shutdown(self) -> None:
    """Shut the Langfuse client down, flushing any pending data first.

    Pending data is flushed to the API and background threads are terminated
    by delegating to the client's resources. Call this when the application
    exits to avoid data loss and resource leaks; for most applications, using
    the client as a context manager or relying on the automatic shutdown via
    atexit is sufficient. The call is a no-op when the client holds no
    resources (`self._resources` is None).

    Example:
        ```python
        # Initialize Langfuse
        langfuse = Langfuse(public_key="...", secret_key="...")

        # Use Langfuse throughout your application
        # ...

        # When application is shutting down
        langfuse.shutdown()
        ```
    """
    resources = self._resources
    if resources is None:
        return

    resources.shutdown()
Shut down the Langfuse client and flush all pending data.
This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.
It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.
Example:
# Initialize Langfuse langfuse = Langfuse(public_key="...", secret_key="...") # Use Langfuse throughout your application # ... # When application is shutting down langfuse.shutdown()
def get_current_trace_id(self) -> Optional[str]:
    """Return the trace ID of the span that is currently active.

    The ID is read from the active span in the context and can be used for
    log correlation, for handing over to external systems, or for creating
    related operations.

    Returns:
        The current trace ID as a 32-character lowercase hexadecimal string,
        or None if there is no active span or tracing is disabled.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-request") as span:
            # Get the current trace ID for reference
            trace_id = langfuse.get_current_trace_id()

            # Use it for external correlation
            log.info(f"Processing request with trace_id: {trace_id}")

            # Or pass to another system
            external_system.process(data, trace_id=trace_id)
        ```
    """
    if self._tracing_enabled:
        active_span = self._get_current_otel_span()
        if active_span:
            return self._get_otel_trace_id(active_span)
        return None

    langfuse_logger.debug(
        "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
    )
    return None
Get the trace ID of the current active span.
This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.
Returns:
The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.
Example:
with langfuse.start_as_current_observation(name="process-request") as span: # Get the current trace ID for reference trace_id = langfuse.get_current_trace_id() # Use it for external correlation log.info(f"Processing request with trace_id: {trace_id}") # Or pass to another system external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
    """Return the observation ID (span ID) of the span that is currently active.

    The ID is read from the active span in the context and can be used for
    log correlation, for handing over to external systems, or for creating
    scores and other related operations.

    Returns:
        The current observation ID as a 16-character lowercase hexadecimal string,
        or None if there is no active span or tracing is disabled.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-user-query") as span:
            # Get the current observation ID
            observation_id = langfuse.get_current_observation_id()

            # Store it for later reference
            cache.set(f"query_{query_id}_observation", observation_id)

            # Process the query...
        ```
    """
    if self._tracing_enabled:
        active_span = self._get_current_otel_span()
        if active_span:
            return self._get_otel_span_id(active_span)
        return None

    langfuse_logger.debug(
        "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
    )
    return None
Get the observation ID (span ID) of the current active span.
This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.
Returns:
The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.
Example:
with langfuse.start_as_current_observation(name="process-user-query") as span: # Get the current observation ID observation_id = langfuse.get_current_observation_id() # Store it for later reference cache.set(f"query_{query_id}_observation", observation_id) # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
    """Build the URL under which a trace can be viewed in the Langfuse UI.

    Handy for putting direct links into logs, notifications, or debugging
    tools.

    Args:
        trace_id: Optional trace ID to generate a URL for. If not provided,
            the trace ID of the current active span will be used.

    Returns:
        A URL string pointing to the trace in the Langfuse UI,
        or None if the project ID couldn't be retrieved or no trace ID is available.

    Example:
        ```python
        # Get URL for the current trace
        with langfuse.start_as_current_observation(name="process-request") as span:
            trace_url = langfuse.get_trace_url()
            log.info(f"Processing trace: {trace_url}")

        # Get URL for a specific trace
        specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
        send_notification(f"Review needed for trace: {specific_trace_url}")
        ```
    """
    # An explicit trace_id wins; the active span is consulted only as fallback.
    final_trace_id = trace_id or self.get_current_trace_id()
    if not final_trace_id:
        return None

    # The project ID is looked up only once a trace ID is known.
    project_id = self._get_project_id()
    if not project_id:
        return None

    return f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
Get the URL to view a trace in the Langfuse UI.
This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.
Arguments:
- trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:
A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.
Example:
# Get URL for the current trace with langfuse.start_as_current_observation(name="process-request") as span: trace_url = langfuse.get_trace_url() log.info(f"Processing trace: {trace_url}") # Get URL for a specific trace specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef") send_notification(f"Review needed for trace: {specific_trace_url}")
def get_dataset(
    self,
    name: str,
    *,
    fetch_items_page_size: Optional[int] = 50,
    version: Optional[datetime] = None,
) -> "DatasetClient":
    """Fetch a dataset, including all of its items, by name.

    The dataset itself is fetched first; its items are then retrieved page by
    page until the API reports that the last page has been reached.

    Args:
        name: The name of the dataset to fetch.
        fetch_items_page_size: All items of the dataset will be fetched in chunks of this size. Defaults to 50.
        version: Retrieve dataset items as they existed at this specific point in time (UTC).
            If provided, returns the state of items at the specified UTC timestamp.
            If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.

    Returns:
        DatasetClient: The dataset with the given name.

    Raises:
        Error: API errors are passed to `handle_fern_exception` and then re-raised.
    """
    try:
        langfuse_logger.debug(f"Getting datasets {name}")
        dataset = self.api.datasets.get(dataset_name=self._url_encode(name))

        collected_items: List[DatasetItem] = []
        current_page = 0
        more_pages = True

        while more_pages:
            current_page += 1
            # NOTE(review): the items listing encodes the name with
            # is_url_param=True while the dataset fetch above does not;
            # this mirrors the original calls.
            batch = self.api.dataset_items.list(
                dataset_name=self._url_encode(name, is_url_param=True),
                page=current_page,
                limit=fetch_items_page_size,
                version=version,
            )
            for raw_item in batch.data:
                collected_items.append(
                    self._hydrate_dataset_item_media_references(raw_item)
                )

            # Stop once the page just fetched is the last one reported.
            more_pages = batch.meta.total_pages > current_page

        return DatasetClient(
            dataset=dataset,
            items=collected_items,
            version=version,
            langfuse_client=self,
        )

    except Error as e:
        handle_fern_exception(e)
        raise e
Fetch a dataset by its name.
Arguments:
- name: The name of the dataset to fetch.
- fetch_items_page_size: All items of the dataset will be fetched in chunks of this size. Defaults to 50.
- version: Retrieve dataset items as they existed at this specific point in time (UTC). If provided, returns the state of items at the specified UTC timestamp. If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
Returns:
DatasetClient: The dataset with the given name.
def get_dataset_run(
    self, *, dataset_name: str, run_name: str
) -> DatasetRunWithItems:
    """Fetch a single dataset run, identified by dataset name and run name.

    Both names are URL-encoded before being handed to the API client.

    Args:
        dataset_name (str): The name of the dataset.
        run_name (str): The name of the run.

    Returns:
        DatasetRunWithItems: The dataset run with its items.

    Raises:
        Error: API errors are passed to `handle_fern_exception` and then re-raised.
    """
    try:
        run = self.api.datasets.get_run(
            dataset_name=self._url_encode(dataset_name),
            run_name=self._url_encode(run_name),
            request_options=None,
        )
    except Error as e:
        handle_fern_exception(e)
        raise e

    # cast() is a no-op at runtime; it only informs the type checker.
    return cast(DatasetRunWithItems, run)
Fetch a dataset run by dataset name and run name.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DatasetRunWithItems: The dataset run with its items.
def get_dataset_runs(
    self,
    *,
    dataset_name: str,
    page: Optional[int] = None,
    limit: Optional[int] = None,
) -> PaginatedDatasetRuns:
    """Fetch one page of the runs that belong to a dataset.

    The dataset name is URL-encoded before being handed to the API client;
    `page` and `limit` are forwarded unchanged.

    Args:
        dataset_name (str): The name of the dataset.
        page (Optional[int]): Page number, starts at 1.
        limit (Optional[int]): Limit of items per page.

    Returns:
        PaginatedDatasetRuns: Paginated list of dataset runs.

    Raises:
        Error: API errors are passed to `handle_fern_exception` and then re-raised.
    """
    try:
        runs = self.api.datasets.get_runs(
            dataset_name=self._url_encode(dataset_name),
            page=page,
            limit=limit,
            request_options=None,
        )
    except Error as e:
        handle_fern_exception(e)
        raise e

    # cast() is a no-op at runtime; it only informs the type checker.
    return cast(PaginatedDatasetRuns, runs)
Fetch all runs for a dataset.
Arguments:
- dataset_name (str): The name of the dataset.
- page (Optional[int]): Page number, starts at 1.
- limit (Optional[int]): Limit of items per page.
Returns:
PaginatedDatasetRuns: Paginated list of dataset runs.
def delete_dataset_run(
    self, *, dataset_name: str, run_name: str
) -> DeleteDatasetRunResponse:
    """Delete a dataset run together with all of its run items.

    This action is irreversible. Both names are URL-encoded before being
    handed to the API client.

    Args:
        dataset_name (str): The name of the dataset.
        run_name (str): The name of the run.

    Returns:
        DeleteDatasetRunResponse: Confirmation of deletion.

    Raises:
        Error: API errors are passed to `handle_fern_exception` and then re-raised.
    """
    try:
        response = self.api.datasets.delete_run(
            dataset_name=self._url_encode(dataset_name),
            run_name=self._url_encode(run_name),
            request_options=None,
        )
    except Error as e:
        handle_fern_exception(e)
        raise e

    # cast() is a no-op at runtime; it only informs the type checker.
    return cast(DeleteDatasetRunResponse, response)
Delete a dataset run and all its run items. This action is irreversible.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DeleteDatasetRunResponse: Confirmation of deletion.
def run_experiment(
    self,
    *,
    name: str,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    data: ExperimentData,
    task: TaskFunction,
    evaluators: Optional[List[EvaluatorFunction]] = None,
    composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
    run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
    max_concurrency: int = 50,
    metadata: Optional[Dict[str, str]] = None,
    _dataset_version: Optional[datetime] = None,
) -> ExperimentResult:
    """Run an experiment on a dataset with automatic tracing and evaluation.

    This method executes a task function on each item in the provided dataset,
    automatically traces all executions with Langfuse for observability, runs
    item-level and run-level evaluators on the outputs, and returns comprehensive
    results with evaluation metrics.

    The experiment system provides:
    - Automatic tracing of all task executions
    - Concurrent processing with configurable limits
    - Comprehensive error handling that isolates failures
    - Integration with Langfuse datasets for experiment tracking
    - Flexible evaluation framework supporting both sync and async evaluators

    Args:
        name: Human-readable name for the experiment. Used for identification
            in the Langfuse UI.
        run_name: Optional exact name for the experiment run. If provided, this will be
            used as the exact dataset run name if the `data` contains Langfuse dataset items.
            If not provided, this will default to the experiment name appended with an ISO timestamp.
        description: Optional description explaining the experiment's purpose,
            methodology, or expected outcomes.
        data: Array of data items to process. Can be either:
            - List of dict-like items with 'input', 'expected_output', 'metadata' keys
            - List of Langfuse DatasetItem objects from dataset.items
        task: Function that processes each data item and returns output.
            Must accept 'item' as keyword argument and can return sync or async results.
            The task function signature should be: task(*, item, **kwargs) -> Any
        evaluators: Optional list of functions to evaluate each item's output individually.
            Each evaluator receives input, output, expected_output, and metadata.
            Can return single Evaluation dict or list of Evaluation dicts.
            None (the default) and an empty list both mean "no item-level evaluators".
        composite_evaluator: Optional function that creates composite scores from item-level evaluations.
            Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
            plus the list of evaluations from item-level evaluators. Useful for weighted averages,
            pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
        run_evaluators: Optional list of functions to evaluate the entire experiment run.
            Each run evaluator receives all item_results and can compute aggregate metrics.
            Useful for calculating averages, distributions, or cross-item comparisons.
            None (the default) and an empty list both mean "no run-level evaluators".
        max_concurrency: Maximum number of concurrent task executions (default: 50).
            Controls the number of items processed simultaneously. Adjust based on
            API rate limits and system resources.
        metadata: Optional metadata dictionary to attach to all experiment traces.
            This metadata will be included in every trace created during the experiment.
            If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
        _dataset_version: Internal parameter, forwarded unchanged to the async
            runner as `dataset_version`. Not part of the public API
            (presumably supplied by dataset-based runs - confirm against callers).

    Returns:
        ExperimentResult containing:
        - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
        - item_results: List of results for each processed item with outputs and evaluations
        - run_evaluations: List of aggregate evaluation results for the entire run
        - experiment_id: Stable identifier for the experiment run across all items
        - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
        - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)

    Raises:
        ValueError: If required parameters are missing or invalid
        Exception: If experiment setup fails (individual item failures are handled gracefully)

    Examples:
        Basic experiment with local data:
        ```python
        def summarize_text(*, item, **kwargs):
            return f"Summary: {item['input'][:50]}..."

        def length_evaluator(*, input, output, expected_output=None, **kwargs):
            return {
                "name": "output_length",
                "value": len(output),
                "comment": f"Output contains {len(output)} characters"
            }

        result = langfuse.run_experiment(
            name="Text Summarization Test",
            description="Evaluate summarization quality and length",
            data=[
                {"input": "Long article text...", "expected_output": "Expected summary"},
                {"input": "Another article...", "expected_output": "Another summary"}
            ],
            task=summarize_text,
            evaluators=[length_evaluator]
        )

        print(f"Processed {len(result.item_results)} items")
        for item_result in result.item_results:
            print(f"Input: {item_result.item['input']}")
            print(f"Output: {item_result.output}")
            print(f"Evaluations: {item_result.evaluations}")
        ```

        Advanced experiment with async task and multiple evaluators:
        ```python
        async def llm_task(*, item, **kwargs):
            # Simulate async LLM call
            response = await openai_client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": item["input"]}]
            )
            return response.choices[0].message.content

        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
            if expected_output and expected_output.lower() in output.lower():
                return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
            return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

        def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
            # Simulate toxicity check
            toxicity_score = check_toxicity(output)  # Your toxicity checker
            return {
                "name": "toxicity",
                "value": toxicity_score,
                "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
            }

        def average_accuracy(*, item_results, **kwargs):
            accuracies = [
                eval.value for result in item_results
                for eval in result.evaluations
                if eval.name == "accuracy"
            ]
            return {
                "name": "average_accuracy",
                "value": sum(accuracies) / len(accuracies) if accuracies else 0,
                "comment": f"Average accuracy across {len(accuracies)} items"
            }

        result = langfuse.run_experiment(
            name="LLM Safety and Accuracy Test",
            description="Evaluate model accuracy and safety across diverse prompts",
            data=test_dataset,  # Your dataset items
            task=llm_task,
            evaluators=[accuracy_evaluator, toxicity_evaluator],
            run_evaluators=[average_accuracy],
            max_concurrency=5,  # Limit concurrent API calls
            metadata={"model": "gpt-4", "temperature": 0.7}
        )
        ```

        Using with Langfuse datasets:
        ```python
        # Get dataset from Langfuse
        dataset = langfuse.get_dataset("my-eval-dataset")

        result = dataset.run_experiment(
            name="Production Model Evaluation",
            description="Monthly evaluation of production model performance",
            task=my_production_task,
            evaluators=[accuracy_evaluator, latency_evaluator]
        )

        # Results automatically linked to dataset in Langfuse UI
        print(f"View results: {result.dataset_run_url}")
        ```

    Note:
        - Task and evaluator functions can be either synchronous or asynchronous
        - Individual item failures are logged but don't stop the experiment
        - All executions are automatically traced and visible in Langfuse UI
        - When using Langfuse datasets, results are automatically linked for easy comparison
        - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
        - Async execution is handled automatically with smart event loop detection
    """
    # The async implementation does the actual work; run_async_safely drives
    # the coroutine to completion from this synchronous entry point.
    return cast(
        ExperimentResult,
        run_async_safely(
            self._run_experiment_async(
                name=name,
                run_name=self._create_experiment_run_name(
                    name=name, run_name=run_name
                ),
                description=description,
                data=data,
                task=task,
                # None defaults avoid sharing one list object across calls;
                # normalize to a fresh empty list here.
                evaluators=evaluators or [],
                composite_evaluator=composite_evaluator,
                run_evaluators=run_evaluators or [],
                max_concurrency=max_concurrency,
                metadata=metadata,
                dataset_version=_dataset_version,
            ),
        ),
    )
Run an experiment on a dataset with automatic tracing and evaluation.
This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.
The experiment system provides:
- Automatic tracing of all task executions
- Concurrent processing with configurable limits
- Comprehensive error handling that isolates failures
- Integration with Langfuse datasets for experiment tracking
- Flexible evaluation framework supporting both sync and async evaluators
Arguments:
- name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
- run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the `data` contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
- description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
- data: Array of data items to process. Can be either:
- List of dict-like items with 'input', 'expected_output', 'metadata' keys
- List of Langfuse DatasetItem objects from dataset.items
- task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
- evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
- composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
- run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
- max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
- metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:
ExperimentResult containing:
- run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
- item_results: List of results for each processed item with outputs and evaluations
- run_evaluations: List of aggregate evaluation results for the entire run
- experiment_id: Stable identifier for the experiment run across all items
- dataset_run_id: ID of the dataset run (if using Langfuse datasets)
- dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
- ValueError: If required parameters are missing or invalid
- Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:
Basic experiment with local data:
```python
def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")
```

Advanced experiment with async task and multiple evaluators:
```python
async def llm_task(*, item, **kwargs):
    # Simulate async LLM call
    response = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": item["input"]}]
    )
    return response.choices[0].message.content

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output and expected_output.lower() in output.lower():
        return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
    return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
    # Simulate toxicity check
    toxicity_score = check_toxicity(output)  # Your toxicity checker
    return {
        "name": "toxicity",
        "value": toxicity_score,
        "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
    }

def average_accuracy(*, item_results, **kwargs):
    accuracies = [
        eval.value for result in item_results
        for eval in result.evaluations
        if eval.name == "accuracy"
    ]
    return {
        "name": "average_accuracy",
        "value": sum(accuracies) / len(accuracies) if accuracies else 0,
        "comment": f"Average accuracy across {len(accuracies)} items"
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    description="Evaluate model accuracy and safety across diverse prompts",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    run_evaluators=[average_accuracy],
    max_concurrency=5,  # Limit concurrent API calls
    metadata={"model": "gpt-4", "temperature": 0.7}
)
```

Using with Langfuse datasets:
```python
# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result.dataset_run_url}")
```
Note:
- Task and evaluator functions can be either synchronous or asynchronous
- Individual item failures are logged but don't stop the experiment
- All executions are automatically traced and visible in Langfuse UI
- When using Langfuse datasets, results are automatically linked for easy comparison
- This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
- Async execution is handled automatically with smart event loop detection
def run_batched_evaluation(
    self,
    *,
    scope: Literal["traces", "observations"],
    mapper: MapperFunction,
    filter: Optional[str] = None,
    fetch_batch_size: int = 50,
    fetch_trace_fields: Optional[str] = None,
    max_items: Optional[int] = None,
    max_retries: int = 3,
    evaluators: List[EvaluatorFunction],
    composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
    max_concurrency: int = 5,
    metadata: Optional[Dict[str, Any]] = None,
    _add_observation_scores_to_trace: bool = False,
    _additional_trace_tags: Optional[List[str]] = None,
    resume_from: Optional[BatchEvaluationResumeToken] = None,
    verbose: bool = False,
) -> BatchEvaluationResult:
    """Fetch traces or observations and run evaluations on each item.

    Evaluates data that already exists in Langfuse: items matching ``filter``
    are fetched in batches, converted into evaluator inputs by ``mapper``,
    scored by every function in ``evaluators`` and the resulting scores are
    linked back to the originating trace or observation. Typical uses:

    - Running evaluations on production traces after deployment
    - Backtesting new evaluation metrics on historical data
    - Batch scoring of observations for quality monitoring
    - Periodic evaluation runs on recent data

    Items are processed batch by batch in a streaming/pipeline fashion, which
    keeps memory usage bounded for large datasets. Error handling, fetch
    retries and resuming of interrupted runs are built in.

    This method is a synchronous entry point: it creates a
    ``BatchEvaluationRunner`` for this client and drives its ``run_async``
    coroutine through ``run_async_safely``.

    Args:
        scope: The type of items to evaluate. Must be one of:
            - "traces": Evaluate complete traces with all their observations
            - "observations": Evaluate individual observations (spans, generations, events)
        mapper: Function that turns an API response object (trace or
            observation) into an EvaluatorInputs instance carrying input,
            output, expected_output and metadata. Can be sync or async.
        filter: Optional JSON filter string for querying items (same format
            as the Langfuse API). Examples:
            - '{"tags": ["production"]}'
            - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
            Default: None (fetches all items).
        fetch_batch_size: Number of items to fetch per API call and hold in
            memory. Larger values may be faster but use more memory. Default: 50.
        fetch_trace_fields: Comma-separated list of fields to include when
            fetching traces. Available field groups: 'core' (always included),
            'io' (input, output, metadata), 'scores', 'observations', 'metrics'.
            If not specified, all fields are returned. Example:
            'core,scores,metrics'. Note: Excluded 'observations' or 'scores'
            fields return empty arrays; excluded 'metrics' returns -1 for
            'totalCost' and 'latency'. Only relevant if scope is 'traces'.
        max_items: Maximum total number of items to process. If None, all
            items matching the filter are processed. Useful for testing or
            limiting evaluation runs. Default: None (process all).
        max_retries: Maximum number of retry attempts for failed batch
            fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
        evaluators: List of evaluation functions to run on each item. Each
            evaluator receives the mapped inputs and returns Evaluation
            object(s). Evaluator failures are logged but don't stop the batch
            evaluation.
        composite_evaluator: Optional function that creates a composite score
            from item-level evaluations. Receives the original item and its
            evaluations, returns a single Evaluation. Useful for weighted
            averages or combined metrics. Default: None.
        max_concurrency: Maximum number of items to evaluate concurrently.
            Controls parallelism and resource usage. Default: 5.
        metadata: Optional metadata dict to add to all created scores. Useful
            for tracking evaluation runs, versions, or other context.
            Default: None.
        _add_observation_scores_to_trace: Internal option, forwarded
            unchanged to the runner. Default: False.
        _additional_trace_tags: Internal option, forwarded unchanged to the
            runner. Default: None.
        resume_from: Optional resume token from a previous incomplete run.
            Allows continuing evaluation after interruption or failure.
            Default: None.
        verbose: If True, logs progress information to console. Useful for
            monitoring long-running evaluations. Default: False.

    Returns:
        BatchEvaluationResult containing:
            - total_items_fetched: Number of items fetched from API
            - total_items_processed: Number of items successfully evaluated
            - total_items_failed: Number of items that failed evaluation
            - total_scores_created: Scores created by item-level evaluators
            - total_composite_scores_created: Scores created by composite evaluator
            - total_evaluations_failed: Individual evaluator failures
            - evaluator_stats: Per-evaluator statistics (success rate, scores created)
            - resume_token: Token for resuming if incomplete (None if completed)
            - completed: True if all items processed
            - duration_seconds: Total execution time
            - failed_item_ids: IDs of items that failed
            - error_summary: Error types and counts
            - has_more_items: True if max_items reached but more exist

    Raises:
        ValueError: If invalid scope is provided.

    Examples:
        Basic trace evaluation:
        ```python
        from langfuse import Langfuse, EvaluatorInputs, Evaluation

        client = Langfuse()

        # Define mapper to extract fields from traces
        def trace_mapper(trace):
            return EvaluatorInputs(
                input=trace.input,
                output=trace.output,
                expected_output=None,
                metadata={"trace_id": trace.id}
            )

        # Define evaluator
        def length_evaluator(*, input, output, expected_output, metadata):
            return Evaluation(
                name="output_length",
                value=len(output) if output else 0
            )

        # Run batch evaluation
        result = client.run_batched_evaluation(
            scope="traces",
            mapper=trace_mapper,
            evaluators=[length_evaluator],
            filter='{"tags": ["production"]}',
            max_items=1000,
            verbose=True
        )

        print(f"Processed {result.total_items_processed} traces")
        print(f"Created {result.total_scores_created} scores")
        ```

        Evaluation with composite scorer:
        ```python
        def accuracy_evaluator(*, input, output, expected_output, metadata):
            # ... evaluation logic
            return Evaluation(name="accuracy", value=0.85)

        def relevance_evaluator(*, input, output, expected_output, metadata):
            # ... evaluation logic
            return Evaluation(name="relevance", value=0.92)

        def composite_evaluator(*, item, evaluations):
            # Weighted average of evaluations
            weights = {"accuracy": 0.6, "relevance": 0.4}
            total = sum(
                e.value * weights.get(e.name, 0)
                for e in evaluations
                if isinstance(e.value, (int, float))
            )
            return Evaluation(
                name="composite_score",
                value=total,
                comment=f"Weighted average of {len(evaluations)} metrics"
            )

        result = client.run_batched_evaluation(
            scope="traces",
            mapper=trace_mapper,
            evaluators=[accuracy_evaluator, relevance_evaluator],
            composite_evaluator=composite_evaluator,
            filter='{"user_id": "important_user"}',
            verbose=True
        )
        ```

        Handling incomplete runs with resume:
        ```python
        # Initial run that may fail or timeout
        result = client.run_batched_evaluation(
            scope="observations",
            mapper=obs_mapper,
            evaluators=[my_evaluator],
            max_items=10000,
            verbose=True
        )

        # Check if incomplete
        if not result.completed and result.resume_token:
            print(f"Processed {result.resume_token.items_processed} items before interruption")

            # Resume from where it left off
            result = client.run_batched_evaluation(
                scope="observations",
                mapper=obs_mapper,
                evaluators=[my_evaluator],
                resume_from=result.resume_token,
                verbose=True
            )

        print(f"Total items processed: {result.total_items_processed}")
        ```

        Monitoring evaluator performance:
        ```python
        result = client.run_batched_evaluation(...)

        for stats in result.evaluator_stats:
            success_rate = stats.successful_runs / stats.total_runs
            print(f"{stats.name}:")
            print(f"  Success rate: {success_rate:.1%}")
            print(f"  Scores created: {stats.total_scores_created}")

            if stats.failed_runs > 0:
                print(f"  ⚠️  Failed {stats.failed_runs} times")
        ```

    Note:
        - Evaluator failures are logged but don't stop the batch evaluation
        - Individual item failures are tracked but don't stop processing
        - Fetch failures are retried with exponential backoff
        - All scores are automatically flushed to Langfuse at the end
        - The resume mechanism uses timestamp-based filtering to avoid duplicates
    """
    # Build the coroutine first, then hand it to the sync/async bridge.
    evaluation_coroutine = BatchEvaluationRunner(self).run_async(
        scope=scope,
        mapper=mapper,
        evaluators=evaluators,
        filter=filter,
        fetch_batch_size=fetch_batch_size,
        fetch_trace_fields=fetch_trace_fields,
        max_items=max_items,
        max_concurrency=max_concurrency,
        composite_evaluator=composite_evaluator,
        metadata=metadata,
        _add_observation_scores_to_trace=_add_observation_scores_to_trace,
        _additional_trace_tags=_additional_trace_tags,
        max_retries=max_retries,
        verbose=verbose,
        resume_from=resume_from,
    )
    outcome = run_async_safely(evaluation_coroutine)

    # cast() is a no-op at runtime; it only narrows the type for checkers.
    return cast(BatchEvaluationResult, outcome)
Fetch traces or observations and run evaluations on each item.
This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:
- Running evaluations on production traces after deployment
- Backtesting new evaluation metrics on historical data
- Batch scoring of observations for quality monitoring
- Periodic evaluation runs on recent data
The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.
Arguments:
- scope: The type of items to evaluate. Must be one of:
- "traces": Evaluate complete traces with all their observations
- "observations": Evaluate individual observations (spans, generations, events)
- mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
- evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
- filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
- '{"tags": ["production"]}'
- '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' Default: None (fetches all items).
- fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
- fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
- max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
- max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 5.
- composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
- metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
- max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
- verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
- resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:
BatchEvaluationResult containing:
- total_items_fetched: Number of items fetched from API
- total_items_processed: Number of items successfully evaluated
- total_items_failed: Number of items that failed evaluation
- total_scores_created: Scores created by item-level evaluators
- total_composite_scores_created: Scores created by composite evaluator
- total_evaluations_failed: Individual evaluator failures
- evaluator_stats: Per-evaluator statistics (success rate, scores created)
- resume_token: Token for resuming if incomplete (None if completed)
- completed: True if all items processed
- duration_seconds: Total execution time
- failed_item_ids: IDs of items that failed
- error_summary: Error types and counts
- has_more_items: True if max_items reached but more exist
Raises:
- ValueError: If invalid scope is provided.
Examples:
Basic trace evaluation:
```python
from langfuse import Langfuse, EvaluatorInputs, Evaluation

client = Langfuse()

# Define mapper to extract fields from traces
def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,
        metadata={"trace_id": trace.id}
    )

# Define evaluator
def length_evaluator(*, input, output, expected_output, metadata):
    return Evaluation(
        name="output_length",
        value=len(output) if output else 0
    )

# Run batch evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    filter='{"tags": ["production"]}',
    max_items=1000,
    verbose=True
)

print(f"Processed {result.total_items_processed} traces")
print(f"Created {result.total_scores_created} scores")
```

Evaluation with composite scorer:
```python
def accuracy_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="accuracy", value=0.85)

def relevance_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="relevance", value=0.92)

def composite_evaluator(*, item, evaluations):
    # Weighted average of evaluations
    weights = {"accuracy": 0.6, "relevance": 0.4}
    total = sum(
        e.value * weights.get(e.name, 0)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(
        name="composite_score",
        value=total,
        comment=f"Weighted average of {len(evaluations)} metrics"
    )

result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[accuracy_evaluator, relevance_evaluator],
    composite_evaluator=composite_evaluator,
    filter='{"user_id": "important_user"}',
    verbose=True
)
```

Handling incomplete runs with resume:
```python
# Initial run that may fail or timeout
result = client.run_batched_evaluation(
    scope="observations",
    mapper=obs_mapper,
    evaluators=[my_evaluator],
    max_items=10000,
    verbose=True
)

# Check if incomplete
if not result.completed and result.resume_token:
    print(f"Processed {result.resume_token.items_processed} items before interruption")

    # Resume from where it left off
    result = client.run_batched_evaluation(
        scope="observations",
        mapper=obs_mapper,
        evaluators=[my_evaluator],
        resume_from=result.resume_token,
        verbose=True
    )

print(f"Total items processed: {result.total_items_processed}")
```

Monitoring evaluator performance:
```python
result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs
    print(f"{stats.name}:")
    print(f"  Success rate: {success_rate:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️  Failed {stats.failed_runs} times")
```
Note:
- Evaluator failures are logged but don't stop the batch evaluation
- Individual item failures are tracked but don't stop processing
- Fetch failures are retried with exponential backoff
- All scores are automatically flushed to Langfuse at the end
- The resume mechanism uses timestamp-based filtering to avoid duplicates
def auth_check(self) -> bool:
    """Check if the provided credentials (public and secret key) are valid.

    Requests the projects visible to the configured keys through
    ``self.api.projects.get()``. The credentials are considered valid when
    at least one project is returned.

    Returns:
        bool: True if at least one project was found. False if an
            AttributeError occurred during the check, which is treated as
            the client not being properly initialized.

    Raises:
        Exception: If no projects were found for the provided credentials.
        Error: If the API request fails. The error is passed to
            ``handle_fern_exception`` and then re-raised.

    Note:
        This method is blocking. It is discouraged to use it in production code.
    """
    try:
        projects = self.api.projects.get()
        project_count = len(projects.data)

        # Validate before logging: previously a "successful" debug message
        # was emitted even when zero projects were found and the check
        # was about to fail.
        if project_count == 0:
            raise Exception(
                "Auth check failed, no project found for the keys provided."
            )

        langfuse_logger.debug(
            f"Auth check successful, found {project_count} projects"
        )
        return True

    except AttributeError as e:
        langfuse_logger.warning(
            f"Auth check failed: Client not properly initialized. Error: {e}"
        )
        return False

    except Error as e:
        handle_fern_exception(e)
        raise e
Check if the provided credentials (public and secret key) are valid.
Raises:
- Exception: If no projects were found for the provided credentials.
Note:
This method is blocking. It is discouraged to use it in production code.
def create_dataset(
    self,
    *,
    name: str,
    description: Optional[str] = None,
    metadata: Optional[Any] = None,
    input_schema: Optional[Any] = None,
    expected_output_schema: Optional[Any] = None,
) -> Dataset:
    """Create a new dataset on Langfuse under the given name.

    Args:
        name: Name of the dataset to create.
        description: Description of the dataset. Defaults to None.
        metadata: Additional metadata. Defaults to None.
        input_schema: JSON Schema for validating dataset item inputs. When
            set, all new items will be validated against this schema.
        expected_output_schema: JSON Schema for validating dataset item
            expected outputs. When set, all new items will be validated
            against this schema.

    Returns:
        Dataset: The created dataset as returned by the Langfuse API.

    Raises:
        Error: If the API call fails. The error is passed to
            ``handle_fern_exception`` and then re-raised.
    """
    try:
        langfuse_logger.debug(f"Creating datasets {name}")

        # cast() only informs the type checker; the API result is returned as-is.
        return cast(
            Dataset,
            self.api.datasets.create(
                name=name,
                description=description,
                metadata=metadata,
                input_schema=input_schema,
                expected_output_schema=expected_output_schema,
            ),
        )
    except Error as api_error:
        handle_fern_exception(api_error)
        raise api_error
Create a dataset with the given name on Langfuse.
Arguments:
- name: Name of the dataset to create.
- description: Description of the dataset. Defaults to None.
- metadata: Additional metadata. Defaults to None.
- input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
- expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:
Dataset: The created dataset as returned by the Langfuse API.
def create_dataset_item(
    self,
    *,
    dataset_name: str,
    input: Optional[Any] = None,
    expected_output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    source_trace_id: Optional[str] = None,
    source_observation_id: Optional[str] = None,
    status: Optional[DatasetStatus] = None,
    id: Optional[str] = None,
) -> DatasetItem:
    """Create a dataset item.

    Upserts if an item with id already exists.

    Any LangfuseMedia found in ``input``, ``expected_output`` or ``metadata``
    is replaced by its reference string and uploaded before the item itself
    is created.

    Args:
        dataset_name: Name of the dataset in which the dataset item should be created.
        input: Input data. Defaults to None. Can contain any dict, list or scalar.
        expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
        metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
        source_trace_id: Id of the source trace. Defaults to None.
        source_observation_id: Id of the source observation. Defaults to None.
        status: Status of the dataset item. Defaults to ACTIVE for newly created items.
        id: Id of the dataset item. Defaults to None, in which case a random
            UUID4 string is generated. Provide your own id if you want to
            dedupe dataset items. Id needs to be globally unique and cannot
            be reused across datasets.

    Returns:
        DatasetItem: The created dataset item as returned by the Langfuse API.

    Raises:
        Error: If an API call fails. The error is passed to
            ``handle_fern_exception`` and then re-raised.

    Example:
        ```python
        from langfuse import Langfuse

        langfuse = Langfuse()

        # Uploading items to the Langfuse dataset named "capital_cities"
        langfuse.create_dataset_item(
            dataset_name="capital_cities",
            input={"input": {"country": "Italy"}},
            expected_output={"expected_output": "Rome"},
            metadata={"foo": "bar"}
        )
        ```
    """
    try:
        langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")

        # Fix the item id before anything else: media uploads have to name
        # the (dataset, item) pair they belong to even though the item does
        # not exist yet, and the same id is reused for the create call.
        item_id = str(uuid.uuid4()) if id is None else id

        # Media still awaiting upload, deduped by media id; each entry
        # remembers which item field the media was found in.
        pending_media: Dict[str, Tuple[LangfuseMedia, str]] = {}

        def swap_media(payload: Any, reference_field: str) -> Any:
            # One pass over a field: every LangfuseMedia is exchanged for its
            # reference string (derived from content, not from the upload)
            # and registered in pending_media.
            return self._process_dataset_item_media(
                data=payload,
                pending_media=pending_media,
                field=reference_field,
            )

        input = swap_media(input, DatasetItemMediaReferenceField.INPUT.value)
        expected_output = swap_media(
            expected_output, DatasetItemMediaReferenceField.EXPECTED_OUTPUT.value
        )
        metadata = swap_media(metadata, DatasetItemMediaReferenceField.METADATA.value)

        # Uploading requires the dataset id while the create API only takes
        # the name. Look it up once, and only if there is media to upload,
        # so plain items cost no additional datasets.get round-trip.
        if pending_media:
            assert self._resources is not None
            dataset_id = self.api.datasets.get(self._url_encode(dataset_name)).id
            upload_media = self._resources._media_manager._upload_media_sync
            for media, media_field in pending_media.values():
                upload_media(
                    media=media,
                    dataset_id=dataset_id,
                    dataset_item_id=item_id,
                    field=media_field,
                )

        created_item = self.api.dataset_items.create(
            dataset_name=dataset_name,
            input=input,
            expected_output=expected_output,
            metadata=metadata,
            source_trace_id=source_trace_id,
            source_observation_id=source_observation_id,
            status=status,
            id=item_id,
        )
        return cast(DatasetItem, created_item)
    except Error as api_error:
        handle_fern_exception(api_error)
        raise api_error
Create a dataset item.
Upserts if an item with id already exists.
Arguments:
- dataset_name: Name of the dataset in which the dataset item should be created.
- input: Input data. Defaults to None. Can contain any dict, list or scalar.
- expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
- metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
- source_trace_id: Id of the source trace. Defaults to None.
- source_observation_id: Id of the source observation. Defaults to None.
- status: Status of the dataset item. Defaults to ACTIVE for newly created items.
- id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:
DatasetItem: The created dataset item as returned by the Langfuse API.
Example:
```python
from langfuse import Langfuse

langfuse = Langfuse()

# Uploading items to the Langfuse dataset named "capital_cities"
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"input": {"country": "Italy"}},
    expected_output={"expected_output": "Rome"},
    metadata={"foo": "bar"}
)
```
def resolve_media_references(
    self,
    *,
    obj: Any,
    resolve_with: Literal["base64_data_uri"],
    max_depth: int = 10,
    content_fetch_timeout_seconds: int = 5,
) -> Any:
    """Replace media reference strings in an object with base64 data URIs.

    This method recursively traverses an object (up to max_depth) looking for media reference strings
    in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
    this Langfuse client and replaces the reference string with a base64 data URI.

    If fetching media content fails for a reference string, a warning is logged and the reference
    string is left unchanged.

    The work is delegated to ``LangfuseMedia.resolve_media_references`` with this
    client passed as ``langfuse_client``.

    Args:
        obj: The object to process. Can be a primitive value, array, or nested object.
            If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
        resolve_with: The representation of the media content to replace the media reference string with.
            Currently only "base64_data_uri" is supported.
        max_depth: The maximum depth to traverse the object. Default is 10.
        content_fetch_timeout_seconds: The timeout in seconds for fetching media content. Default is 5.

    Returns:
        A deep copy of the input object with all media references replaced with base64 data URIs where possible.
        If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

    Example:
        ```python
        obj = {
            "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
            "nested": {
                "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
            }
        }

        # Synchronous call; all arguments are keyword-only.
        result = langfuse.resolve_media_references(
            obj=obj,
            resolve_with="base64_data_uri",
        )

        # Result:
        # {
        #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
        #     "nested": {
        #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
        #     }
        # }
        ```
    """
    return LangfuseMedia.resolve_media_references(
        langfuse_client=self,
        obj=obj,
        resolve_with=resolve_with,
        max_depth=max_depth,
        content_fetch_timeout_seconds=content_fetch_timeout_seconds,
    )
Replace media reference strings in an object with base64 data URIs.
This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.
If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.
Arguments:
- obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
- resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
- max_depth: int: The maximum depth to traverse the object. Default is 10.
- content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:
A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
Example:
```python
obj = {
    "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
    "nested": {
        "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
    }
}

result = langfuse.resolve_media_references(obj=obj, resolve_with="base64_data_uri")

# Result:
# {
#     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
#     "nested": {
#         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
#     }
# }
```
def get_prompt(
    self,
    name: str,
    *,
    version: Optional[int] = None,
    label: Optional[str] = None,
    type: Literal["chat", "text"] = "text",
    cache_ttl_seconds: Optional[int] = None,
    fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
    max_retries: Optional[int] = None,
    fetch_timeout_seconds: Optional[int] = None,
) -> PromptClient:
    """Get a prompt, serving it from the local prompt cache when possible.

    Resolution order:

    1. If the prompt is not cached, or ``cache_ttl_seconds`` is 0, the prompt is
       fetched synchronously and the cache is updated. If that fetch fails and a
       truthy ``fallback`` was supplied, a fallback prompt client is returned
       instead; otherwise the fetch error is re-raised.
    2. If the cached prompt is expired, a refresh is scheduled through the prompt
       cache and the stale prompt is returned immediately. Failing to schedule
       the refresh only logs a warning; the stale prompt is still returned.
    3. Otherwise the cached prompt is returned as is.

    Args:
        name (str): The name of the prompt to retrieve. Must not be empty.

    Keyword Args:
        version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
        label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
        cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
        type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text". Also selects the
            client class used to wrap ``fallback``.
        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
        max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
        fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.

    Returns:
        The prompt object retrieved from the cache or directly fetched if not cached, of type
        - TextPromptClient, if type argument is 'text'.
        - ChatPromptClient, if type argument is 'chat'.

    Raises:
        Error: If the SDK resources were not initialized.
        ValueError: If both ``version`` and ``label`` are given, or if ``name`` is empty.
        Exception: Any exception raised while fetching an uncached prompt when no usable
            fallback is available.
    """
    if self._resources is None:
        raise Error(
            "SDK is not correctly initialized. Check the init logs for more details."
        )
    if version is not None and label is not None:
        raise ValueError("Cannot specify both version and label at the same time.")

    if not name:
        raise ValueError("Prompt name cannot be empty.")

    cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
    bounded_max_retries = self._get_bounded_max_retries(
        max_retries, default_max_retries=2, max_retries_upper_bound=4
    )

    langfuse_logger.debug(f"Getting prompt '{cache_key}'")
    cached_prompt = self._resources.prompt_cache.get(cache_key)

    if cached_prompt is None or cache_ttl_seconds == 0:
        langfuse_logger.debug(
            f"Prompt '{cache_key}' not found in cache or caching disabled."
        )
        try:
            return self._fetch_prompt_and_update_cache(
                name,
                version=version,
                label=label,
                ttl_seconds=cache_ttl_seconds,
                max_retries=bounded_max_retries,
                fetch_timeout_seconds=fetch_timeout_seconds,
            )
        except Exception as e:
            if fallback:
                langfuse_logger.warning(
                    f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
                )

                # Synthesize the fields a server prompt would carry; version 0
                # is used when the caller did not pin a version.
                fallback_client_args: Dict[str, Any] = {
                    "name": name,
                    "prompt": fallback,
                    "type": type,
                    "version": version or 0,
                    "config": {},
                    "labels": [label] if label else [],
                    "tags": [],
                }

                if type == "text":
                    return TextPromptClient(
                        prompt=Prompt_Text(**fallback_client_args),
                        is_fallback=True,
                    )

                if type == "chat":
                    return ChatPromptClient(
                        prompt=Prompt_Chat(**fallback_client_args),
                        is_fallback=True,
                    )

            # No fallback (or an unsupported type): surface the fetch error.
            raise

    if cached_prompt.is_expired():
        langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
        try:
            # refresh prompt in background thread, refresh_prompt deduplicates tasks
            langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")

            def refresh_task() -> None:
                self._fetch_prompt_and_update_cache(
                    name,
                    version=version,
                    label=label,
                    ttl_seconds=cache_ttl_seconds,
                    max_retries=bounded_max_retries,
                    fetch_timeout_seconds=fetch_timeout_seconds,
                )

            self._resources.prompt_cache.add_refresh_prompt_task_if_current(
                cache_key,
                cached_prompt,
                refresh_task,
            )
            langfuse_logger.debug(
                f"Returning stale prompt '{cache_key}' from cache."
            )
            # return stale prompt
            return cached_prompt.value

        except Exception as e:
            langfuse_logger.warning(
                f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
            )
            # creation of refresh prompt task failed, return stale prompt
            return cached_prompt.value

    return cached_prompt.value
Get a prompt.
This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.
Arguments:
- name (str): The name of the prompt to retrieve.
Keyword Args:
- version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
- label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
- cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
- type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
- fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
- max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
- fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
Returns:
The prompt object retrieved from the cache or directly fetched if not cached or expired of type
- TextPromptClient, if type argument is 'text'.
- ChatPromptClient, if type argument is 'chat'.
Raises:
- Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
def create_prompt(
    self,
    *,
    name: str,
    prompt: Union[
        str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
    ],
    labels: List[str] = [],
    tags: Optional[List[str]] = None,
    type: Optional[Literal["chat", "text"]] = "text",
    config: Optional[Any] = None,
    commit_message: Optional[str] = None,
) -> PromptClient:
    """Create a new prompt in Langfuse.

    After the prompt is created, the SDK prompt cache is invalidated for
    ``name`` (when SDK resources are available).

    Keyword Args:
        name : The name of the prompt to be created.
        prompt : The content of the prompt to be created. Must be a list of chat messages
            when ``type`` is "chat", and a string otherwise.
        labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
        tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
        type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
            Any value other than "chat" is handled as a text prompt.
        config: Additional structured data to be saved with the prompt. Defaults to None,
            in which case an empty dict is sent.
        commit_message: Optional string describing the change.

    Returns:
        TextPromptClient: The prompt if type argument is 'text'.
        ChatPromptClient: The prompt if type argument is 'chat'.

    Raises:
        ValueError: If ``prompt`` does not match the shape required by ``type``.
        Error: Re-raised after being passed to ``handle_fern_exception``.
    """
    try:
        langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")

        is_chat = type == "chat"
        request: Union[CreateChatPromptRequest, CreateTextPromptRequest]

        if is_chat:
            if not isinstance(prompt, list):
                raise ValueError(
                    "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
                )
            request = CreateChatPromptRequest(
                name=name,
                prompt=cast(Any, prompt),
                labels=labels,
                tags=tags,
                config=config or {},
                commit_message=commit_message,
                type=CreateChatPromptType.CHAT,
            )
        else:
            if not isinstance(prompt, str):
                raise ValueError("For 'text' type, 'prompt' must be a string.")
            request = CreateTextPromptRequest(
                name=name,
                prompt=prompt,
                labels=labels,
                tags=tags,
                config=config or {},
                commit_message=commit_message,
            )

        server_prompt = self.api.prompts.create(request=request)

        # Drop cached entries for this name so later reads do not serve an
        # outdated prompt.
        if self._resources is not None:
            self._resources.prompt_cache.invalidate(name)

        if is_chat:
            return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))

        return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))

    except Error as e:
        handle_fern_exception(e)
        raise e
Create a new prompt in Langfuse.
Keyword Args:
- name : The name of the prompt to be created.
- prompt : The content of the prompt to be created.
- is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
- labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
- tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
- config: Additional structured data to be saved with the prompt. Defaults to None.
- type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
- commit_message: Optional string describing the change.
Returns:
TextPromptClient: The prompt if type argument is 'text'. ChatPromptClient: The prompt if type argument is 'chat'.
def update_prompt(
    self,
    *,
    name: str,
    version: int,
    new_labels: List[str] = [],
) -> Any:
    """Update an existing prompt version in Langfuse.

    Once the API call has returned, the Langfuse SDK prompt cache is
    invalidated for the specified prompt name (when SDK resources are
    available).

    Args:
        name (str): The name of the prompt to update. It is URL-encoded before being sent to the API.
        version (int): The version number of the prompt to update.
        new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].

    Returns:
        Prompt: The updated prompt from the Langfuse API.

    """
    encoded_name = self._url_encode(name)
    api_response = self.api.prompt_version.update(
        name=encoded_name,
        version=version,
        new_labels=new_labels,
    )

    resources = self._resources
    if resources is not None:
        # The cache is keyed by the raw (non-encoded) prompt name.
        resources.prompt_cache.invalidate(name)

    return api_response
Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
Arguments:
- name (str): The name of the prompt to update.
- version (int): The version number of the prompt to update.
- new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:
Prompt: The updated prompt from the Langfuse API.
def clear_prompt_cache(self) -> None:
    """Remove every cached prompt from the SDK prompt cache.

    Useful to force a complete refresh of all cached prompts, for example
    after major updates or when the latest versions must be fetched from the
    server. Does nothing when the SDK resources are not available.
    """
    resources = self._resources
    if resources is None:
        return

    resources.prompt_cache.clear()
Clear the entire prompt cache, removing all cached prompts.
This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.
class LangfuseMedia:
    """A class for wrapping media objects for upload to Langfuse.

    This class handles the preparation and formatting of media content for Langfuse,
    supporting base64 data URIs, raw content bytes, and files on disk.

    Invalid or missing input does not raise: an error is logged and the
    instance is left without content, content type, source, or media id.

    Args:
        obj (Optional[object]): The source object to be wrapped. Can be accessed via the `obj` attribute.
        base64_data_uri (Optional[str]): A base64-encoded data URI containing the media content
            and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
        content_type (Optional[str]): The MIME type of the media content when providing raw bytes
            or reading from a file.
        content_bytes (Optional[bytes]): Raw bytes of the media content.
        file_path (Optional[str]): The path to the file containing the media content. For relative paths,
            the current working directory is used.
    """

    obj: object

    _content_bytes: Optional[bytes]
    _content_type: Optional[MediaContentType]
    _source: Optional[str]
    _media_id: Optional[str]

    def __init__(
        self,
        *,
        obj: Optional[object] = None,
        base64_data_uri: Optional[str] = None,
        content_type: Optional[MediaContentType] = None,
        content_bytes: Optional[bytes] = None,
        file_path: Optional[str] = None,
    ):
        """Initialize a LangfuseMedia object.

        The first matching input wins, in this order: ``base64_data_uri``, then
        ``content_bytes`` with ``content_type``, then an existing ``file_path``
        with ``content_type``. If none matches, an error is logged and the
        instance holds no content.

        Args:
            obj: The object to wrap.

            base64_data_uri: A base64-encoded data URI containing the media content
                and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
            content_type: The MIME type of the media content when providing raw bytes or reading from a file.
            content_bytes: Raw bytes of the media content.
            file_path: The path to the file containing the media content. For relative paths,
                the current working directory is used.
        """
        self.obj = obj

        if base64_data_uri is not None:
            parsed_data = self._parse_base64_data_uri(base64_data_uri)
            self._content_bytes, self._content_type = parsed_data
            self._source = "base64_data_uri"

        elif content_bytes is not None and content_type is not None:
            self._content_type = content_type
            self._content_bytes = content_bytes
            self._source = "bytes"
        elif (
            file_path is not None
            and content_type is not None
            and os.path.exists(file_path)
        ):
            self._content_bytes = self._read_file(file_path)
            # An unreadable or empty file leaves type and source unset.
            self._content_type = content_type if self._content_bytes else None
            self._source = "file" if self._content_bytes else None
        else:
            logger.error(
                "base64_data_uri, or content_bytes and content_type, or file_path must be provided to LangfuseMedia"
            )

            self._content_bytes = None
            self._content_type = None
            self._source = None

        self._media_id = self._get_media_id()

    def _read_file(self, file_path: str) -> Optional[bytes]:
        """Return the bytes of ``file_path``, or None (after logging) on any error."""
        try:
            with open(file_path, "rb") as file:
                return file.read()
        except Exception as e:
            logger.error(f"Error reading file at path {file_path}", exc_info=e)

            return None

    def _get_media_id(self) -> Optional[str]:
        """Derive the media id from the content hash, or None without content."""
        content_hash = self._content_sha256_hash

        if content_hash is None:
            return None

        # Convert hash to base64Url
        url_safe_content_hash = content_hash.replace("+", "-").replace("/", "_")

        # The id is the first 22 characters of the URL-safe hash.
        return url_safe_content_hash[:22]

    @property
    def _content_length(self) -> Optional[int]:
        """Length of the content in bytes; None when content is missing or empty."""
        return len(self._content_bytes) if self._content_bytes else None

    @property
    def _content_sha256_hash(self) -> Optional[str]:
        """Base64-encoded SHA-256 digest of the content, or None without content."""
        if self._content_bytes is None:
            return None

        sha256_hash_bytes = hashlib.sha256(self._content_bytes).digest()

        return base64.b64encode(sha256_hash_bytes).decode("utf-8")

    @property
    def _reference_string(self) -> Optional[str]:
        """Media reference string, or None if type, source, or id is missing."""
        if self._content_type is None or self._source is None or self._media_id is None:
            return None

        return f"@@@langfuseMedia:type={self._content_type}|id={self._media_id}|source={self._source}@@@"

    @staticmethod
    def parse_reference_string(reference_string: str) -> ParsedMediaReference:
        """Parse a media reference string into a ParsedMediaReference.

        Example reference string:
            "@@@langfuseMedia:type=image/jpeg|id=some-uuid|source=base64_data_uri@@@"

        Args:
            reference_string: The reference string to parse.

        Returns:
            A TypedDict with the media_id, source, and content_type.

        Raises:
            ValueError: If the reference string is empty or not a string.
            ValueError: If the reference string does not start with "@@@langfuseMedia:type=".
            ValueError: If the reference string does not end with "@@@".
            ValueError: If a "|"-separated segment has no "=".
            ValueError: If the reference string is missing required fields.
        """
        prefix = "@@@langfuseMedia:"
        suffix = "@@@"

        if not reference_string:
            raise ValueError("Reference string is empty")

        if not isinstance(reference_string, str):
            raise ValueError("Reference string is not a string")

        if not reference_string.startswith("@@@langfuseMedia:type="):
            raise ValueError(
                "Reference string does not start with '@@@langfuseMedia:type='"
            )

        if not reference_string.endswith(suffix):
            raise ValueError("Reference string does not end with '@@@'")

        # Slice off exactly one suffix. str.rstrip("@@@") would treat its
        # argument as a character set and also eat '@' characters that belong
        # to the last value.
        content = reference_string[len(prefix) : -len(suffix)]

        # Split into key-value pairs
        parsed_data = {}

        for pair in content.split("|"):
            key, separator, value = pair.partition("=")
            if not separator:
                raise ValueError("Malformed key-value pair in reference string")
            parsed_data[key] = value

        # Verify all required fields are present
        if not all(key in parsed_data for key in ["type", "id", "source"]):
            raise ValueError("Missing required fields in reference string")

        return ParsedMediaReference(
            media_id=parsed_data["id"],
            source=parsed_data["source"],
            content_type=cast(MediaContentType, parsed_data["type"]),
        )

    def _parse_base64_data_uri(
        self, data: str
    ) -> Tuple[Optional[bytes], Optional[MediaContentType]]:
        """Decode a base64 data URI into ``(content_bytes, content_type)``.

        Any parsing or decoding problem is logged and reported as
        ``(None, None)`` instead of being raised.
        """
        # Example data URI: data:image/jpeg;base64,/9j/4AAQ...
        try:
            if not data or not isinstance(data, str):
                raise ValueError("Data URI is not a string")

            if not data.startswith("data:"):
                raise ValueError("Data URI does not start with 'data:'")

            header, actual_data = data[5:].split(",", 1)
            if not header or not actual_data:
                raise ValueError("Invalid URI")

            # Split header into parts and check for base64
            header_parts = header.split(";")
            if "base64" not in header_parts:
                raise ValueError("Data is not base64 encoded")

            # Content type is the first part
            content_type = header_parts[0]
            if not content_type:
                raise ValueError("Content type is empty")

            return base64.b64decode(actual_data), cast(MediaContentType, content_type)

        except Exception as e:
            logger.error("Error parsing base64 data URI", exc_info=e)

            return None, None

    @staticmethod
    def resolve_media_references(
        *,
        obj: T,
        langfuse_client: "Langfuse",
        resolve_with: Literal["base64_data_uri"],
        max_depth: int = 10,
        content_fetch_timeout_seconds: int = 10,
    ) -> T:
        """Replace media reference strings in an object with base64 data URIs.

        This method recursively traverses an object (up to max_depth) looking for media reference strings
        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
        the provided Langfuse client and replaces the reference string with a base64 data URI.

        If fetching media content fails for a reference string, a warning is logged and the reference
        string is left unchanged.

        Args:
            obj: The object to process. Can be a primitive value, array, or nested object.
                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
            langfuse_client: Langfuse client instance used to fetch media content.
            resolve_with: The representation of the media content to replace the media reference string with.
                Currently only "base64_data_uri" is supported.
            max_depth: Optional. Default is 10. The maximum depth to traverse the object.
                Values nested deeper are returned unchanged.
            content_fetch_timeout_seconds: Optional. Default is 10. Timeout passed to the HTTP
                request that downloads each media file.

        Returns:
            The input with lists and dicts rebuilt and all media references replaced with base64 data URIs where possible.
            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

        Example:
            obj = {
                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
                "nested": {
                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
                }
            }

            result = LangfuseMedia.resolve_media_references(
                obj=obj,
                langfuse_client=langfuse_client,
                resolve_with="base64_data_uri",
            )

            # Result:
            # {
            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
            #     "nested": {
            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
            #     }
            # }
        """

        def traverse(obj: Any, depth: int) -> Any:
            if depth > max_depth:
                return obj

            # Handle string
            if isinstance(obj, str):
                regex = r"@@@langfuseMedia:.+?@@@"
                # dict.fromkeys de-duplicates while keeping first-seen order,
                # so a reference repeated in one string is fetched only once.
                reference_string_matches = list(dict.fromkeys(re.findall(regex, obj)))
                if not reference_string_matches:
                    return obj

                result = obj
                reference_string_to_media_content = {}
                # Prefer the client's own HTTP client when it is available.
                httpx_client = (
                    langfuse_client._resources.httpx_client
                    if langfuse_client._resources is not None
                    else None
                )

                for reference_string in reference_string_matches:
                    try:
                        parsed_media_reference = LangfuseMedia.parse_reference_string(
                            reference_string
                        )
                        media_data = langfuse_client.api.media.get(
                            parsed_media_reference["media_id"]
                        )
                        media_content = (
                            httpx_client.get(
                                media_data.url,
                                timeout=content_fetch_timeout_seconds,
                            )
                            if httpx_client is not None
                            else httpx.get(
                                media_data.url, timeout=content_fetch_timeout_seconds
                            )
                        )
                        media_content.raise_for_status()

                        base64_media_content = base64.b64encode(
                            media_content.content
                        ).decode()
                        base64_data_uri = f"data:{media_data.content_type};base64,{base64_media_content}"

                        reference_string_to_media_content[reference_string] = (
                            base64_data_uri
                        )
                    except Exception as e:
                        logger.warning(
                            f"Error fetching media content for reference string {reference_string}: {e}"
                        )
                        # Do not replace the reference string if there's an error
                        continue

                for (
                    ref_str,
                    media_content_str,
                ) in reference_string_to_media_content.items():
                    result = result.replace(ref_str, media_content_str)

                return result

            # Handle arrays
            if isinstance(obj, list):
                return [traverse(item, depth + 1) for item in obj]

            # Handle dictionaries
            if isinstance(obj, dict):
                return {key: traverse(value, depth + 1) for key, value in obj.items()}

            # Handle objects:
            if hasattr(obj, "__dict__"):
                return {
                    key: traverse(value, depth + 1)
                    for key, value in obj.__dict__.items()
                }

            return obj

        return cast(T, traverse(obj, 0))
A class for wrapping media objects for upload to Langfuse.
This class handles the preparation and formatting of media content for Langfuse, supporting both base64 data URIs and raw content bytes.
Arguments:
- obj (Optional[object]): The source object to be wrapped. Can be accessed via the `obj` attribute.
- base64_data_uri (Optional[str]): A base64-encoded data URI containing the media content and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
- content_type (Optional[str]): The MIME type of the media content when providing raw bytes.
- content_bytes (Optional[bytes]): Raw bytes of the media content.
- file_path (Optional[str]): The path to the file containing the media content. For relative paths, the current working directory is used.
Raises:
- ValueError: If neither base64_data_uri nor the combination of content_bytes and content_type is provided.
def __init__(
    self,
    *,
    obj: Optional[object] = None,
    base64_data_uri: Optional[str] = None,
    content_type: Optional[MediaContentType] = None,
    content_bytes: Optional[bytes] = None,
    file_path: Optional[str] = None,
):
    """Initialize a LangfuseMedia object.

    Exactly one input route is used, checked in this order: a base64 data
    URI, raw bytes plus content type, or an existing file plus content type.
    When no route applies, an error is logged and the instance carries no
    content.

    Args:
        obj: The object to wrap.

        base64_data_uri: A base64-encoded data URI containing the media content
            and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
        content_type: The MIME type of the media content when providing raw bytes or reading from a file.
        content_bytes: Raw bytes of the media content.
        file_path: The path to the file containing the media content. For relative paths,
            the current working directory is used.
    """
    self.obj = obj

    # Resolve into locals first; they default to the "no content" state.
    payload: Optional[bytes] = None
    mime_type: Optional[MediaContentType] = None
    origin: Optional[str] = None

    has_type = content_type is not None

    if base64_data_uri is not None:
        payload, mime_type = self._parse_base64_data_uri(base64_data_uri)
        origin = "base64_data_uri"
    elif content_bytes is not None and has_type:
        payload, mime_type, origin = content_bytes, content_type, "bytes"
    elif file_path is not None and has_type and os.path.exists(file_path):
        payload = self._read_file(file_path)
        # Type and source are recorded only when the file yielded content.
        if payload:
            mime_type, origin = content_type, "file"
    else:
        logger.error(
            "base64_data_uri, or content_bytes and content_type, or file_path must be provided to LangfuseMedia"
        )

    self._content_bytes = payload
    self._content_type = mime_type
    self._source = origin

    self._media_id = self._get_media_id()
Initialize a LangfuseMedia object.
Arguments:
- obj: The object to wrap.
- base64_data_uri: A base64-encoded data URI containing the media content and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
- content_type: The MIME type of the media content when providing raw bytes or reading from a file.
- content_bytes: Raw bytes of the media content.
- file_path: The path to the file containing the media content. For relative paths, the current working directory is used.
@staticmethod
def parse_reference_string(reference_string: str) -> ParsedMediaReference:
    """Parse a media reference string into a ParsedMediaReference.

    Example reference string:
        "@@@langfuseMedia:type=image/jpeg|id=some-uuid|source=base64_data_uri@@@"

    Args:
        reference_string: The reference string to parse.

    Returns:
        A TypedDict with the media_id, source, and content_type.

    Raises:
        ValueError: If the reference string is empty or not a string.
        ValueError: If the reference string does not start with "@@@langfuseMedia:type=".
        ValueError: If the reference string does not end with "@@@".
        ValueError: If a "|"-separated segment has no "=".
        ValueError: If the reference string is missing required fields.
    """
    prefix = "@@@langfuseMedia:"
    suffix = "@@@"

    if not reference_string:
        raise ValueError("Reference string is empty")

    if not isinstance(reference_string, str):
        raise ValueError("Reference string is not a string")

    if not reference_string.startswith("@@@langfuseMedia:type="):
        raise ValueError(
            "Reference string does not start with '@@@langfuseMedia:type='"
        )

    if not reference_string.endswith(suffix):
        raise ValueError("Reference string does not end with '@@@'")

    # Slice off exactly one suffix. str.rstrip("@@@") would treat its argument
    # as a character set and also eat '@' characters that belong to the last
    # value.
    content = reference_string[len(prefix) : -len(suffix)]

    # Split into key-value pairs
    parsed_data = {}

    for pair in content.split("|"):
        key, separator, value = pair.partition("=")
        if not separator:
            raise ValueError("Malformed key-value pair in reference string")
        parsed_data[key] = value

    # Verify all required fields are present
    if not all(key in parsed_data for key in ["type", "id", "source"]):
        raise ValueError("Missing required fields in reference string")

    return ParsedMediaReference(
        media_id=parsed_data["id"],
        source=parsed_data["source"],
        content_type=cast(MediaContentType, parsed_data["type"]),
    )
Parse a media reference string into a ParsedMediaReference.
Example reference string:
"@@@langfuseMedia:type=image/jpeg|id=some-uuid|source=base64_data_uri@@@"
Arguments:
- reference_string: The reference string to parse.
Returns:
A TypedDict with the media_id, source, and content_type.
Raises:
- ValueError: If the reference string is empty or not a string.
- ValueError: If the reference string does not start with "@@@langfuseMedia:type=".
- ValueError: If the reference string does not end with "@@@".
- ValueError: If the reference string is missing required fields.
@staticmethod
def resolve_media_references(
    *,
    obj: T,
    langfuse_client: "Langfuse",
    resolve_with: Literal["base64_data_uri"],
    max_depth: int = 10,
    content_fetch_timeout_seconds: int = 10,
) -> T:
    """Replace media reference strings in an object with base64 data URIs.

    This method recursively traverses an object (up to max_depth) looking for media reference strings
    in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
    the provided Langfuse client and replaces the reference string with a base64 data URI.

    If fetching media content fails for a reference string, a warning is logged and the reference
    string is left unchanged.

    Args:
        obj: The object to process. Can be a primitive value, array, or nested object.
            If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
        langfuse_client: Langfuse client instance used to fetch media content.
        resolve_with: The representation of the media content to replace the media reference string with.
            Currently only "base64_data_uri" is supported.
        max_depth: Optional. Default is 10. The maximum depth to traverse the object.
            Values nested deeper are returned unchanged.
        content_fetch_timeout_seconds: Optional. Default is 10. Timeout passed to the HTTP
            request that downloads each media file.

    Returns:
        The input with lists and dicts rebuilt and all media references replaced with base64 data URIs where possible.
        If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

    Example:
        obj = {
            "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
            "nested": {
                "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
            }
        }

        result = LangfuseMedia.resolve_media_references(
            obj=obj,
            langfuse_client=langfuse_client,
            resolve_with="base64_data_uri",
        )

        # Result:
        # {
        #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
        #     "nested": {
        #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
        #     }
        # }
    """

    def traverse(obj: Any, depth: int) -> Any:
        if depth > max_depth:
            return obj

        # Handle string
        if isinstance(obj, str):
            regex = r"@@@langfuseMedia:.+?@@@"
            # dict.fromkeys de-duplicates while keeping first-seen order, so a
            # reference repeated in one string is fetched only once.
            reference_string_matches = list(dict.fromkeys(re.findall(regex, obj)))
            if not reference_string_matches:
                return obj

            result = obj
            reference_string_to_media_content = {}
            # Prefer the client's own HTTP client when it is available.
            httpx_client = (
                langfuse_client._resources.httpx_client
                if langfuse_client._resources is not None
                else None
            )

            for reference_string in reference_string_matches:
                try:
                    parsed_media_reference = LangfuseMedia.parse_reference_string(
                        reference_string
                    )
                    media_data = langfuse_client.api.media.get(
                        parsed_media_reference["media_id"]
                    )
                    media_content = (
                        httpx_client.get(
                            media_data.url,
                            timeout=content_fetch_timeout_seconds,
                        )
                        if httpx_client is not None
                        else httpx.get(
                            media_data.url, timeout=content_fetch_timeout_seconds
                        )
                    )
                    media_content.raise_for_status()

                    base64_media_content = base64.b64encode(
                        media_content.content
                    ).decode()
                    base64_data_uri = f"data:{media_data.content_type};base64,{base64_media_content}"

                    reference_string_to_media_content[reference_string] = (
                        base64_data_uri
                    )
                except Exception as e:
                    logger.warning(
                        f"Error fetching media content for reference string {reference_string}: {e}"
                    )
                    # Do not replace the reference string if there's an error
                    continue

            for (
                ref_str,
                media_content_str,
            ) in reference_string_to_media_content.items():
                result = result.replace(ref_str, media_content_str)

            return result

        # Handle arrays
        if isinstance(obj, list):
            return [traverse(item, depth + 1) for item in obj]

        # Handle dictionaries
        if isinstance(obj, dict):
            return {key: traverse(value, depth + 1) for key, value in obj.items()}

        # Handle objects:
        if hasattr(obj, "__dict__"):
            return {
                key: traverse(value, depth + 1)
                for key, value in obj.__dict__.items()
            }

        return obj

    return cast(T, traverse(obj, 0))
Replace media reference strings in an object with base64 data URIs.
This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.
If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.
Arguments:
- obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
- langfuse_client: Langfuse client instance used to fetch media content.
- resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
- max_depth: Optional. Default is 10. The maximum depth to traverse the object.
Returns:
A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
Example:
obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }
result = LangfuseMedia.resolve_media_references(obj, langfuse_client)  # synchronous call, no await needed
Result:
{
"image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
"nested": {
"pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
}
}
@dataclass(frozen=True)
class LangfuseMediaReference:
    """Immutable handle to a media object stored in Langfuse.

    The reference carries a signed download URL plus descriptive metadata and
    offers helpers to check the URL expiry and to download the content as raw
    bytes, base64 text, or a ``data:`` URI.
    """

    # Identifier of the media object.
    media_id: str
    # MIME type; used verbatim as the media type of the generated data URI.
    content_type: str
    # Signed URL the content is downloaded from.
    url: str
    # ISO-8601 expiry timestamp of ``url``; a trailing "Z" is accepted.
    url_expiry: Optional[str] = None
    # NOTE(review): presumably the payload size in bytes - not read here.
    content_length: Optional[int] = None
    # NOTE(review): presumably the originating "@@@langfuseMedia:...@@@"
    # reference string - not read here.
    reference_string: Optional[str] = None

    def is_url_expired(self) -> bool:
        """Report whether the signed URL has already expired.

        Returns:
            ``True`` when ``url_expiry`` parses to a moment at or before the
            current UTC time. ``False`` when no expiry is set or the value
            cannot be parsed as an ISO-8601 timestamp.
        """
        raw_expiry = self.url_expiry
        if raw_expiry is None:
            return False

        # ``fromisoformat`` rejects a "Z" suffix on older Python versions,
        # so spell the UTC offset out explicitly.
        normalized = raw_expiry.replace("Z", "+00:00")

        try:
            expires_at = datetime.fromisoformat(normalized)
        except ValueError:
            # Unparseable expiry values are treated as "not expired".
            return False

        # Naive timestamps are interpreted as UTC.
        if expires_at.tzinfo is None:
            expires_at = expires_at.replace(tzinfo=timezone.utc)

        return datetime.now(timezone.utc) >= expires_at

    def fetch_bytes(
        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
    ) -> bytes:
        """Download the media content from the signed URL.

        Args:
            timeout: Request timeout in seconds.
            client: Optional httpx client used for the request. Supply it to
                honor custom transport settings (proxy, CA bundle, mTLS),
                especially when several Langfuse clients are configured, as
                the SDK cannot tell which client produced this reference.
                When omitted, the client returned by
                ``LangfuseResourceManager.get_singleton_httpx_client()`` is
                used; if that is ``None`` the module-level ``httpx.get`` is
                used instead.

        Returns:
            The raw response body.

        Raises:
            httpx.HTTPStatusError: If the response status indicates an error
                (via ``raise_for_status``).
        """
        # Imported lazily inside the method, as in the original code.
        from langfuse._client.resource_manager import LangfuseResourceManager

        session = client or LangfuseResourceManager.get_singleton_httpx_client()
        if session is not None:
            response = session.get(self.url, timeout=timeout)
        else:
            response = httpx.get(self.url, timeout=timeout)

        response.raise_for_status()
        return response.content

    def fetch_base64(
        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
    ) -> str:
        """Download the media and return it base64-encoded (no data URI prefix).

        See :meth:`fetch_bytes` for the meaning of ``timeout`` and ``client``.
        """
        payload = self.fetch_bytes(timeout=timeout, client=client)
        return base64.b64encode(payload).decode()

    def fetch_data_uri(
        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
    ) -> str:
        """Download the media and return it as a ``data:`` URI.

        See :meth:`fetch_bytes` for the meaning of ``timeout`` and ``client``.
        """
        encoded = self.fetch_base64(timeout=timeout, client=client)
        return f"data:{self.content_type};base64,{encoded}"
Resolved reference to media stored in Langfuse.
def is_url_expired(self) -> bool:
    """Report whether the signed URL has already expired.

    Returns:
        ``True`` when ``url_expiry`` parses to a moment at or before the
        current UTC time. ``False`` when no expiry is set or the value cannot
        be parsed as an ISO-8601 timestamp.
    """
    raw_expiry = self.url_expiry
    if raw_expiry is None:
        return False

    # ``fromisoformat`` rejects a "Z" suffix on older Python versions, so
    # spell the UTC offset out explicitly.
    normalized = raw_expiry.replace("Z", "+00:00")

    try:
        expires_at = datetime.fromisoformat(normalized)
    except ValueError:
        # Unparseable expiry values are treated as "not expired".
        return False

    # Naive timestamps are interpreted as UTC.
    if expires_at.tzinfo is None:
        expires_at = expires_at.replace(tzinfo=timezone.utc)

    return datetime.now(timezone.utc) >= expires_at
Return whether the signed URL is already expired.
def fetch_bytes(
    self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
) -> bytes:
    """Download the media content from the signed URL.

    Args:
        timeout: Request timeout in seconds.
        client: Optional httpx client used for the request. Supply it to
            honor custom transport settings (proxy, CA bundle, mTLS),
            especially when several Langfuse clients are configured, as the
            SDK cannot tell which client produced this reference. When
            omitted, the client returned by
            ``LangfuseResourceManager.get_singleton_httpx_client()`` is used;
            if that is ``None`` the module-level ``httpx.get`` is used.

    Returns:
        The raw response body.

    Raises:
        httpx.HTTPStatusError: If the response status indicates an error
            (via ``raise_for_status``).
    """
    # Imported lazily inside the method, as in the original code.
    from langfuse._client.resource_manager import LangfuseResourceManager

    session = client or LangfuseResourceManager.get_singleton_httpx_client()
    if session is not None:
        response = session.get(self.url, timeout=timeout)
    else:
        response = httpx.get(self.url, timeout=timeout)

    response.raise_for_status()
    return response.content
Fetch the media content from the signed URL.
Arguments:
- timeout: Request timeout in seconds.
- client: Optional httpx client to use for the request. Pass this to honor custom transport settings (proxy, CA bundle, mTLS) — in particular when multiple Langfuse clients are configured, since the SDK cannot otherwise tell which client produced this reference. When omitted, the single configured client is used, falling back to a default httpx client.
def fetch_base64(
    self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
) -> str:
    """Download the media and return it base64-encoded (no data URI prefix).

    Args:
        timeout: Request timeout in seconds, forwarded to ``fetch_bytes``.
        client: Optional httpx client, forwarded to ``fetch_bytes``.

    Returns:
        The base64 text of the downloaded bytes.
    """
    payload = self.fetch_bytes(timeout=timeout, client=client)
    return base64.b64encode(payload).decode()
Fetch media and return raw base64 without a data URI prefix.
See fetch_bytes() for the client argument.
def fetch_data_uri(
    self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
) -> str:
    """Download the media and return it as a ``data:`` URI.

    Args:
        timeout: Request timeout in seconds, forwarded to ``fetch_base64``.
        client: Optional httpx client, forwarded to ``fetch_base64``.

    Returns:
        A string of the form ``data:<content_type>;base64,<payload>``.
    """
    encoded = self.fetch_base64(timeout=timeout, client=client)
    return f"data:{self.content_type};base64,{encoded}"
Fetch media and return it as a data URI.
See fetch_bytes() for the client argument.
def get_client(*, public_key: Optional[str] = None) -> Langfuse:
    """Return the Langfuse client that fits the current context.

    The lookup runs while holding ``LangfuseResourceManager._lock`` and works
    on the registry ``LangfuseResourceManager._instances`` (keyed by public
    key). Multi-project support is experimental - see the Langfuse docs.

    Resolution order:
        1. An explicit ``public_key`` wins; otherwise the key stored in the
           ``_current_public_key`` context variable (if any) is used.
        2. With a key: the registered instance for that key is turned into a
           client. An unknown key produces a warning and a tracing-disabled
           client.
        3. Without a key: no registered instance -> a new default
           ``Langfuse()``; exactly one -> a client built from that instance;
           several -> a warning and a tracing-disabled client, to prevent
           cross-project data leakage.

    Args:
        public_key: Project identifier selecting which client to return.
            Required in multi-project setups.

    Returns:
        The client for the resolved key, the default client of a
        single-project setup, or a disabled client when no safe choice exists.

    Example:
        ```python
        # Single project
        client = get_client()

        # Multi-project usage
        client_a = get_client(public_key="project_a_key")
        client_b = get_client(public_key="project_b_key")

        # Multi-project setup without a key: disabled client for safety
        client = get_client()
        ```
    """
    with LangfuseResourceManager._lock:
        registry = LangfuseResourceManager._instances

        # No explicit key: fall back to the one bound to the execution context.
        if not public_key:
            public_key = _current_public_key.get(None)

        if public_key:
            # A specific project was requested; it must already be registered.
            target_instance: Optional[LangfuseResourceManager] = registry.get(
                public_key
            )
            if target_instance is not None:
                return _create_client_from_instance(target_instance, public_key)

            langfuse_logger.warning(
                f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function."
            )
            return Langfuse(
                tracing_enabled=False, public_key="fake", secret_key="fake"
            )

        instance_count = len(registry)

        if instance_count == 0:
            # Nothing initialized yet: create the default client.
            return Langfuse()

        if instance_count == 1:
            # A single client is unambiguous. Rebuild it from the credentials
            # bound to the instance, which matters when the original client
            # was configured through constructor arguments.
            only_instance = next(iter(registry.values()))
            return _create_client_from_instance(only_instance)

        # Several clients and no key: refuse to guess and disable tracing.
        langfuse_logger.warning(
            "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
        )
        return Langfuse(tracing_enabled=False, public_key="fake", secret_key="fake")
Get or create a Langfuse client instance.
Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
Behavior:
- Single project: Returns existing client or creates new one
- Multi-project: Requires public_key to return specific client
- No public_key in multi-project: Returns disabled client to prevent data leakage
The function uses a singleton pattern per public_key to conserve resources and maintain state.
Arguments:
- public_key (Optional[str]): Project identifier
- With key: Returns client for that project
- Without key: Returns single client or disabled client if multiple exist
Returns:
Langfuse: Client instance in one of three states: 1. Client for specified public_key 2. Default client for single-project setup 3. Disabled client when multiple projects exist without key
Security:
Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.
Example:
```python
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
```
def observe(
    self,
    func: Optional[F] = None,
    *,
    name: Optional[str] = None,
    as_type: Optional[ObservationTypeLiteralNoEvent] = None,
    capture_input: Optional[bool] = None,
    capture_output: Optional[bool] = None,
    transform_to_string: Optional[Callable[[Iterable], str]] = None,
) -> Union[F, Callable[[F], F]]:
    """Decorate a sync or async function so its execution is traced by Langfuse.

    Works both as ``@observe`` and as ``@observe(...)``. Coroutine functions
    (per ``asyncio.iscoroutinefunction``) are wrapped by ``_async_observe``,
    all other callables by ``_sync_observe``. Trace context is propagated via
    OpenTelemetry, so nested decorated calls show up as parent/child
    observations.

    Args:
        func (Optional[Callable]): The function to decorate. ``None`` when the
            decorator is used with parentheses (``@observe()``).
        name (Optional[str]): Custom name for the created trace or span. If
            not provided, the function name is used.
        as_type (Optional[Literal]): Observation type. Supported values:
            "generation", "span", "agent", "tool", "chain", "retriever",
            "embedding", "evaluator", "guardrail". An unsupported value is
            logged as a warning and replaced by "span". The types "generation"
            and "embedding" create a span on which additional attributes such
            as model metrics can be set.
        capture_input (Optional[bool]): Whether to record the function input.
            When ``None``, the default comes from the environment variable
            named by ``LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED``:
            capture is on unless that value is "false" or "0"
            (case-insensitive).
        capture_output (Optional[bool]): Whether to record the function
            output. Same default rule as ``capture_input``.
        transform_to_string (Optional[Callable[[Iterable], str]]): Forwarded
            unchanged to the wrapper implementation.
            NOTE(review): presumably used to render iterable/streamed outputs
            as a string - confirm against ``_sync_observe``/``_async_observe``.

    Returns:
        Callable: The wrapped function when ``func`` is given, otherwise a
        decorator that produces the wrapped function.

    Raises:
        Exception: Propagates any exceptions from the wrapped function after
            logging them in the trace.

    Example:
        ```python
        @observe()
        def process_user_request(user_id, query):
            # Traced under the name "process_user_request"
            return get_response(query)

        @observe(name="answer-generation", as_type="generation")
        async def generate_answer(query):
            response = await openai.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": query}],
            )
            return response.choices[0].message.content
        ```

    Notes:
        - The decorator preserves the original function's signature,
          docstring, and return type.
        - Special keyword arguments can be passed to control tracing:
            - langfuse_trace_id: Explicitly set the trace ID for this call
            - langfuse_parent_observation_id: Explicitly set the parent span ID
            - langfuse_public_key: Use a specific Langfuse project (when
              multiple clients exist)
        - Async functions get an async wrapper, sync functions a sync wrapper.
    """
    valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
    if as_type is not None and as_type not in valid_types:
        logger.warning(
            f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'."
        )
        as_type = "span"

    # Process-wide default for I/O capture, overridable per decorator call.
    env_setting = os.environ.get(
        LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True"
    )
    function_io_capture_enabled = env_setting.lower() not in ("false", "0")

    if capture_input is None:
        should_capture_input = function_io_capture_enabled
    else:
        should_capture_input = capture_input

    if capture_output is None:
        should_capture_output = function_io_capture_enabled
    else:
        should_capture_output = capture_output

    def decorator(func: F) -> F:
        # Pick the wrapper implementation that matches the callable's kind.
        if asyncio.iscoroutinefunction(func):
            wrap = self._async_observe
        else:
            wrap = self._sync_observe

        return wrap(
            func,
            name=name,
            as_type=as_type,
            capture_input=should_capture_input,
            capture_output=should_capture_output,
            transform_to_string=transform_to_string,
        )

    # Handle the decorator with or without parentheses:
    # - @observe   -> Python passes the function directly, so ``func`` is set
    #                 and the decorator is applied right away.
    # - @observe() -> ``func`` is None, so the decorator itself is returned
    #                 for Python to apply in the next step.
    return decorator if func is None else decorator(func)
Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
Arguments:
- func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
- name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
- as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:
Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
Example:
For general function tracing with automatic naming:

```python
@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)
```

For language model generation tracking:

```python
@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content
```

For trace context propagation between functions:

```python
@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
```
Raises:
- Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
- The decorator preserves the original function's signature, docstring, and return type.
- Proper parent-child relationships between spans are automatically maintained.
- Special keyword arguments can be passed to control tracing:
- langfuse_trace_id: Explicitly set the trace ID for this function call
- langfuse_parent_observation_id: Explicitly set the parent span ID
- langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
- For async functions, the decorator returns an async function wrapper.
- For sync functions, the decorator returns a synchronous wrapper.
def propagate_attributes(
    *,
    user_id: Optional[str] = None,
    session_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    version: Optional[str] = None,
    tags: Optional[List[str]] = None,
    trace_name: Optional[str] = None,
    environment: Optional[str] = None,
    as_baggage: bool = False,
) -> _AgnosticContextManager[Any]:
    """Propagate trace-level attributes to all spans created within this context.

    The returned context manager sets the attributes on the currently active
    span AND on every child span created while the context is active. It is
    the recommended way to set trace-level attributes such as ``user_id``,
    ``session_id``, ``environment`` and metadata dimensions consistently
    across all observations of a trace. All arguments are forwarded unchanged
    to ``_propagate_attributes``.

    **IMPORTANT**: Enter this context as early as possible in your workflow.
    Only the currently active span and spans created afterwards receive the
    attributes; pre-existing spans are NOT updated retroactively. Langfuse
    aggregation queries (e.g. total cost by user_id, filtering by session_id)
    only include observations that carry the attribute, so late propagation
    leaves earlier spans out of those aggregations.

    Args:
        user_id: User identifier for all spans in this context. Must be a
            US-ASCII string, ≤200 characters. Enables e.g. per-user
            cost/performance analysis.
        session_id: Session identifier for all spans in this context. Must be
            a US-ASCII string, ≤200 characters. Groups related traces of one
            user session (e.g. a conversation thread).
        metadata: Additional key-value metadata to propagate to all spans.
            - Keys must be US-ASCII strings
            - Values are coerced to strings
            - Coerced values must be ≤200 characters
            - Use for dimensions like internal correlating identifiers
            - AVOID: large payloads or sensitive data
        version: Version identifier for parts of your application that are
            independently versioned, e.g. agents.
        tags: List of tags to categorize the group of observations.
        trace_name: Name to assign to the trace. Must be a US-ASCII string,
            ≤200 characters.
        environment: Langfuse environment for spans created in this context.
            Must be a lowercase alphanumeric string with optional hyphens or
            underscores, ≤40 characters, and must not start with "langfuse".
            Maps to the first-class `langfuse.environment` attribute, not to
            trace metadata. Useful for request-scoped environments, e.g. one
            shared proxy serving dev, staging, qa, and prod. A propagated
            environment takes precedence over the client default configured
            via `Langfuse(environment=...)` or `LANGFUSE_TRACING_ENVIRONMENT`
            for spans created while this context is active.
        as_baggage: If True, propagates attributes using OpenTelemetry baggage
            for cross-process/service propagation. **Security warning**: when
            enabled, attribute values are added to HTTP headers on ALL
            outbound requests, including `environment` as the
            `langfuse_environment` baggage entry. Only enable if the values
            are safe to transmit via HTTP headers and you need cross-service
            tracing. Default: False.

    Returns:
        Context manager that propagates attributes to all child spans.

    Example:
        Basic usage with user and session tracking:

        ```python
        from langfuse import Langfuse, propagate_attributes

        langfuse = Langfuse()

        with langfuse.start_as_current_observation(name="user_workflow"):
            with propagate_attributes(
                user_id="user_123",
                session_id="session_abc",
                environment="production",
                metadata={"experiment": "variant_a"},
            ):
                # Spans created here inherit all four attributes
                with langfuse.start_observation(name="llm_call"):
                    ...
        ```

        Late propagation (anti-pattern):

        ```python
        with langfuse.start_as_current_observation(name="workflow"):
            # This span WON'T have user_id
            early_span = langfuse.start_observation(name="early_work")
            early_span.end()

            with propagate_attributes(user_id="user_123"):
                # Only spans created AFTER this point will have user_id
                late_span = langfuse.start_observation(name="late_work")
                late_span.end()
        ```

        Cross-service propagation with baggage (advanced):

        ```python
        # Service A - originating service
        with langfuse.start_as_current_observation(name="api_request"):
            with propagate_attributes(
                user_id="user_123",
                session_id="session_abc",
                environment="staging",
                as_baggage=True,  # Propagate via HTTP headers
            ):
                response = requests.get("https://service-b.example.com/api")

        # Service B - downstream service
        # OpenTelemetry extracts the baggage from the HTTP headers and
        # propagates the attributes to spans in Service B. A propagated
        # environment wins over Service B's locally configured one.
        ```

    Note:
        - **Validation**: Attribute values (user_id, session_id, version,
          tags, trace_name) must be strings ≤200 characters. Environment must
          match Langfuse's environment format (see above). Metadata values are
          coerced to strings before the 200 character limit is applied.
          Invalid values are dropped with a warning logged.
        - **OpenTelemetry**: This uses OpenTelemetry context propagation under
          the hood, making it compatible with other OTel-instrumented
          libraries.

    Raises:
        No exceptions are raised. Invalid values are logged as warnings and
        dropped.
    """
    # Thin public wrapper: hand every attribute to the implementation as-is.
    attributes: Dict[str, Any] = {
        "user_id": user_id,
        "session_id": session_id,
        "metadata": metadata,
        "version": version,
        "tags": tags,
        "trace_name": trace_name,
        "environment": environment,
        "as_baggage": as_baggage,
    }
    return _propagate_attributes(**attributes)
Propagate trace-level attributes to all spans created within this context.
This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, environment, and metadata dimensions that should be consistently applied across all observations in a trace.
IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.
Why this matters: Langfuse aggregation queries (e.g., total cost by user_id,
filtering by session_id) only include observations that have the attribute set.
If you call propagate_attributes late in your workflow, earlier spans won't be
included in aggregations for that attribute.
Arguments:
- user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
- session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
- metadata: Additional key-value metadata to propagate to all spans.
- Keys must be US-ASCII strings
- Values are coerced to strings
- Coerced values must be ≤200 characters
- Use for dimensions like internal correlating identifiers
- AVOID: large payloads or sensitive data
- version: Version identifier for parts of your application that are independently versioned, e.g. agents
- tags: List of tags to categorize the group of observations
- trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
- environment: Langfuse environment to assign to spans created in this context. Must be a lowercase alphanumeric string with optional hyphens or underscores, must be ≤40 characters, and must not start with "langfuse". This maps to the first-class `langfuse.environment` attribute, not to trace metadata. Use it for request-scoped environments, for example when one shared proxy handles calls from dev, staging, qa, and prod. A propagated environment takes precedence over the local client default configured via `Langfuse(environment=...)` or `LANGFUSE_TRACING_ENVIRONMENT` for spans created while this propagation context is active.
- as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. This includes `environment` as the `langfuse_environment` baggage entry. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:
Context manager that propagates attributes to all child spans.
Example:
Basic usage with user and session tracking:
```python
from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_observation(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        environment="production",
        metadata={"experiment": "variant_a"}
    ):
        # All spans created here will have user_id, session_id, environment, and metadata
        with langfuse.start_observation(name="llm_call") as llm_span:
            # This span inherits user_id, session_id, environment, and experiment metadata
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...
```
Late propagation (anti-pattern):
```python
with langfuse.start_as_current_observation(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_observation(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_observation(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span
```
Cross-service propagation with baggage (advanced):
```python
# Service A - originating service
with langfuse.start_as_current_observation(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        environment="staging",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id, session_id, and environment are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate attributes to spans in Service B. If Service B has a local
# Langfuse environment configured, the propagated environment wins for
# spans created within this context.
```
Note:
- Validation: Attribute values (user_id, session_id, version, tags, trace_name) must be strings ≤200 characters. Environment must also match Langfuse's environment format: lowercase alphanumeric with optional hyphens or underscores, must be ≤40 characters, and it must not start with "langfuse". Metadata values are coerced to strings before the 200 character limit is applied. Invalid values will be dropped with a warning logged.
- OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
- No exceptions are raised. Invalid values are logged as warnings and dropped.
class LangfuseSpan(LangfuseObservationWrapper):
    """General-purpose Langfuse observation of type ``"span"``.

    Use it to trace any operation in your application. Where one of the more
    specific observation types fits, prefer that type for better
    observability and insights.
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ):
        """Create a span observation around an existing OpenTelemetry span.

        Args:
            otel_span: The OpenTelemetry span to wrap
            langfuse_client: Reference to the parent Langfuse client
            input: Input data for the span (any JSON-serializable object)
            output: Output data from the span (any JSON-serializable object)
            metadata: Additional metadata to associate with the span
            environment: The tracing environment
            release: Release identifier for the application
            version: Version identifier for the code or component
            level: Importance level of the span (info, warning, error)
            status_message: Optional status message for the span
        """
        # The observation type is fixed here; every other argument is
        # forwarded unchanged to the base wrapper.
        super().__init__(
            as_type="span",
            otel_span=otel_span,
            langfuse_client=langfuse_client,
            input=input,
            output=output,
            metadata=metadata,
            environment=environment,
            release=release,
            version=version,
            level=level,
            status_message=status_message,
        )
Standard span implementation for general operations in Langfuse.
This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.
def __init__(
    self,
    *,
    otel_span: otel_trace_api.Span,
    langfuse_client: "Langfuse",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    environment: Optional[str] = None,
    release: Optional[str] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
):
    """Create a span observation around an existing OpenTelemetry span.

    Args:
        otel_span: The OpenTelemetry span to wrap
        langfuse_client: Reference to the parent Langfuse client
        input: Input data for the span (any JSON-serializable object)
        output: Output data from the span (any JSON-serializable object)
        metadata: Additional metadata to associate with the span
        environment: The tracing environment
        release: Release identifier for the application
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span
    """
    # The observation type is fixed here; every other argument is
    # forwarded unchanged to the base wrapper.
    super().__init__(
        as_type="span",
        otel_span=otel_span,
        langfuse_client=langfuse_client,
        input=input,
        output=output,
        metadata=metadata,
        environment=environment,
        release=release,
        version=version,
        level=level,
        status_message=status_message,
    )
Initialize a new LangfuseSpan.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the span (any JSON-serializable object)
- output: Output data from the span (any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
class LangfuseGeneration(LangfuseObservationWrapper):
    """Langfuse observation of type ``"generation"`` for AI/LLM model calls.

    On top of the fields shared by all observations, the constructor accepts
    model details, token usage, cost information and an associated prompt.
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ):
        """Create a generation observation around an existing OpenTelemetry span.

        Args:
            otel_span: The OpenTelemetry span to wrap
            langfuse_client: Reference to the parent Langfuse client
            input: Input data for the generation (e.g., prompts)
            output: Output from the generation (e.g., completions)
            metadata: Additional metadata to associate with the generation
            environment: The tracing environment
            release: Release identifier for the application
            version: Version identifier for the model or component
            level: Importance level of the generation (info, warning, error)
            status_message: Optional status message for the generation
            completion_start_time: When the model started generating the response
            model: Name/identifier of the AI model used (e.g., "gpt-4")
            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
            cost_details: Cost information for the model call
            prompt: Associated prompt template from Langfuse prompt management
        """
        # Generation-only fields, grouped so they are easy to tell apart from
        # the fields every observation type shares.
        generation_fields = {
            "completion_start_time": completion_start_time,
            "model": model,
            "model_parameters": model_parameters,
            "usage_details": usage_details,
            "cost_details": cost_details,
            "prompt": prompt,
        }
        super().__init__(
            as_type="generation",
            otel_span=otel_span,
            langfuse_client=langfuse_client,
            input=input,
            output=output,
            metadata=metadata,
            environment=environment,
            release=release,
            version=version,
            level=level,
            status_message=status_message,
            **generation_fields,
        )
Specialized span implementation for AI model generations in Langfuse.
This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.
def __init__(
    self,
    *,
    otel_span: otel_trace_api.Span,
    langfuse_client: "Langfuse",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    environment: Optional[str] = None,
    release: Optional[str] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
):
    """Create a generation observation around an existing OpenTelemetry span.

    Args:
        otel_span: The OpenTelemetry span to wrap
        langfuse_client: Reference to the parent Langfuse client
        input: Input data for the generation (e.g., prompts)
        output: Output from the generation (e.g., completions)
        metadata: Additional metadata to associate with the generation
        environment: The tracing environment
        release: Release identifier for the application
        version: Version identifier for the model or component
        level: Importance level of the generation (info, warning, error)
        status_message: Optional status message for the generation
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management
    """
    # Generation-only fields, grouped so they are easy to tell apart from
    # the fields every observation type shares.
    generation_fields = {
        "completion_start_time": completion_start_time,
        "model": model,
        "model_parameters": model_parameters,
        "usage_details": usage_details,
        "cost_details": cost_details,
        "prompt": prompt,
    }
    super().__init__(
        as_type="generation",
        otel_span=otel_span,
        langfuse_client=langfuse_client,
        input=input,
        output=output,
        metadata=metadata,
        environment=environment,
        release=release,
        version=version,
        level=level,
        status_message=status_message,
        **generation_fields,
    )
Initialize a new LangfuseGeneration span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the generation (e.g., prompts)
- output: Output from the generation (e.g., completions)
- metadata: Additional metadata to associate with the generation
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
class LangfuseEvent(LangfuseObservationWrapper):
    """Langfuse observation of type ``"event"``.

    Events cannot be changed after creation, so ``update`` is overridden to
    log a warning and leave the instance untouched.
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ):
        """Create an event observation around an existing OpenTelemetry span.

        Args:
            otel_span: The OpenTelemetry span to wrap
            langfuse_client: Reference to the parent Langfuse client
            input: Input data for the event
            output: Output from the event
            metadata: Additional metadata to associate with the event
            environment: The tracing environment
            release: Release identifier for the application
            version: Version identifier for the code or component
            level: Importance level of the event (info, warning, error)
            status_message: Optional status message for the event
        """
        # The observation type is fixed here; every other argument is
        # forwarded unchanged to the base wrapper.
        super().__init__(
            as_type="event",
            otel_span=otel_span,
            langfuse_client=langfuse_client,
            input=input,
            output=output,
            metadata=metadata,
            environment=environment,
            release=release,
            version=version,
            level=level,
            status_message=status_message,
        )

    def update(
        self,
        *,
        name: Optional[str] = None,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        **kwargs: Any,
    ) -> "LangfuseEvent":
        """Refuse the update: events cannot be modified after creation.

        Every argument is accepted and ignored; the only effect is a logged
        warning.

        Returns:
            self: The unchanged LangfuseEvent instance
        """
        warning_text = "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
        langfuse_logger.warning(warning_text)
        return self
Specialized span implementation for Langfuse Events.
def __init__(
    self,
    *,
    otel_span: otel_trace_api.Span,
    langfuse_client: "Langfuse",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    environment: Optional[str] = None,
    release: Optional[str] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
):
    """Create an event observation around an existing OpenTelemetry span.

    Args:
        otel_span: The OpenTelemetry span to wrap
        langfuse_client: Reference to the parent Langfuse client
        input: Input data for the event
        output: Output from the event
        metadata: Additional metadata to associate with the event
        environment: The tracing environment
        release: Release identifier for the application
        version: Version identifier for the code or component
        level: Importance level of the event (info, warning, error)
        status_message: Optional status message for the event
    """
    # The observation type is fixed here; every other argument is
    # forwarded unchanged to the base wrapper.
    super().__init__(
        as_type="event",
        otel_span=otel_span,
        langfuse_client=langfuse_client,
        input=input,
        output=output,
        metadata=metadata,
        environment=environment,
        release=release,
        version=version,
        level=level,
        status_message=status_message,
    )
Initialize a new LangfuseEvent span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the event
- output: Output from the event
- metadata: Additional metadata to associate with the event
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the model or component
- level: Importance level of the event (info, warning, error)
- status_message: Optional status message for the event
def update(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
    **kwargs: Any,
) -> "LangfuseEvent":
    """Refuse the update: events cannot be modified after creation.

    Every argument is accepted and ignored; the only effect is a logged
    warning.

    Returns:
        self: The unchanged LangfuseEvent instance
    """
    warning_text = "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
    langfuse_logger.warning(warning_text)
    return self
Update is not allowed for LangfuseEvent because events cannot be updated.
This method logs a warning and returns self without making changes.
Returns:
self: Returns the unchanged LangfuseEvent instance
class LangfuseOtelSpanAttributes:
    """Namespace of string constants naming Langfuse span attribute keys.

    NOTE(review): judging by the class name these keys are set on
    OpenTelemetry spans; confirm against the call sites.
    """

    # Langfuse-Trace attributes
    # (user and session IDs use the un-prefixed "user.id" / "session.id" keys)
    TRACE_NAME = "langfuse.trace.name"
    TRACE_USER_ID = "user.id"
    TRACE_SESSION_ID = "session.id"
    TRACE_TAGS = "langfuse.trace.tags"
    TRACE_PUBLIC = "langfuse.trace.public"
    TRACE_METADATA = "langfuse.trace.metadata"
    TRACE_INPUT = "langfuse.trace.input"
    TRACE_OUTPUT = "langfuse.trace.output"

    # Langfuse-observation attributes
    OBSERVATION_TYPE = "langfuse.observation.type"
    OBSERVATION_METADATA = "langfuse.observation.metadata"
    OBSERVATION_LEVEL = "langfuse.observation.level"
    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
    OBSERVATION_INPUT = "langfuse.observation.input"
    OBSERVATION_OUTPUT = "langfuse.observation.output"

    # Langfuse-observation of type Generation attributes
    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
    OBSERVATION_MODEL = "langfuse.observation.model.name"
    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"

    # General
    ENVIRONMENT = "langfuse.environment"
    RELEASE = "langfuse.release"
    VERSION = "langfuse.version"

    # Internal
    AS_ROOT = "langfuse.internal.as_root"
    IS_APP_ROOT = "langfuse.internal.is_app_root"

    # Experiments
    EXPERIMENT_ID = "langfuse.experiment.id"
    EXPERIMENT_NAME = "langfuse.experiment.name"
    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
class LangfuseAgent(LangfuseObservationWrapper):
    """Observation type for agents: reasoning blocks that act on tools using LLM guidance."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the observation with ``as_type`` pinned to ``"agent"``.

        Any ``as_type`` supplied by the caller is overwritten; all other
        keyword arguments are forwarded unchanged to the base wrapper.
        """
        super().__init__(**{**kwargs, "as_type": "agent"})
Agent observation for reasoning blocks that act on tools using LLM guidance.
class LangfuseTool(LangfuseObservationWrapper):
    """Observation type for external tool calls, e.g., calling a weather API."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the observation with ``as_type`` pinned to ``"tool"``.

        Any ``as_type`` supplied by the caller is overwritten; all other
        keyword arguments are forwarded unchanged to the base wrapper.
        """
        super().__init__(**{**kwargs, "as_type": "tool"})
Tool observation representing external tool calls, e.g., calling a weather API.
class LangfuseChain(LangfuseObservationWrapper):
    """Observation type for chains that connect LLM application steps, e.g. passing context from retriever to LLM."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the observation with ``as_type`` pinned to ``"chain"``.

        Any ``as_type`` supplied by the caller is overwritten; all other
        keyword arguments are forwarded unchanged to the base wrapper.
        """
        super().__init__(**{**kwargs, "as_type": "chain"})
Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.
class LangfuseEmbedding(LangfuseObservationWrapper):
    """Observation type for LLM embedding calls, typically used before retrieval."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the observation with ``as_type`` pinned to ``"embedding"``.

        Any ``as_type`` supplied by the caller is overwritten; all other
        keyword arguments are forwarded unchanged to the base wrapper.
        """
        super().__init__(**{**kwargs, "as_type": "embedding"})
Embedding observation for LLM embedding calls, typically used before retrieval.
class LangfuseEvaluator(LangfuseObservationWrapper):
    """Observation type for evaluators that assess relevance, correctness, or helpfulness of LLM outputs."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the observation with ``as_type`` pinned to ``"evaluator"``.

        Any ``as_type`` supplied by the caller is overwritten; all other
        keyword arguments are forwarded unchanged to the base wrapper.
        """
        super().__init__(**{**kwargs, "as_type": "evaluator"})
Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.
class LangfuseRetriever(LangfuseObservationWrapper):
    """Observation type for data retrieval steps, e.g. vector store or database queries."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the observation with ``as_type`` pinned to ``"retriever"``.

        Any ``as_type`` supplied by the caller is overwritten; all other
        keyword arguments are forwarded unchanged to the base wrapper.
        """
        super().__init__(**{**kwargs, "as_type": "retriever"})
Retriever observation for data retrieval steps, e.g. vector store or database queries.
class LangfuseGuardrail(LangfuseObservationWrapper):
    """Observation type for guardrails that protect e.g. against jailbreaks or offensive content."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the observation with ``as_type`` pinned to ``"guardrail"``.

        Any ``as_type`` supplied by the caller is overwritten; all other
        keyword arguments are forwarded unchanged to the base wrapper.
        """
        super().__init__(**{**kwargs, "as_type": "guardrail"})
Guardrail observation for protection e.g. against jailbreaks or offensive content.
class Evaluation:
    """Result of evaluating one experiment item or an entire experiment run.

    Evaluator functions return instances of this class. Construction is
    keyword-only.

    Attributes:
        name: Unique identifier for the evaluation metric. Should be descriptive
            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
            Used for aggregation and comparison across experiment runs.
        value: The evaluation score or result. Can be:
            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
            - String: For categorical results like "positive", "negative", "neutral"
            - Boolean: For binary assessments like "passes_safety_check"
        comment: Optional human-readable explanation of the evaluation result.
            Useful for providing context, explaining scoring rationale, or noting
            special conditions. Displayed in Langfuse UI for interpretability.
        metadata: Optional structured metadata about the evaluation process.
            Can include confidence scores, intermediate calculations, model versions,
            or any other relevant technical details.
        data_type: Optional score data type. Required if value is not NUMERIC.
            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
        config_id: Optional Langfuse score config ID.

    Examples:
        Basic accuracy evaluation:
        ```python
        from langfuse import Evaluation

        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
            if not expected_output:
                return Evaluation(name="accuracy", value=0, comment="No expected output")

            is_correct = output.strip().lower() == expected_output.strip().lower()
            return Evaluation(
                name="accuracy",
                value=1.0 if is_correct else 0.0,
                comment="Correct answer" if is_correct else "Incorrect answer"
            )
        ```

        Multi-metric evaluator:
        ```python
        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
            return [
                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
                Evaluation(
                    name="quality",
                    value=0.85,
                    comment="High quality response",
                    metadata={"confidence": 0.92, "model": "gpt-4"}
                )
            ]
        ```

        Categorical evaluation:
        ```python
        def sentiment_evaluator(*, input, output, **kwargs):
            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
            return Evaluation(
                name="sentiment",
                value=sentiment,
                comment=f"Response expresses {sentiment} sentiment",
                data_type="CATEGORICAL"
            )
        ```

        Failed evaluation with error handling:
        ```python
        def external_api_evaluator(*, input, output, **kwargs):
            try:
                score = external_api.evaluate(output)
                return Evaluation(name="external_score", value=score)
            except Exception as e:
                return Evaluation(
                    name="external_score",
                    value=0,
                    comment=f"API unavailable: {e}",
                    metadata={"error": str(e), "retry_count": 3}
                )
        ```

    Note:
        All arguments must be passed as keywords. Positional arguments are not allowed
        to ensure code clarity and prevent errors from argument reordering.
    """

    def __init__(
        self,
        *,
        name: str,
        value: Union[int, float, str, bool],
        comment: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        data_type: Optional[ExperimentScoreType] = None,
        config_id: Optional[str] = None,
    ):
        """Store the evaluation fields on the instance without validation.

        Args:
            name: Unique identifier for the evaluation metric.
            value: The evaluation score or result.
            comment: Optional human-readable explanation of the result.
            metadata: Optional structured metadata about the evaluation process.
            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
            config_id: Optional Langfuse score config ID.

        Note:
            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
        """
        # Required fields first, then the optional ones in signature order.
        self.name, self.value = name, value
        self.comment, self.metadata = comment, metadata
        self.data_type, self.config_id = data_type, config_id
Represents an evaluation result for an experiment item or an entire experiment run.
This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.
Attributes:
- name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
- value: The evaluation score or result. Can be:
- Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
- String: For categorical results like "positive", "negative", "neutral"
- Boolean: For binary assessments like "passes_safety_check"
- comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
- metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
- data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
- config_id: Optional Langfuse score config ID.
Examples:
Basic accuracy evaluation:
```python
from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=0, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )
```
Multi-metric evaluator:
```python
def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]
```
Categorical evaluation:
```python
def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )
```
Failed evaluation with error handling:
```python
def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=0,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
```
Note:
All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.
def __init__(
    self,
    *,
    name: str,
    value: Union[int, float, str, bool],
    comment: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    data_type: Optional[ExperimentScoreType] = None,
    config_id: Optional[str] = None,
) -> None:
    """Initialize an Evaluation with the provided data.

    Every argument is stored unchanged on the instance under an attribute
    of the same name; no validation or conversion is performed here.

    Args:
        name: Unique identifier for the evaluation metric.
        value: The evaluation score or result.
        comment: Optional human-readable explanation of the result.
        metadata: Optional structured metadata about the evaluation process.
        data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
        config_id: Optional Langfuse score config ID.

    Note:
        All arguments must be provided as keywords. Positional arguments will raise a TypeError.
    """
    self.name = name
    self.value = value
    self.comment = comment
    self.metadata = metadata
    self.data_type = data_type
    self.config_id = config_id
Initialize an Evaluation with the provided data.
Arguments:
- name: Unique identifier for the evaluation metric.
- value: The evaluation score or result.
- comment: Optional human-readable explanation of the result.
- metadata: Optional structured metadata about the evaluation process.
- data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
- config_id: Optional Langfuse score config ID.
Note:
All arguments must be provided as keywords. Positional arguments will raise a TypeError.
class EvaluatorInputs:
    """Input data structure for evaluators, returned by mapper functions.

    This class provides a strongly-typed container for transforming API response
    objects (traces, observations) into the standardized format expected
    by evaluator functions. It ensures consistent access to input, output, expected
    output, and metadata regardless of the source entity type.

    Attributes:
        input: The input data that was provided to generate the output being evaluated.
            For traces, this might be the initial prompt or request. For observations,
            this could be the span's input. The exact meaning depends on your use case.
        output: The actual output that was produced and needs to be evaluated.
            For traces, this is typically the final response. For observations,
            this might be the generation output or span result.
        expected_output: Optional ground truth or expected result for comparison.
            Used by evaluators to assess correctness. May be None if no ground truth
            is available for the entity being evaluated.
        metadata: Optional structured metadata providing additional context for evaluation.
            Can include information about the entity, execution context, user attributes,
            or any other relevant data that evaluators might use.

    Examples:
        Simple mapper for traces (mappers receive the entity as the
        keyword argument ``item``):
        ```python
        from langfuse import EvaluatorInputs

        def trace_mapper(*, item, **kwargs):
            return EvaluatorInputs(
                input=item.input,
                output=item.output,
                expected_output=None,  # No ground truth available
                metadata={"user_id": item.user_id, "tags": item.tags}
            )
        ```

        Mapper for observations extracting specific fields:
        ```python
        def observation_mapper(*, item, **kwargs):
            # Extract input/output from observation's data
            input_data = item.input if hasattr(item, 'input') else None
            output_data = item.output if hasattr(item, 'output') else None

            return EvaluatorInputs(
                input=input_data,
                output=output_data,
                expected_output=None,
                metadata={
                    "observation_type": item.type,
                    "model": item.model,
                    "latency_ms": item.end_time - item.start_time
                }
            )
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        input: Any,
        output: Any,
        expected_output: Any = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Initialize EvaluatorInputs with the provided data.

        The values are stored as-is on the instance; nothing is copied or validated.

        Args:
            input: The input data for evaluation.
            output: The output data to be evaluated.
            expected_output: Optional ground truth for comparison.
            metadata: Optional additional context for evaluation.

        Note:
            All arguments must be provided as keywords.
        """
        self.input = input
        self.output = output
        self.expected_output = expected_output
        self.metadata = metadata
Input data structure for evaluators, returned by mapper functions.
This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.
Attributes:
- input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
- output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
- expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
- metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:
Simple mapper for traces:
```python
from langfuse import EvaluatorInputs

def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,  # No ground truth available
        metadata={"user_id": trace.user_id, "tags": trace.tags}
    )
```
Mapper for observations extracting specific fields:
```python
def observation_mapper(observation):
    # Extract input/output from observation's data
    input_data = observation.input if hasattr(observation, 'input') else None
    output_data = observation.output if hasattr(observation, 'output') else None

    return EvaluatorInputs(
        input=input_data,
        output=output_data,
        expected_output=None,
        metadata={
            "observation_type": observation.type,
            "model": observation.model,
            "latency_ms": observation.end_time - observation.start_time
        }
    )
```
Note:
All arguments must be passed as keywords when instantiating this class.
def __init__(
    self,
    *,
    input: Any,
    output: Any,
    expected_output: Any = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> None:
    """Initialize EvaluatorInputs with the provided data.

    The values are stored as-is on the instance; nothing is copied or validated.

    Args:
        input: The input data for evaluation.
        output: The output data to be evaluated.
        expected_output: Optional ground truth for comparison.
        metadata: Optional additional context for evaluation.

    Note:
        All arguments must be provided as keywords.
    """
    self.input = input
    self.output = output
    self.expected_output = expected_output
    self.metadata = metadata
Initialize EvaluatorInputs with the provided data.
Arguments:
- input: The input data for evaluation.
- output: The output data to be evaluated.
- expected_output: Optional ground truth for comparison.
- metadata: Optional additional context for evaluation.
Note:
All arguments must be provided as keywords.
class MapperFunction(Protocol):
    """Protocol defining the interface for mapper functions in batch evaluation.

    Mapper functions transform API response objects (traces or observations)
    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
    allows you to define how to extract and structure evaluation data from different
    entity types.

    Mapper functions must:
    - Accept the entity (trace or observation) as the keyword argument ``item``
    - Return an EvaluatorInputs instance with input, output, expected_output, metadata
    - Can be either synchronous or asynchronous
    - Should handle missing or malformed data gracefully
    """

    def __call__(
        self,
        *,
        item: Union["TraceWithFullDetails", "ObservationsView"],
        **kwargs: Dict[str, Any],
    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
        """Transform an API response object into evaluator inputs.

        This method defines how to extract evaluation-relevant data from the raw
        API response object. The implementation should map entity-specific fields
        to the standardized input/output/expected_output/metadata structure.

        Args:
            item: The API response object to transform. The type depends on the scope:
                - TraceWithFullDetails: When evaluating traces
                - ObservationsView: When evaluating observations
            **kwargs: Additional keyword arguments; implementations should
                accept and may ignore them.

        Returns:
            EvaluatorInputs: A structured container with:
                - input: The input data that generated the output
                - output: The output to be evaluated
                - expected_output: Optional ground truth for comparison
                - metadata: Optional additional context

            Can return either a direct EvaluatorInputs instance or an awaitable
            (for async mappers that need to fetch additional data).

        Examples:
            Basic trace mapper:
            ```python
            def map_trace(*, item, **kwargs):
                return EvaluatorInputs(
                    input=item.input,
                    output=item.output,
                    expected_output=None,
                    metadata={"trace_id": item.id, "user": item.user_id}
                )
            ```

            Observation mapper with conditional logic:
            ```python
            def map_observation(*, item, **kwargs):
                # Extract fields based on observation type
                if item.type == "GENERATION":
                    input_data = item.input
                    output_data = item.output
                else:
                    # For other types, use different fields
                    input_data = item.metadata.get("input")
                    output_data = item.metadata.get("output")

                return EvaluatorInputs(
                    input=input_data,
                    output=output_data,
                    expected_output=None,
                    metadata={"obs_id": item.id, "type": item.type}
                )
            ```

            Async mapper (if additional processing needed):
            ```python
            async def map_trace_async(*, item, **kwargs):
                # Could do async processing here if needed
                processed_output = await some_async_transformation(item.output)

                return EvaluatorInputs(
                    input=item.input,
                    output=processed_output,
                    expected_output=None,
                    metadata={"trace_id": item.id}
                )
            ```
        """
        ...
Protocol defining the interface for mapper functions in batch evaluation.
Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.
Mapper functions must:
- Accept a single item parameter (trace, observation)
- Return an EvaluatorInputs instance with input, output, expected_output, metadata
- Can be either synchronous or asynchronous
- Should handle missing or malformed data gracefully
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` that blocks protocol instantiation or installs the real one.

    Raises ``TypeError`` when the instance's class has a truthy
    ``_is_protocol``. Otherwise, if the class still uses this placeholder
    as its ``__init__``, the first ``__init__`` found in the MRO that is
    not this placeholder is installed on the class and then invoked with
    the given arguments.
    """
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    cls.__init__(self, *args, **kwargs)
class CompositeEvaluatorFunction(Protocol):
    """Protocol defining the interface for composite evaluator functions.

    Composite evaluators create aggregate scores from multiple item-level evaluations.
    This is commonly used to compute weighted averages, combined metrics, or other
    composite assessments based on individual evaluation results.

    Composite evaluators:
    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
      plus the list of evaluations
    - Return either a single Evaluation, a list of Evaluations, or a dict
    - Can be either synchronous or asynchronous
    - Have access to both raw item data and evaluation results
    """

    def __call__(
        self,
        *,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        expected_output: Optional[Any] = None,
        metadata: Optional[Dict[str, Any]] = None,
        evaluations: List[Evaluation],
        **kwargs: Dict[str, Any],
    ) -> Union[
        Evaluation,
        List[Evaluation],
        Dict[str, Any],
        Awaitable[Evaluation],
        Awaitable[List[Evaluation]],
        Awaitable[Dict[str, Any]],
    ]:
        r"""Create a composite evaluation from item-level evaluation results.

        This method combines multiple evaluation scores into a single composite metric.
        Common use cases include weighted averages, pass/fail decisions based on multiple
        criteria, or custom scoring logic that considers multiple dimensions.

        Args:
            input: The input data that was provided to the system being evaluated.
            output: The output generated by the system being evaluated.
            expected_output: The expected/reference output for comparison (if available).
            metadata: Additional metadata about the evaluation context.
            evaluations: List of evaluation results from item-level evaluators.
                Each evaluation contains name, value, comment, and metadata.

        Returns:
            Can return any of:
            - Evaluation: A single composite evaluation result
            - List[Evaluation]: Multiple composite evaluations
            - Dict: A dict that will be converted to an Evaluation
                - name: Identifier for the composite metric (e.g., "composite_score")
                - value: The computed composite value
                - comment: Optional explanation of how the score was computed
                - metadata: Optional details about the composition logic

            Can return either a direct Evaluation instance or an awaitable
            (for async composite evaluators).

        Examples:
            Simple weighted average:
            ```python
            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
                weights = {
                    "accuracy": 0.5,
                    "relevance": 0.3,
                    "safety": 0.2
                }

                total_score = 0.0
                total_weight = 0.0

                for evaluation in evaluations:
                    if evaluation.name in weights and isinstance(evaluation.value, (int, float)):
                        total_score += evaluation.value * weights[evaluation.name]
                        total_weight += weights[evaluation.name]

                final_score = total_score / total_weight if total_weight > 0 else 0.0

                return Evaluation(
                    name="composite_score",
                    value=final_score,
                    comment=f"Weighted average of {len(evaluations)} metrics"
                )
            ```

            Pass/fail composite based on thresholds:
            ```python
            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
                # Must pass all criteria
                thresholds = {
                    "accuracy": 0.7,
                    "safety": 0.9,
                    "relevance": 0.6
                }

                passes = True
                failing_metrics = []

                for metric, threshold in thresholds.items():
                    eval_result = next((e for e in evaluations if e.name == metric), None)
                    if eval_result and isinstance(eval_result.value, (int, float)):
                        if eval_result.value < threshold:
                            passes = False
                            failing_metrics.append(metric)

                return Evaluation(
                    name="passes_all_checks",
                    value=passes,
                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
                    data_type="BOOLEAN"
                )
            ```

            Async composite with external scoring:
            ```python
            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
                # Use LLM to synthesize multiple evaluation results
                eval_summary = "\n".join(
                    f"- {e.name}: {e.value}" for e in evaluations
                )

                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
                prompt += f"For the output: {output}\n"
                prompt += "Provide an overall quality score from 0-1."

                response = await openai.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}]
                )

                score = float(response.choices[0].message.content.strip())

                return Evaluation(
                    name="llm_composite_score",
                    value=score,
                    comment="LLM-synthesized composite score"
                )
            ```

            Context-aware composite:
            ```python
            def context_composite(*, input, output, expected_output, metadata, evaluations):
                # Adjust weighting based on metadata
                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}

                # If metadata indicates high importance, prioritize accuracy
                if metadata and metadata.get('importance') == 'high':
                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
                else:
                    weights = base_weights

                total = sum(
                    e.value * weights.get(e.name, 0)
                    for e in evaluations
                    if isinstance(e.value, (int, float))
                )

                return Evaluation(
                    name="weighted_composite",
                    value=total,
                    comment="Context-aware weighted composite"
                )
            ```
        """
        ...
Protocol defining the interface for composite evaluator functions.
Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.
Composite evaluators:
- Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
- Return either a single Evaluation, a list of Evaluations, or a dict
- Can be either synchronous or asynchronous
- Have access to both raw item data and evaluation results
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` that blocks protocol instantiation or installs the real one.

    Raises ``TypeError`` when the instance's class has a truthy
    ``_is_protocol``. Otherwise, if the class still uses this placeholder
    as its ``__init__``, the first ``__init__`` found in the MRO that is
    not this placeholder is installed on the class and then invoked with
    the given arguments.
    """
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    cls.__init__(self, *args, **kwargs)
class EvaluatorStats:
    """Statistics for a single evaluator's performance during batch evaluation.

    This class tracks detailed metrics about how a specific evaluator performed
    across all items in a batch evaluation run. It helps identify evaluator issues,
    understand reliability, and optimize evaluation pipelines.

    Attributes:
        name: The name of the evaluator function (extracted from __name__).
        total_runs: Total number of times the evaluator was invoked.
        successful_runs: Number of times the evaluator completed successfully.
        failed_runs: Number of times the evaluator raised an exception or failed.
        total_scores_created: Total number of evaluation scores created by this evaluator.
            Can be higher than successful_runs if the evaluator returns multiple scores.

    Examples:
        Accessing evaluator stats from batch evaluation result:
        ```python
        result = client.run_batched_evaluation(...)

        for stats in result.evaluator_stats:
            print(f"Evaluator: {stats.name}")
            # total_runs defaults to 0, so guard the division
            if stats.total_runs:
                print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
            print(f"  Scores created: {stats.total_scores_created}")

            if stats.failed_runs > 0:
                print(f"  ⚠️ Failed {stats.failed_runs} times")
        ```

        Identifying problematic evaluators:
        ```python
        result = client.run_batched_evaluation(...)

        # Find evaluators with high failure rates
        for stats in result.evaluator_stats:
            if not stats.total_runs:
                continue  # evaluator never ran; no rate to compute
            failure_rate = stats.failed_runs / stats.total_runs
            if failure_rate > 0.1:  # More than 10% failures
                print(f"⚠️ {stats.name} has {failure_rate:.1%} failure rate")
                print(f"   Consider debugging or removing this evaluator")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        name: str,
        total_runs: int = 0,
        successful_runs: int = 0,
        failed_runs: int = 0,
        total_scores_created: int = 0,
    ) -> None:
        """Initialize EvaluatorStats with the provided metrics.

        All counters default to 0; the values are stored unchanged.

        Args:
            name: The evaluator function name.
            total_runs: Total number of evaluator invocations.
            successful_runs: Number of successful completions.
            failed_runs: Number of failures.
            total_scores_created: Total scores created by this evaluator.

        Note:
            All arguments must be provided as keywords.
        """
        self.name = name
        self.total_runs = total_runs
        self.successful_runs = successful_runs
        self.failed_runs = failed_runs
        self.total_scores_created = total_scores_created
Statistics for a single evaluator's performance during batch evaluation.
This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.
Attributes:
- name: The name of the evaluator function (extracted from __name__).
- total_runs: Total number of times the evaluator was invoked.
- successful_runs: Number of times the evaluator completed successfully.
- failed_runs: Number of times the evaluator raised an exception or failed.
- total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:
Accessing evaluator stats from batch evaluation result:
```python
result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    print(f"Evaluator: {stats.name}")
    print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️ Failed {stats.failed_runs} times")
```
Identifying problematic evaluators:
```python
result = client.run_batched_evaluation(...)

# Find evaluators with high failure rates
for stats in result.evaluator_stats:
    failure_rate = stats.failed_runs / stats.total_runs
    if failure_rate > 0.1:  # More than 10% failures
        print(f"⚠️ {stats.name} has {failure_rate:.1%} failure rate")
        print(f"   Consider debugging or removing this evaluator")
```
Note:
All arguments must be passed as keywords when instantiating this class.
def __init__(
    self,
    *,
    name: str,
    total_runs: int = 0,
    successful_runs: int = 0,
    failed_runs: int = 0,
    total_scores_created: int = 0,
) -> None:
    """Initialize EvaluatorStats with the provided metrics.

    All counters default to 0; the values are stored unchanged.

    Args:
        name: The evaluator function name.
        total_runs: Total number of evaluator invocations.
        successful_runs: Number of successful completions.
        failed_runs: Number of failures.
        total_scores_created: Total scores created by this evaluator.

    Note:
        All arguments must be provided as keywords.
    """
    self.name = name
    self.total_runs = total_runs
    self.successful_runs = successful_runs
    self.failed_runs = failed_runs
    self.total_scores_created = total_scores_created
Initialize EvaluatorStats with the provided metrics.
Arguments:
- name: The evaluator function name.
- total_runs: Total number of evaluator invocations.
- successful_runs: Number of successful completions.
- failed_runs: Number of failures.
- total_scores_created: Total scores created by this evaluator.
Note:
All arguments must be provided as keywords.
class BatchEvaluationResumeToken:
    """Token for resuming a failed batch evaluation run.

    This class encapsulates all the information needed to resume a batch evaluation
    that was interrupted or failed partway through. It uses timestamp-based filtering
    to avoid re-processing items that were already evaluated, even if the underlying
    dataset changed between runs.

    Attributes:
        scope: The type of items being evaluated ("traces", "observations").
        filter: The original JSON filter string used to query items.
        last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item.
            Used to construct a filter that only fetches items after this timestamp.
        last_processed_id: The ID of the last successfully processed item, for reference.
        items_processed: Count of items successfully processed before interruption.

    Examples:
        Resuming a failed batch evaluation:
        ```python
        import json

        # Initial run that stops partway through
        result = client.run_batched_evaluation(
            scope="traces",
            mapper=my_mapper,
            evaluators=[evaluator1, evaluator2],
            filter='{"tags": ["production"]}',
            max_items=10000
        )

        # Save the resume token if the run did not complete
        if not result.completed and result.resume_token:
            # Store resume token for later (e.g., in a file or database)
            with open("resume_token.json", "w") as f:
                json.dump({
                    "scope": result.resume_token.scope,
                    "filter": result.resume_token.filter,
                    "last_timestamp": result.resume_token.last_processed_timestamp,
                    "last_id": result.resume_token.last_processed_id,
                    "items_done": result.resume_token.items_processed
                }, f)

        # Later, resume from where it left off
        with open("resume_token.json") as f:
            token_data = json.load(f)

        resume_token = BatchEvaluationResumeToken(
            scope=token_data["scope"],
            filter=token_data["filter"],
            last_processed_timestamp=token_data["last_timestamp"],
            last_processed_id=token_data["last_id"],
            items_processed=token_data["items_done"]
        )

        # Resume the evaluation
        result = client.run_batched_evaluation(
            scope="traces",
            mapper=my_mapper,
            evaluators=[evaluator1, evaluator2],
            resume_from=resume_token
        )

        print(f"Processed {result.total_items_processed} additional items")
        ```

        Handling partial completion:
        ```python
        result = client.run_batched_evaluation(...)

        # resume_token is None when the run completed, so check it first
        if not result.completed and result.resume_token:
            print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
            print(f"Last item: {result.resume_token.last_processed_id}")
            print(f"Resume from: {result.resume_token.last_processed_timestamp}")

            # Optionally retry automatically
            print("Retrying...")
            result = client.run_batched_evaluation(
                scope=result.resume_token.scope,
                mapper=my_mapper,
                evaluators=my_evaluators,
                resume_from=result.resume_token
            )
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
        The timestamp-based approach means that items created after the initial run
        but before the timestamp will be skipped. This is intentional to avoid
        duplicates and ensure consistent evaluation.
    """

    def __init__(
        self,
        *,
        scope: str,
        filter: Optional[str],
        last_processed_timestamp: str,
        last_processed_id: str,
        items_processed: int,
    ) -> None:
        """Initialize BatchEvaluationResumeToken with the provided state.

        Every argument is required (``filter`` may be None) and is stored unchanged.

        Args:
            scope: The scope type ("traces", "observations").
            filter: The original JSON filter string.
            last_processed_timestamp: ISO 8601 timestamp of last processed item.
            last_processed_id: ID of last processed item.
            items_processed: Count of items processed before interruption.

        Note:
            All arguments must be provided as keywords.
        """
        self.scope = scope
        self.filter = filter
        self.last_processed_timestamp = last_processed_timestamp
        self.last_processed_id = last_processed_id
        self.items_processed = items_processed
Token for resuming a failed batch evaluation run.
This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.
Attributes:
- scope: The type of items being evaluated ("traces", "observations").
- filter: The original JSON filter string used to query items.
- last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
- last_processed_id: The ID of the last successfully processed item, for reference.
- items_processed: Count of items successfully processed before interruption.
Examples:
Resuming a failed batch evaluation:
# Initial run that fails partway through try: result = client.run_batched_evaluation( scope="traces", mapper=my_mapper, evaluators=[evaluator1, evaluator2], filter='{"tags": ["production"]}', max_items=10000 ) except Exception as e: print(f"Evaluation failed: {e}") # Save the resume token if result.resume_token: # Store resume token for later (e.g., in a file or database) import json with open("resume_token.json", "w") as f: json.dump({ "scope": result.resume_token.scope, "filter": result.resume_token.filter, "last_timestamp": result.resume_token.last_processed_timestamp, "last_id": result.resume_token.last_processed_id, "items_done": result.resume_token.items_processed }, f) # Later, resume from where it left off with open("resume_token.json") as f: token_data = json.load(f) resume_token = BatchEvaluationResumeToken( scope=token_data["scope"], filter=token_data["filter"], last_processed_timestamp=token_data["last_timestamp"], last_processed_id=token_data["last_id"], items_processed=token_data["items_done"] ) # Resume the evaluation result = client.run_batched_evaluation( scope="traces", mapper=my_mapper, evaluators=[evaluator1, evaluator2], resume_from=resume_token ) print(f"Processed {result.total_items_processed} additional items")Handling partial completion:
result = client.run_batched_evaluation(...) if not result.completed: print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items") print(f"Last item: {result.resume_token.last_processed_id}") print(f"Resume from: {result.resume_token.last_processed_timestamp}") # Optionally retry automatically if result.resume_token: print("Retrying...") result = client.run_batched_evaluation( scope=result.resume_token.scope, mapper=my_mapper, evaluators=my_evaluators, resume_from=result.resume_token )
Note:
All arguments must be passed as keywords when instantiating this class. The timestamp-based approach means that items created after the initial run but before the timestamp will be skipped. This is intentional to avoid duplicates and ensure consistent evaluation.
def __init__(
    self,
    *,
    scope: str,
    filter: Optional[str],
    last_processed_timestamp: str,
    last_processed_id: str,
    items_processed: int,
):
    """Record the point at which a batch evaluation run stopped.

    Each argument is keyword-only and is stored unchanged on the instance
    under an attribute of the same name.

    Args:
        scope: Scope type of the run ("traces", "observations").
        filter: The original JSON filter string, or ``None``.
        last_processed_timestamp: ISO 8601 timestamp of the last processed item.
        last_processed_id: ID of the last processed item.
        items_processed: Number of items processed before the interruption.
    """
    # What was being evaluated.
    self.scope, self.filter = scope, filter
    # Cursor identifying the last item that was handled.
    self.last_processed_timestamp, self.last_processed_id = (
        last_processed_timestamp,
        last_processed_id,
    )
    # Progress counter carried over to the resumed run.
    self.items_processed = items_processed
Initialize BatchEvaluationResumeToken with the provided state.
Arguments:
- scope: The scope type ("traces", "observations").
- filter: The original JSON filter string.
- last_processed_timestamp: ISO 8601 timestamp of last processed item.
- last_processed_id: ID of last processed item.
- items_processed: Count of items processed before interruption.
Note:
All arguments must be provided as keywords.
class BatchEvaluationResult:
    r"""Complete result structure for batch evaluation execution.

    This class encapsulates comprehensive statistics and metadata about a batch
    evaluation run, including counts, evaluator-specific metrics, timing information,
    error details, and resume capability. It is a plain value holder: the
    constructor stores every argument unchanged, and ``__str__`` renders a
    human-readable summary.

    Attributes:
        total_items_fetched: Total number of items fetched from the API.
        total_items_processed: Number of items successfully evaluated.
        total_items_failed: Number of items that failed during evaluation.
        total_scores_created: Total scores created by all item-level evaluators.
        total_composite_scores_created: Scores created by the composite evaluator.
        total_evaluations_failed: Number of individual evaluator failures across all items.
        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
        resume_token: Token for resuming if evaluation was interrupted (None if completed).
        completed: True if all items were processed, False if stopped early or failed.
        duration_seconds: Total time taken to execute the batch evaluation.
        failed_item_ids: List of IDs for items that failed evaluation.
        error_summary: Dictionary mapping error types to occurrence counts.
        has_more_items: True if max_items limit was reached but more items exist.
        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).

    Examples:
        Basic result inspection (the ratio is only meaningful when at least
        one item was fetched):
        ```python
        result = client.run_batched_evaluation(...)

        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
        print(f"Scores created: {result.total_scores_created}")
        print(f"Duration: {result.duration_seconds:.2f}s")
        print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
        ```

        Detailed analysis with evaluator stats:
        ```python
        result = client.run_batched_evaluation(...)

        print(f"\n📊 Batch Evaluation Results")
        print(f"{'='*50}")
        print(f"Items processed: {result.total_items_processed}")
        print(f"Items failed: {result.total_items_failed}")
        print(f"Scores created: {result.total_scores_created}")

        if result.total_composite_scores_created > 0:
            print(f"Composite scores: {result.total_composite_scores_created}")

        print(f"\n📈 Evaluator Performance:")
        for stats in result.evaluator_stats:
            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
            print(f"\n  {stats.name}:")
            print(f"    Success rate: {success_rate:.1%}")
            print(f"    Scores created: {stats.total_scores_created}")
            if stats.failed_runs > 0:
                print(f"    ⚠️ Failures: {stats.failed_runs}")

        if result.error_summary:
            print(f"\n⚠️ Errors encountered:")
            for error_type, count in result.error_summary.items():
                print(f"  {error_type}: {count}")
        ```

        Handling incomplete runs:
        ```python
        result = client.run_batched_evaluation(...)

        if not result.completed:
            print("⚠️ Evaluation incomplete!")

            if result.resume_token:
                print(f"Processed {result.resume_token.items_processed} items before failure")
                print(f"Use resume_from parameter to continue from:")
                print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
                print(f"  Last ID: {result.resume_token.last_processed_id}")

        if result.has_more_items:
            print(f"ℹ️ More items available beyond max_items limit")
        ```

        Performance monitoring (the divisions assume a non-zero duration and
        at least one processed item):
        ```python
        result = client.run_batched_evaluation(...)

        items_per_second = result.total_items_processed / result.duration_seconds
        avg_scores_per_item = result.total_scores_created / result.total_items_processed

        print(f"Performance metrics:")
        print(f"  Throughput: {items_per_second:.2f} items/second")
        print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
        print(f"  Total duration: {result.duration_seconds:.2f}s")

        if result.total_evaluations_failed > 0:
            failure_rate = result.total_evaluations_failed / (
                result.total_items_processed * len(result.evaluator_stats)
            )
            print(f"  Evaluation failure rate: {failure_rate:.1%}")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        total_items_fetched: int,
        total_items_processed: int,
        total_items_failed: int,
        total_scores_created: int,
        total_composite_scores_created: int,
        total_evaluations_failed: int,
        evaluator_stats: List[EvaluatorStats],
        resume_token: Optional[BatchEvaluationResumeToken],
        completed: bool,
        duration_seconds: float,
        failed_item_ids: List[str],
        error_summary: Dict[str, int],
        has_more_items: bool,
        item_evaluations: Dict[str, List["Evaluation"]],
    ):
        """Initialize BatchEvaluationResult with comprehensive statistics.

        Every argument is stored unchanged on the instance under an attribute
        of the same name; no validation or copying is performed.

        Args:
            total_items_fetched: Total items fetched from API.
            total_items_processed: Items successfully evaluated.
            total_items_failed: Items that failed evaluation.
            total_scores_created: Scores from item-level evaluators.
            total_composite_scores_created: Scores from composite evaluator.
            total_evaluations_failed: Individual evaluator failures.
            evaluator_stats: Per-evaluator statistics.
            resume_token: Token for resuming (None if completed).
            completed: Whether all items were processed.
            duration_seconds: Total execution time.
            failed_item_ids: IDs of failed items.
            error_summary: Error types and counts.
            has_more_items: Whether more items exist beyond max_items.
            item_evaluations: Dictionary mapping item IDs to their evaluation results.

        Note:
            All arguments must be provided as keywords.
        """
        self.total_items_fetched = total_items_fetched
        self.total_items_processed = total_items_processed
        self.total_items_failed = total_items_failed
        self.total_scores_created = total_scores_created
        self.total_composite_scores_created = total_composite_scores_created
        self.total_evaluations_failed = total_evaluations_failed
        self.evaluator_stats = evaluator_stats
        self.resume_token = resume_token
        self.completed = completed
        self.duration_seconds = duration_seconds
        self.failed_item_ids = failed_item_ids
        self.error_summary = error_summary
        self.has_more_items = has_more_items
        self.item_evaluations = item_evaluations

    def __str__(self) -> str:
        """Return a formatted string representation of the batch evaluation results.

        The report is assembled line by line and joined with newlines. Optional
        sections (failed items, composite scores, evaluator statistics,
        performance, errors, resume hints) appear only when the corresponding
        data is present.

        Returns:
            A multi-line string with a summary of the evaluation results.
        """
        # NOTE(review): block nesting below was reconstructed from a flattened
        # listing; confirm it against the module source before relying on it.
        lines = []
        lines.append("=" * 60)
        lines.append("Batch Evaluation Results")
        lines.append("=" * 60)

        # Summary statistics
        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
        lines.append(f"Duration: {self.duration_seconds:.2f}s")
        lines.append(f"\nItems fetched: {self.total_items_fetched}")
        lines.append(f"Items processed: {self.total_items_processed}")

        if self.total_items_failed > 0:
            lines.append(f"Items failed: {self.total_items_failed}")

        # Success rate (guarded so an empty fetch cannot divide by zero)
        if self.total_items_fetched > 0:
            success_rate = self.total_items_processed / self.total_items_fetched * 100
            lines.append(f"Success rate: {success_rate:.1f}%")

        # Scores created
        lines.append(f"\nScores created: {self.total_scores_created}")
        if self.total_composite_scores_created > 0:
            lines.append(f"Composite scores: {self.total_composite_scores_created}")

        total_scores = self.total_scores_created + self.total_composite_scores_created
        lines.append(f"Total scores: {total_scores}")

        # Evaluator statistics
        if self.evaluator_stats:
            lines.append("\nEvaluator Performance:")
            for stats in self.evaluator_stats:
                lines.append(f" {stats.name}:")
                if stats.total_runs > 0:
                    # The inner conditional repeats the enclosing guard, so its
                    # `else 0` arm is never taken here.
                    success_rate = (
                        stats.successful_runs / stats.total_runs * 100
                        if stats.total_runs > 0
                        else 0
                    )
                    lines.append(
                        f" Runs: {stats.successful_runs}/{stats.total_runs} "
                        f"({success_rate:.1f}% success)"
                    )
                    lines.append(f" Scores created: {stats.total_scores_created}")
                    if stats.failed_runs > 0:
                        lines.append(f" Failed runs: {stats.failed_runs}")

        # Performance metrics (both divisors are checked to be positive first)
        if self.total_items_processed > 0 and self.duration_seconds > 0:
            items_per_sec = self.total_items_processed / self.duration_seconds
            lines.append("\nPerformance:")
            lines.append(f" Throughput: {items_per_sec:.2f} items/second")
            if self.total_scores_created > 0:
                avg_scores = self.total_scores_created / self.total_items_processed
                lines.append(f" Avg scores per item: {avg_scores:.2f}")

        # Errors and warnings
        if self.error_summary:
            lines.append("\nErrors encountered:")
            for error_type, count in self.error_summary.items():
                lines.append(f" {error_type}: {count}")

        # Incomplete run information
        if not self.completed:
            lines.append("\nWarning: Evaluation incomplete")
            if self.resume_token:
                lines.append(
                    f" Last processed: {self.resume_token.last_processed_timestamp}"
                )
                lines.append(f" Items processed: {self.resume_token.items_processed}")
                lines.append(" Use resume_from parameter to continue")

        if self.has_more_items:
            lines.append("\nNote: More items available beyond max_items limit")

        lines.append("=" * 60)
        return "\n".join(lines)
Complete result structure for batch evaluation execution.
This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.
Attributes:
- total_items_fetched: Total number of items fetched from the API.
- total_items_processed: Number of items successfully evaluated.
- total_items_failed: Number of items that failed during evaluation.
- total_scores_created: Total scores created by all item-level evaluators.
- total_composite_scores_created: Scores created by the composite evaluator.
- total_evaluations_failed: Number of individual evaluator failures across all items.
- evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
- resume_token: Token for resuming if evaluation was interrupted (None if completed).
- completed: True if all items were processed, False if stopped early or failed.
- duration_seconds: Total time taken to execute the batch evaluation.
- failed_item_ids: List of IDs for items that failed evaluation.
- error_summary: Dictionary mapping error types to occurrence counts.
- has_more_items: True if max_items limit was reached but more items exist.
- item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:
Basic result inspection:
result = client.run_batched_evaluation(...) print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}") print(f"Scores created: {result.total_scores_created}") print(f"Duration: {result.duration_seconds:.2f}s") print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")Detailed analysis with evaluator stats:
result = client.run_batched_evaluation(...) print(f"\n📊 Batch Evaluation Results") print(f"{'='*50}") print(f"Items processed: {result.total_items_processed}") print(f"Items failed: {result.total_items_failed}") print(f"Scores created: {result.total_scores_created}") if result.total_composite_scores_created > 0: print(f"Composite scores: {result.total_composite_scores_created}") print(f"\n📈 Evaluator Performance:") for stats in result.evaluator_stats: success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0 print(f"\n {stats.name}:") print(f" Success rate: {success_rate:.1%}") print(f" Scores created: {stats.total_scores_created}") if stats.failed_runs > 0: print(f" ⚠️ Failures: {stats.failed_runs}") if result.error_summary: print(f"\n⚠️ Errors encountered:") for error_type, count in result.error_summary.items(): print(f" {error_type}: {count}")Handling incomplete runs:
result = client.run_batched_evaluation(...) if not result.completed: print("⚠️ Evaluation incomplete!") if result.resume_token: print(f"Processed {result.resume_token.items_processed} items before failure") print(f"Use resume_from parameter to continue from:") print(f" Timestamp: {result.resume_token.last_processed_timestamp}") print(f" Last ID: {result.resume_token.last_processed_id}") if result.has_more_items: print(f"ℹ️ More items available beyond max_items limit")Performance monitoring:
result = client.run_batched_evaluation(...) items_per_second = result.total_items_processed / result.duration_seconds avg_scores_per_item = result.total_scores_created / result.total_items_processed print(f"Performance metrics:") print(f" Throughput: {items_per_second:.2f} items/second") print(f" Avg scores/item: {avg_scores_per_item:.2f}") print(f" Total duration: {result.duration_seconds:.2f}s") if result.total_evaluations_failed > 0: failure_rate = result.total_evaluations_failed / ( result.total_items_processed * len(result.evaluator_stats) ) print(f" Evaluation failure rate: {failure_rate:.1%}")
Note:
All arguments must be passed as keywords when instantiating this class.
def __init__(
    self,
    *,
    total_items_fetched: int,
    total_items_processed: int,
    total_items_failed: int,
    total_scores_created: int,
    total_composite_scores_created: int,
    total_evaluations_failed: int,
    evaluator_stats: List[EvaluatorStats],
    resume_token: Optional[BatchEvaluationResumeToken],
    completed: bool,
    duration_seconds: float,
    failed_item_ids: List[str],
    error_summary: Dict[str, int],
    has_more_items: bool,
    item_evaluations: Dict[str, List["Evaluation"]],
):
    """Store the statistics of one batch evaluation run on the instance.

    All parameters are keyword-only. Each value is kept as passed, under an
    attribute of the same name, without validation or copying.

    Args:
        total_items_fetched: Total items fetched from API.
        total_items_processed: Items successfully evaluated.
        total_items_failed: Items that failed evaluation.
        total_scores_created: Scores from item-level evaluators.
        total_composite_scores_created: Scores from composite evaluator.
        total_evaluations_failed: Individual evaluator failures.
        evaluator_stats: Per-evaluator statistics.
        resume_token: Token for resuming (None if completed).
        completed: Whether all items were processed.
        duration_seconds: Total execution time.
        failed_item_ids: IDs of failed items.
        error_summary: Error types and counts.
        has_more_items: Whether more items exist beyond max_items.
        item_evaluations: Dictionary mapping item IDs to their evaluation results.
    """
    # Item counters.
    (
        self.total_items_fetched,
        self.total_items_processed,
        self.total_items_failed,
    ) = (total_items_fetched, total_items_processed, total_items_failed)

    # Score and evaluator counters.
    (
        self.total_scores_created,
        self.total_composite_scores_created,
        self.total_evaluations_failed,
    ) = (
        total_scores_created,
        total_composite_scores_created,
        total_evaluations_failed,
    )
    self.evaluator_stats = evaluator_stats

    # Run state and timing.
    self.resume_token, self.completed = resume_token, completed
    self.duration_seconds = duration_seconds

    # Failure details and per-item results.
    self.failed_item_ids, self.error_summary = failed_item_ids, error_summary
    self.has_more_items, self.item_evaluations = has_more_items, item_evaluations
Initialize BatchEvaluationResult with comprehensive statistics.
Arguments:
- total_items_fetched: Total items fetched from API.
- total_items_processed: Items successfully evaluated.
- total_items_failed: Items that failed evaluation.
- total_scores_created: Scores from item-level evaluators.
- total_composite_scores_created: Scores from composite evaluator.
- total_evaluations_failed: Individual evaluator failures.
- evaluator_stats: Per-evaluator statistics.
- resume_token: Token for resuming (None if completed).
- completed: Whether all items were processed.
- duration_seconds: Total execution time.
- failed_item_ids: IDs of failed items.
- error_summary: Error types and counts.
- has_more_items: Whether more items exist beyond max_items.
- item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:
All arguments must be provided as keywords.
class RunnerContext:
    """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action builds a
    ``RunnerContext`` before invoking the user's ``experiment(context)``
    function. Defaults set here (dataset, metadata tags) are applied when
    the user omits them on the :meth:`run_experiment` call; users can
    override any default by passing the corresponding argument explicitly.
    """

    def __init__(
        self,
        *,
        client: "Langfuse",
        data: Optional[ExperimentData] = None,
        dataset_version: Optional[datetime] = None,
        metadata: Optional[Dict[str, str]] = None,
    ):
        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.

        Typically called by the ``langfuse/experiment-action`` GitHub Action,
        not by end users directly. Every field except ``client`` is optional:
        fields left as ``None`` simply mean the corresponding argument must be
        supplied on the :meth:`run_experiment` call.

        Args:
            client: Initialized Langfuse SDK client used to execute the
                experiment. The action creates this from the
                ``langfuse_public_key`` / ``langfuse_secret_key`` /
                ``langfuse_base_url`` inputs.
            data: Default dataset items to run the experiment on. Accepts
                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
                Injected by the action when ``dataset_name`` is configured.
                If ``None``, the user must pass ``data=`` to
                :meth:`run_experiment`.
            dataset_version: Optional pinned dataset version. Injected by the
                action when ``dataset_version`` is configured.
            metadata: Default metadata attached to every experiment trace and
                the dataset run. The action injects GitHub-sourced tags (SHA,
                PR link, workflow run link, branch, GH user, etc.). Merged
                with any ``metadata`` passed to :meth:`run_experiment`, with
                user-supplied keys winning on collision.
        """
        self.client = client
        self.data = data
        self.dataset_version = dataset_version
        self.metadata = metadata

    def run_experiment(
        self,
        *,
        name: str,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
        data: Optional[ExperimentData] = None,
        task: TaskFunction,
        evaluators: Optional[List[EvaluatorFunction]] = None,
        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
        run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
        max_concurrency: int = 50,
        metadata: Optional[Dict[str, str]] = None,
        _dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Run an experiment through the wrapped client, filling in context defaults.

        Explicit arguments take precedence over the values stored on this
        context. All other arguments are forwarded to
        ``self.client.run_experiment`` unchanged.

        Args:
            name: Forwarded as ``name``.
            run_name: Forwarded as ``run_name``.
            description: Forwarded as ``description``.
            data: Items to run on; falls back to ``self.data`` when ``None``.
            task: Forwarded as ``task``.
            evaluators: Forwarded as ``evaluators``; ``None`` (the default)
                is forwarded as a fresh empty list.
            composite_evaluator: Forwarded as ``composite_evaluator``.
            run_evaluators: Forwarded as ``run_evaluators``; ``None`` (the
                default) is forwarded as a fresh empty list.
            max_concurrency: Forwarded as ``max_concurrency``.
            metadata: Merged over ``self.metadata``; keys given here win on
                collision. ``None`` is forwarded only when both are ``None``.
            _dataset_version: Falls back to ``self.dataset_version`` when
                ``None``.

        Returns:
            Whatever ``self.client.run_experiment`` returns.

        Raises:
            ValueError: If neither this call nor the context provides ``data``.
        """
        resolved_data = data if data is not None else self.data
        if resolved_data is None:
            raise ValueError(
                "`data` must be provided either on the RunnerContext or the run_experiment call"
            )

        resolved_dataset_version = (
            _dataset_version if _dataset_version is not None else self.dataset_version
        )

        merged_metadata: Optional[Dict[str, str]]
        if self.metadata is None and metadata is None:
            merged_metadata = None
        else:
            # Later unpacking wins, so call-site keys override context keys.
            merged_metadata = {**(self.metadata or {}), **(metadata or {})}

        # A new list per call: a shared default list would leak any mutation
        # made downstream into every later call.
        resolved_evaluators = evaluators if evaluators is not None else []
        resolved_run_evaluators = run_evaluators if run_evaluators is not None else []

        return self.client.run_experiment(
            name=name,
            run_name=run_name,
            description=description,
            data=resolved_data,
            task=task,
            evaluators=resolved_evaluators,
            composite_evaluator=composite_evaluator,
            run_evaluators=resolved_run_evaluators,
            max_concurrency=max_concurrency,
            metadata=merged_metadata,
            _dataset_version=resolved_dataset_version,
        )
Wraps Langfuse.run_experiment() with CI-injected defaults.
Intended for use with the langfuse/experiment-action GitHub Action
(https://github.com/langfuse/experiment-action). The action builds a
RunnerContext before invoking the user's experiment(context)
function. Defaults set here (dataset, metadata tags) are applied when
the user omits them on the run_experiment() call; users can
override any default by passing the corresponding argument explicitly.
def __init__(
    self,
    *,
    client: "Langfuse",
    data: Optional[ExperimentData] = None,
    dataset_version: Optional[datetime] = None,
    metadata: Optional[Dict[str, str]] = None,
):
    """Create a context holding the defaults later used by ``run_experiment``.

    Normally constructed by the ``langfuse/experiment-action`` GitHub
    Action, not by end users. Only ``client`` is required; any other field
    left as ``None`` has to be supplied on the :meth:`run_experiment` call
    instead.

    Args:
        client: Initialized Langfuse SDK client used to execute the
            experiment. The action creates this from the
            ``langfuse_public_key`` / ``langfuse_secret_key`` /
            ``langfuse_base_url`` inputs.
        data: Default dataset items to run the experiment on. Accepts
            either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
            Injected by the action when ``dataset_name`` is configured.
            If ``None``, the user must pass ``data=`` to
            :meth:`run_experiment`.
        dataset_version: Optional pinned dataset version. Injected by the
            action when ``dataset_version`` is configured.
        metadata: Default metadata attached to every experiment trace and
            the dataset run. The action injects GitHub-sourced tags (SHA,
            PR link, workflow run link, branch, GH user, etc.). Merged
            with any ``metadata`` passed to :meth:`run_experiment`, with
            user-supplied keys winning on collision.
    """
    # The executor and the items it runs on.
    self.client, self.data = client, data
    # Optional pin and tags applied when the call site omits them.
    self.dataset_version, self.metadata = dataset_version, metadata
Build a RunnerContext populated with defaults for run_experiment.
Typically called by the langfuse/experiment-action GitHub Action,
not by end users directly. Every field except client is optional:
fields left as None simply mean the corresponding argument must be
supplied on the run_experiment() call.
Arguments:
- client: Initialized Langfuse SDK client used to execute the experiment. The action creates this from the `langfuse_public_key` / `langfuse_secret_key` / `langfuse_base_url` inputs.
- data: Default dataset items to run the experiment on. Accepts either `List[LocalExperimentItem]` or `List[DatasetItem]`. Injected by the action when `dataset_name` is configured. If `None`, the user must pass `data=` to `run_experiment()`.
- dataset_version: Optional pinned dataset version. Injected by the action when `dataset_version` is configured.
- metadata: Default metadata attached to every experiment trace and the dataset run. The action injects GitHub-sourced tags (SHA, PR link, workflow run link, branch, GH user, etc.). Merged with any `metadata` passed to `run_experiment()`, with user-supplied keys winning on collision.
def run_experiment(
    self,
    *,
    name: str,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    data: Optional[ExperimentData] = None,
    task: TaskFunction,
    evaluators: Optional[List[EvaluatorFunction]] = None,
    composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
    run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
    max_concurrency: int = 50,
    metadata: Optional[Dict[str, str]] = None,
    _dataset_version: Optional[datetime] = None,
) -> ExperimentResult:
    """Run an experiment through the wrapped client, filling in context defaults.

    Explicit arguments take precedence over the values stored on this
    context. All other arguments are forwarded to
    ``self.client.run_experiment`` unchanged.

    Args:
        name: Forwarded as ``name``.
        run_name: Forwarded as ``run_name``.
        description: Forwarded as ``description``.
        data: Items to run on; falls back to ``self.data`` when ``None``.
        task: Forwarded as ``task``.
        evaluators: Forwarded as ``evaluators``; ``None`` (the default) is
            forwarded as a fresh empty list.
        composite_evaluator: Forwarded as ``composite_evaluator``.
        run_evaluators: Forwarded as ``run_evaluators``; ``None`` (the
            default) is forwarded as a fresh empty list.
        max_concurrency: Forwarded as ``max_concurrency``.
        metadata: Merged over ``self.metadata``; keys given here win on
            collision. ``None`` is forwarded only when both are ``None``.
        _dataset_version: Falls back to ``self.dataset_version`` when ``None``.

    Returns:
        Whatever ``self.client.run_experiment`` returns.

    Raises:
        ValueError: If neither this call nor the context provides ``data``.
    """
    resolved_data = data if data is not None else self.data
    if resolved_data is None:
        raise ValueError(
            "`data` must be provided either on the RunnerContext or the run_experiment call"
        )

    resolved_dataset_version = (
        _dataset_version if _dataset_version is not None else self.dataset_version
    )

    merged_metadata: Optional[Dict[str, str]]
    if self.metadata is None and metadata is None:
        merged_metadata = None
    else:
        # Later unpacking wins, so call-site keys override context keys.
        merged_metadata = {**(self.metadata or {}), **(metadata or {})}

    # A new list per call: a shared default list would leak any mutation
    # made downstream into every later call.
    resolved_evaluators = evaluators if evaluators is not None else []
    resolved_run_evaluators = run_evaluators if run_evaluators is not None else []

    return self.client.run_experiment(
        name=name,
        run_name=run_name,
        description=description,
        data=resolved_data,
        task=task,
        evaluators=resolved_evaluators,
        composite_evaluator=composite_evaluator,
        run_evaluators=resolved_run_evaluators,
        max_concurrency=max_concurrency,
        metadata=merged_metadata,
        _dataset_version=resolved_dataset_version,
    )
class RegressionError(Exception):
    """Raised by a user's ``experiment`` function to signal a CI gate failure.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action catches this
    exception and, when ``should_fail_on_error`` is enabled, fails the
    workflow run and renders a callout in the PR comment using
    ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.

    Callers choose one of three forms:

    - ``RegressionError(result=r)`` — minimal, generic message.
    - ``RegressionError(result=r, message="...")`` — free-form message.
    - ``RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)`` —
      structured; ``metric`` and ``value`` must be provided together so the
      action can render a targeted callout without ``None`` placeholders.

    Attributes:
        result: The experiment result passed by the caller.
        metric: Name of the regressed metric, or ``None``.
        value: Observed metric value, or ``None``.
        threshold: Threshold the value was compared against, or ``None``.
    """

    @overload
    def __init__(self, *, result: ExperimentResult) -> None: ...
    @overload
    def __init__(self, *, result: ExperimentResult, message: str) -> None: ...
    @overload
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: str,
        value: float,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ) -> None: ...
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: Optional[str] = None,
        value: Optional[float] = None,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ):
        """Store the regression details and build the exception message.

        The message is chosen in this order: an explicit ``message``; a
        structured message when both ``metric`` and ``value`` are given; a
        generic fallback otherwise. The threshold is mentioned in the
        structured message only when one was supplied.

        Args:
            result: Experiment result associated with the failure.
            metric: Name of the regressed metric.
            value: Observed value of the metric.
            threshold: Threshold the value was compared against.
            message: Free-form message overriding the generated one.
        """
        self.result = result
        self.metric = metric
        self.value = value
        self.threshold = threshold
        if message is not None:
            formatted = message
        elif metric is not None and value is not None:
            formatted = f"Regression on `{metric}`: {value}"
            # Leave the threshold out entirely rather than printing "None".
            if threshold is not None:
                formatted += f" (threshold {threshold})"
        else:
            formatted = "Experiment regression detected"
        super().__init__(formatted)
Raised by a user's experiment function to signal a CI gate failure.
Intended for use with the langfuse/experiment-action GitHub Action
(https://github.com/langfuse/experiment-action). The action catches this
exception and, when should_fail_on_error is enabled, fails the
workflow run and renders a callout in the PR comment using
metric/value/threshold if supplied, otherwise str(exc).
Callers choose one of three forms:
- `RegressionError(result=r)` — minimal, generic message.
- `RegressionError(result=r, message="...")` — free-form message.
- `RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)` — structured; `metric` and `value` must be provided together so the action can render a targeted callout without `None` placeholders.
def __init__(
    self,
    *,
    result: ExperimentResult,
    metric: Optional[str] = None,
    value: Optional[float] = None,
    threshold: Optional[float] = None,
    message: Optional[str] = None,
):
    """Store the regression details and build the exception message.

    The message is chosen in this order: an explicit ``message``; a
    structured message when both ``metric`` and ``value`` are given; a
    generic fallback otherwise. The threshold is mentioned in the
    structured message only when one was supplied.

    Args:
        result: Experiment result associated with the failure.
        metric: Name of the regressed metric.
        value: Observed value of the metric.
        threshold: Threshold the value was compared against.
        message: Free-form message overriding the generated one.
    """
    self.result = result
    self.metric = metric
    self.value = value
    self.threshold = threshold
    if message is not None:
        formatted = message
    elif metric is not None and value is not None:
        formatted = f"Regression on `{metric}`: {value}"
        # Leave the threshold out entirely rather than printing "None".
        if threshold is not None:
            formatted += f" (threshold {threshold})"
    else:
        formatted = "Experiment regression detected"
    super().__init__(formatted)
def is_default_export_span(span: ReadableSpan) -> bool:
    """Tell whether *span* is part of the default export set.

    The span qualifies as soon as one of three checks accepts it. They are
    tried in this order and later ones are skipped after a match:
    ``is_langfuse_span``, ``is_genai_span``, ``is_known_llm_instrumentor``.
    """
    if is_langfuse_span(span):
        return True
    if is_genai_span(span):
        return True
    return is_known_llm_instrumentor(span)
Return whether a span should be exported by default.
def is_langfuse_span(span: ReadableSpan) -> bool:
    """Tell whether *span* was produced by the Langfuse SDK tracer.

    A span without an instrumentation scope never matches; otherwise the
    scope name is compared for equality with ``LANGFUSE_TRACER_NAME``.
    """
    scope = span.instrumentation_scope
    if scope is None:
        return False
    return scope.name == LANGFUSE_TRACER_NAME
Return whether the span was created by the Langfuse SDK tracer.
def is_genai_span(span: ReadableSpan) -> bool:
    """Tell whether *span* carries a ``gen_ai``-prefixed attribute.

    Returns ``False`` when the span has no attribute mapping. Otherwise
    returns ``True`` as soon as one attribute key is a string starting with
    the literal ``"gen_ai"`` (no trailing dot is required). Non-string keys
    are ignored.
    """
    attrs = span.attributes
    if attrs is None:
        return False

    for key in attrs.keys():
        if isinstance(key, str) and key.startswith("gen_ai"):
            return True
    return False
Return whether the span has any gen_ai.* semantic convention attribute.
def is_known_llm_instrumentor(span: ReadableSpan) -> bool:
    """Tell whether *span* originates from a known LLM instrumentation scope.

    A span without an instrumentation scope never matches. Otherwise the
    scope name is tested with ``_matches_scope_prefix`` against each entry
    of ``KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES``, stopping at the first
    match.
    """
    scope = span.instrumentation_scope
    if scope is None:
        return False

    scope_name = scope.name
    for prefix in KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES:
        if _matches_scope_prefix(scope_name, prefix):
            return True
    return False
Return whether the span comes from a known LLM instrumentation scope.
class MaskOtelSpansFunction(Protocol):
    """Callable protocol for masking OpenTelemetry spans at export time.

    A ``mask_otel_spans`` callback is invoked after Langfuse has decided
    which spans this client exports, and after export-stage media handling
    has turned supported media payloads into Langfuse media references. Its
    effect is limited to the spans exported by this Langfuse client: another
    exporter receiving the same OpenTelemetry spans gets its own unmodified
    copy.

    The callback is synchronous. It usually runs on the OpenTelemetry batch
    span processor worker thread, but during ``flush()`` and shutdown it may
    run on the caller thread. Keep it deterministic and fast, and do not
    depend on request locals, the current active span, or async I/O.

    Returning ``None`` leaves the whole batch unchanged. Returning a
    ``MaskOtelSpansResult`` applies its sparse patches to the spans that
    should change.

    Example:
        ```python
        from typing import Optional

        from langfuse import Langfuse
        from langfuse.types import (
            MaskOtelSpansParams,
            MaskOtelSpansResult,
            OtelSpanPatch,
        )

        def mask_otel_spans(
            *, params: MaskOtelSpansParams
        ) -> Optional[MaskOtelSpansResult]:
            patches = {}

            for identifier, span in params.spans.items():
                if span.instrumentation_scope_name == "openai":
                    patches[identifier] = OtelSpanPatch(
                        delete_attributes=(
                            "gen_ai.prompt.0.content",
                            "gen_ai.completion.0.content",
                        ),
                        set_attributes={"masking.applied": True},
                    )

            return MaskOtelSpansResult(span_patches=patches)

        langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
        ```
    """

    def __call__(
        self, *, params: MaskOtelSpansParams
    ) -> Optional[MaskOtelSpansResult]:
        """Return patches for the batch in ``params``, or ``None`` for no change."""
        ...
Function protocol for export-stage OpenTelemetry span masking.
mask_otel_spans runs after Langfuse decides which spans this client should
export and after export-stage media handling has converted supported media
payloads into Langfuse media references. It affects only the spans exported
by this Langfuse client. If the same OpenTelemetry spans are sent to another
exporter, that exporter receives its own unmodified copy.
The function is synchronous. It usually runs on the OpenTelemetry batch span
processor worker thread; during flush() and shutdown it may run on the
caller thread. Keep it deterministic and fast, and avoid relying on request
locals, the current active span, or async I/O.
Return None to leave the whole batch unchanged, or return
MaskOtelSpansResult with sparse patches for the spans that should change.
Example:
from typing import Optional

from langfuse import Langfuse
from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)

def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}

    for identifier, span in params.spans.items():
        if span.instrumentation_scope_name == "openai":
            patches[identifier] = OtelSpanPatch(
                delete_attributes=(
                    "gen_ai.prompt.0.content",
                    "gen_ai.completion.0.content",
                ),
                set_attributes={"masking.applied": True},
            )

    return MaskOtelSpansResult(span_patches=patches)

langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder `__init__` for protocol classes and their subclasses.

    Raises `TypeError` when the class of `self` is itself a protocol.
    Otherwise, the first time a subclass is instantiated, this function looks
    up the first real `__init__` in the MRO, installs it on the class in place
    of this placeholder, and calls it. Later instantiations then use the
    installed `__init__` directly and never reach this function again.

    Args:
        self: The instance being initialised.
        *args: Positional arguments forwarded to the resolved `__init__`.
        **kwargs: Keyword arguments forwarded to the resolved `__init__`.

    Raises:
        TypeError: If `type(self)` is a protocol class.
    """
    klass = type(self)

    if klass._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # The class already has its own `__init__`, so there is nothing to
    # resolve. Continuing here can lead to RecursionError. See bpo-45121.
    if klass.__init__ is not _no_init_or_replace_init:
        return

    placeholder = _no_init_or_replace_init
    declared_inits = (
        base.__dict__.get('__init__', placeholder) for base in klass.__mro__
    )
    # First `__init__` along the MRO that is not this placeholder. The
    # `object.__init__` fallback should not happen in practice.
    klass.__init__ = next(
        (init for init in declared_inits if init is not placeholder),
        object.__init__,
    )

    klass.__init__(self, *args, **kwargs)
@dataclass(frozen=True)
class MaskOtelSpansParams:
    """Argument object handed to an export-stage span masking function.

    Each call covers one OpenTelemetry export batch, which is not necessarily
    a complete trace, request, or Langfuse observation tree. What ends up in a
    batch depends on OpenTelemetry span processor settings such as `flush_at`
    and `flush_interval`, on explicit `flush()` calls, and on shutdown.

    Example:
        ```python
        from typing import Optional

        from langfuse.types import (
            MaskOtelSpansParams,
            MaskOtelSpansResult,
            OtelSpanPatch,
        )

        def mask_otel_spans(
            *, params: MaskOtelSpansParams
        ) -> Optional[MaskOtelSpansResult]:
            patches = {}

            for identifier, span in params.spans.items():
                if "http.request.header.authorization" in span.attributes:
                    patches[identifier] = OtelSpanPatch(
                        delete_attributes=("http.request.header.authorization",),
                        set_attributes={"security.redacted": True},
                    )

            return MaskOtelSpansResult(span_patches=patches)
        ```

    Attributes:
        spans: Read-only mapping from stable span identifiers to span
            snapshots. Key the returned patches with keys taken from this
            mapping.
    """

    spans: Mapping[OtelSpanIdentifier, OtelSpanData]
Input passed to an export-stage OpenTelemetry span masking function.
A single call receives one OpenTelemetry export batch, not necessarily a
complete trace, request, or Langfuse observation tree. Batch contents depend
on OpenTelemetry span processor settings such as flush_at,
flush_interval, explicit flush(), and shutdown.
Example:
from typing import Optional

from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)

def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}

    for identifier, span in params.spans.items():
        if "http.request.header.authorization" in span.attributes:
            patches[identifier] = OtelSpanPatch(
                delete_attributes=("http.request.header.authorization",),
                set_attributes={"security.redacted": True},
            )

    return MaskOtelSpansResult(span_patches=patches)
Attributes:
- spans: Read-only mapping from stable span identifiers to span snapshots. Return patches using keys from this mapping.
@dataclass(frozen=True)
class MaskOtelSpansResult:
    """Container for the patches a `mask_otel_spans` function returns.

    Spans that need no changes can simply be left out; mapping a span to
    `None` likewise leaves it unchanged. Returning an invalid patch in order
    to drop a span is not a supported API; use `should_export_span` when
    span-level export filtering is needed.

    Failure handling differs by scope. If `mask_otel_spans` raises, or returns
    an object that is not a `MaskOtelSpansResult`, Langfuse drops the whole
    export batch. If a single `OtelSpanPatch` is invalid, Langfuse drops only
    that span from the export batch.

    Attributes:
        span_patches: Mapping from identifiers in `MaskOtelSpansParams.spans`
            to sparse attribute patches. Defaults to an empty read-only
            mapping.
    """

    span_patches: Mapping[OtelSpanIdentifier, Optional[OtelSpanPatch]] = field(
        default_factory=lambda: MappingProxyType({}),
    )
Patches returned by a mask_otel_spans function.
Omit spans that do not need changes. A mapping value of None also leaves
that span unchanged. Returning an invalid patch to drop a span is not a
supported API; use should_export_span when you need span-level export
filtering.
If mask_otel_spans raises or returns an object that is not a
MaskOtelSpansResult, Langfuse drops the whole export batch. If one
individual OtelSpanPatch is invalid, Langfuse drops only that span from
the export batch.
Attributes:
- span_patches: Mapping from identifiers in MaskOtelSpansParams.spans to sparse attribute patches.
@dataclass(frozen=True)
class OtelSpanData:
    """Immutable snapshot of one OpenTelemetry span given to `mask_otel_spans`.

    The snapshot holds the span data Langfuse is about to export, taken after
    the SDK has applied `should_export_span` filtering and export-stage media
    processing. Its mappings are immutable views and mutating them is not
    supported; to change exported attributes, return an `OtelSpanPatch`.

    Only span attributes can be changed through `mask_otel_spans`. The span
    name, IDs, parent relationship, resource attributes, events, links, and
    instrumentation scope cannot be changed.

    Attributes:
        trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
        span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
        parent_span_id: Lowercase hexadecimal parent span ID, or `None` for a
            root span or when the parent is not available.
        name: OpenTelemetry span name.
        instrumentation_scope_name: Name of the instrumentation scope that
            emitted the span, for example `openai` or `langfuse`.
        instrumentation_scope_version: Version of the instrumentation scope,
            if the instrumentation library provided one.
        attributes: Read-only attributes that will be exported unless
            patched. Values use OpenTelemetry `AttributeValue` types:
            strings, booleans, numbers, or homogeneous sequences of those
            scalar values.
        resource_attributes: Read-only resource attributes from the span's
            OpenTelemetry resource. They are available for decisions only and
            cannot be patched through `mask_otel_spans`.
    """

    # Span identity and position in the trace.
    trace_id: str
    span_id: str
    parent_span_id: Optional[str]
    name: str

    # Instrumentation scope that emitted the span.
    instrumentation_scope_name: Optional[str]
    instrumentation_scope_version: Optional[str]

    # Read-only attribute views.
    attributes: Mapping[str, AttributeValue]
    resource_attributes: Mapping[str, AttributeValue]
Read-only OpenTelemetry span snapshot passed to mask_otel_spans.
The snapshot contains the span data that Langfuse is about to export after
the SDK has applied should_export_span filtering and export-stage media
processing. The mappings are immutable views and mutating them is not
supported; return an OtelSpanPatch to change exported attributes.
mask_otel_spans can only change span attributes. It cannot change the
span name, IDs, parent relationship, resource attributes, events, links, or
instrumentation scope.
Attributes:
- trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
- span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
- parent_span_id: Lowercase hexadecimal parent span ID, or None for a root span or when the parent is not available.
- name: OpenTelemetry span name.
- instrumentation_scope_name: Name of the instrumentation scope that emitted the span, for example openai or langfuse.
- instrumentation_scope_version: Version of the instrumentation scope, if the instrumentation library provided one.
- attributes: Read-only attributes that will be exported unless patched. Values use OpenTelemetry AttributeValue types: strings, booleans, numbers, or homogeneous sequences of those scalar values.
- resource_attributes: Read-only resource attributes from the span's OpenTelemetry resource. These are available for decisions only and cannot be patched through mask_otel_spans.
@dataclass(frozen=True)
class OtelSpanIdentifier:
    """Stable key identifying one OpenTelemetry span within a masking batch.

    Patches for a span are returned under this key. Because the class is a
    frozen, hashable dataclass, the safest pattern is to reuse the exact
    identifier object found in `MaskOtelSpansParams.spans` rather than
    rebuilding it.

    Attributes:
        trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
        span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
    """

    trace_id: str
    span_id: str
Stable key for one OpenTelemetry span in a masking batch.
Use this object as the key when returning a patch for a span. It is a
frozen, hashable dataclass, so the safest pattern is to reuse the exact
identifier object from MaskOtelSpansParams.spans instead of rebuilding it.
Attributes:
- trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
- span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
@dataclass(frozen=True)
class OtelSpanPatch:
    """Sparse set of attribute changes for one OpenTelemetry span before export.

    Include only the attributes that should change. Langfuse deletes the keys
    in `delete_attributes` first and then applies `set_attributes`, so a key
    present in both fields is exported with the value from `set_attributes`.

    Attribute values must be valid OpenTelemetry attributes: strings,
    booleans, integers, floats, or homogeneous sequences of those scalar
    types. If a value is not valid for OpenTelemetry, Langfuse removes that
    attribute from the export rather than sending an invalid span.

    Example:
        ```python
        OtelSpanPatch(
            delete_attributes=("gen_ai.prompt.0.content",),
            set_attributes={
                "gen_ai.prompt.redacted": True,
                "app.masking.rule": "drop_prompt_text",
            },
        )
        ```

    Attributes:
        set_attributes: Attribute values to add or replace on the exported
            span. Defaults to an empty read-only mapping.
        delete_attributes: Attribute keys to remove from the exported span.
            Defaults to an empty tuple.
    """

    set_attributes: Mapping[str, AttributeValue] = field(
        default_factory=lambda: MappingProxyType({}),
    )
    delete_attributes: Sequence[str] = field(default_factory=tuple)
Attribute changes to apply to one OpenTelemetry span before export.
Patches are sparse: include only the attributes that should change. Langfuse
deletes delete_attributes first and then applies set_attributes, so a key
present in both fields is exported with the value from set_attributes.
Attribute values must be valid OpenTelemetry attributes: strings, booleans, integers, floats, or homogeneous sequences of those scalar types. If one value is not valid for OpenTelemetry, Langfuse removes that attribute from the export rather than sending an invalid span.
Example:
OtelSpanPatch(
    delete_attributes=("gen_ai.prompt.0.content",),
    set_attributes={
        "gen_ai.prompt.redacted": True,
        "app.masking.rule": "drop_prompt_text",
    },
)
Attributes:
- set_attributes: Attribute values to add or replace on the exported span.
- delete_attributes: Attribute keys to remove from the exported span.