langfuse
""".. include:: ../README.md"""

# Public package surface: everything imported below is re-exported through
# `__all__` so callers can import it directly from `langfuse`.

from langfuse.batch_evaluation import (
    BatchEvaluationResult,
    BatchEvaluationResumeToken,
    CompositeEvaluatorFunction,
    EvaluatorInputs,
    EvaluatorStats,
    MapperFunction,
)
from langfuse.experiment import Evaluation, RegressionError, RunnerContext

from ._client import client as _client_module
from ._client.attributes import LangfuseOtelSpanAttributes
from ._client.constants import ObservationTypeLiteral
from ._client.get_client import get_client
from ._client.observe import observe
from ._client.propagation import propagate_attributes
from ._client.span import (
    LangfuseAgent,
    LangfuseChain,
    LangfuseEmbedding,
    LangfuseEvaluator,
    LangfuseEvent,
    LangfuseGeneration,
    LangfuseGuardrail,
    LangfuseRetriever,
    LangfuseSpan,
    LangfuseTool,
)
from ._version import __version__
from .media import LangfuseMedia, LangfuseMediaReference
from .span_filter import (
    KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES,
    is_default_export_span,
    is_genai_span,
    is_known_llm_instrumentor,
    is_langfuse_span,
)
from .types import (
    MaskOtelSpansFunction,
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanData,
    OtelSpanIdentifier,
    OtelSpanPatch,
)

# The client class is bound by attribute access on the imported module rather
# than by a `from ... import` statement.
# NOTE(review): presumably done for documentation tooling or to keep the
# `_client.client` module object reachable -- confirm before simplifying.
Langfuse = _client_module.Langfuse

# Names exported by `from langfuse import *` and treated as the documented API.
__all__ = [
    "Langfuse",
    "LangfuseMedia",
    "LangfuseMediaReference",
    "get_client",
    "observe",
    "propagate_attributes",
    "ObservationTypeLiteral",
    "LangfuseSpan",
    "LangfuseGeneration",
    "LangfuseEvent",
    "LangfuseOtelSpanAttributes",
    "LangfuseAgent",
    "LangfuseTool",
    "LangfuseChain",
    "LangfuseEmbedding",
    "LangfuseEvaluator",
    "LangfuseRetriever",
    "LangfuseGuardrail",
    "Evaluation",
    "EvaluatorInputs",
    "MapperFunction",
    "CompositeEvaluatorFunction",
    "EvaluatorStats",
    "BatchEvaluationResumeToken",
    "BatchEvaluationResult",
    "RunnerContext",
    "RegressionError",
    "__version__",
    "is_default_export_span",
    "is_langfuse_span",
    "is_genai_span",
    "is_known_llm_instrumentor",
    "KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES",
    "MaskOtelSpansFunction",
    "MaskOtelSpansParams",
    "MaskOtelSpansResult",
    "OtelSpanData",
    "OtelSpanIdentifier",
    "OtelSpanPatch",
    # `experiment` becomes a package attribute as a side effect of the
    # `from langfuse.experiment import ...` statement above.
    "experiment",
    # NOTE(review): no statement in this module binds `api` explicitly;
    # presumably the `langfuse.api` subpackage is loaded transitively by the
    # client imports -- verify, otherwise `from langfuse import *` would fail.
    "api",
]
class Langfuse:
    """Main client for Langfuse tracing and platform features.

    This class provides an interface for creating and managing traces, spans,
    and generations in Langfuse as well as interacting with the Langfuse API.

    The client features a thread-safe singleton pattern for each unique public API key,
    ensuring consistent trace context propagation across your application. It implements
    efficient batching of spans with configurable flush settings and includes background
    thread management for media uploads and score ingestion.

    Configuration is flexible through either direct parameters or environment variables,
    with graceful fallbacks and runtime configuration updates.

    Attributes:
        api: Synchronous API client for Langfuse backend communication
        async_api: Asynchronous API client for Langfuse backend communication
        _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components

    Parameters:
        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable. An explicit `0.0` is honored rather than replaced by the environment/default value.
        mask (Optional[MaskFunction]): Function to mask sensitive data synchronously when Langfuse SDK attributes are created. This applies only to data set through Langfuse SDK APIs such as `start_observation()`, `update()`, and `set_trace_io()`.
        mask_otel_spans (Optional[MaskOtelSpansFunction]): Synchronous export-stage hook for masking raw OpenTelemetry span attributes before this Langfuse client sends them to Langfuse. Use this for spans created by third-party OpenTelemetry instrumentations, or when you need to inspect final span attributes after export filtering and Langfuse media handling. It does not modify spans already exported through other OpenTelemetry exporters.

            The hook receives one OpenTelemetry export batch. A batch is not guaranteed to contain a complete trace, request, or Langfuse observation tree. The hook usually runs on the OpenTelemetry batch span processor worker thread; during `flush()` and shutdown it may run on the caller thread. Keep it synchronous, deterministic, and fast.

            Return `None` to leave the batch unchanged. Return `MaskOtelSpansResult` with `OtelSpanPatch` values to delete or replace attributes on selected spans. If the hook raises or returns an invalid batch result, Langfuse drops the whole export batch. If one returned span patch is invalid, Langfuse drops only that span from the Langfuse export.

            Example:
                ```python
                from typing import Optional

                from langfuse import Langfuse
                from langfuse.types import (
                    MaskOtelSpansParams,
                    MaskOtelSpansResult,
                    OtelSpanPatch,
                )

                def mask_otel_spans(
                    *, params: MaskOtelSpansParams
                ) -> Optional[MaskOtelSpansResult]:
                    patches = {}

                    for identifier, span in params.spans.items():
                        if "gen_ai.prompt.0.content" in span.attributes:
                            patches[identifier] = OtelSpanPatch(
                                delete_attributes=("gen_ai.prompt.0.content",),
                                set_attributes={"masking.applied": True},
                            )

                    return MaskOtelSpansResult(span_patches=patches)

                langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
                ```
        blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:
            ```python
            from langfuse.span_filter import is_default_export_span
            blocked = {"sqlite", "requests"}

            should_export_span = lambda span: (
                is_default_export_span(span)
                and (
                    span.instrumentation_scope is None
                    or span.instrumentation_scope.name not in blocked
                )
            )
            ```
        should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If `span_exporter` is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
        tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
        id_generator (Optional[IdGenerator]): OpenTelemetry ID generator to use when Langfuse creates its own TracerProvider. If omitted, the OpenTelemetry SDK default is used. If `tracer_provider` is provided, or an OpenTelemetry TracerProvider is already registered globally, configure the ID generator on that provider instead.
        span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire `base_url`, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include `x-langfuse-ingestion-version=4` on the exporter to enable real time processing of exported spans.

    Example:
        ```python
        from langfuse import Langfuse

        # Initialize the client (reads from env vars if not provided)
        langfuse = Langfuse(
            public_key="your-public-key",
            secret_key="your-secret-key",
            base_url="https://cloud.langfuse.com",  # Optional, default shown
        )

        # Create a trace span
        with langfuse.start_as_current_observation(name="process-query") as span:
            # Your application code here

            # Create a nested generation span for an LLM call
            with span.start_as_current_generation(
                name="generate-response",
                model="gpt-4",
                input={"query": "Tell me about AI"},
                model_parameters={"temperature": 0.7, "max_tokens": 500}
            ) as generation:
                # Generate response here
                response = "AI is a field of computer science..."

                generation.update(
                    output=response,
                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
                    cost_details={"total_cost": 0.0023}
                )

                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
        ```
    """

    # Class-level defaults so the attributes exist even when __init__ returns
    # early because credentials are missing.
    _resources: Optional[LangfuseResourceManager] = None
    _mask: Optional[MaskFunction] = None
    _otel_tracer: otel_trace_api.Tracer

    def __init__(
        self,
        *,
        public_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        base_url: Optional[str] = None,
        host: Optional[str] = None,
        timeout: Optional[int] = None,
        httpx_client: Optional[httpx.Client] = None,
        debug: bool = False,
        tracing_enabled: Optional[bool] = True,
        flush_at: Optional[int] = None,
        flush_interval: Optional[float] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        media_upload_thread_count: Optional[int] = None,
        sample_rate: Optional[float] = None,
        mask: Optional[MaskFunction] = None,
        mask_otel_spans: Optional[MaskOtelSpansFunction] = None,
        blocked_instrumentation_scopes: Optional[List[str]] = None,
        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
        additional_headers: Optional[Dict[str, str]] = None,
        tracer_provider: Optional[TracerProvider] = None,
        id_generator: Optional[IdGenerator] = None,
        span_exporter: Optional[SpanExporter] = None,
    ):
        """Resolve configuration and attach the client to its resource manager.

        All parameters are keyword-only and described in the class docstring.
        When no public or secret key can be resolved, a warning is logged, a
        no-op tracer is installed and initialization stops early.

        Raises:
            ValueError: If the resolved sample rate is outside ``[0.0, 1.0]``.
        """
        # Resolution order: explicit base_url, LANGFUSE_BASE_URL, the deprecated
        # host argument, LANGFUSE_HOST, and finally the cloud default.
        self._base_url = (
            base_url
            or os.environ.get(LANGFUSE_BASE_URL)
            or host
            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
        )
        self._environment = environment or cast(
            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
        )
        self._release = (
            release
            or os.environ.get(LANGFUSE_RELEASE, None)
            or get_common_release_envs()
        )
        self._project_id: Optional[str] = None

        # Compare against None instead of using `or`: 0.0 is a valid but falsy
        # sample rate and must not be replaced by the environment/default value.
        if sample_rate is None:
            sample_rate = float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
        if not 0.0 <= sample_rate <= 1.0:
            raise ValueError(
                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
            )

        # Both None and 0 fall back to the environment/default timeout.
        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))

        # Tracing stays on only if neither the argument nor the environment
        # variable turns it off.
        self._tracing_enabled = (
            tracing_enabled
            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
        )
        if not self._tracing_enabled:
            langfuse_logger.info(
                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
            )

        # Debug logging is enabled by either the argument or the environment.
        debug = (
            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
        )
        if debug:
            logging.basicConfig(
                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            )
            langfuse_logger.setLevel(logging.DEBUG)

        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
        if public_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
            )
            # Disabled client: `api` and `async_api` are not assigned on this path.
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
        if secret_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
            )
            # Disabled client: `api` and `async_api` are not assigned on this path.
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
            langfuse_logger.warning(
                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
            )

        if blocked_instrumentation_scopes is not None:
            # stacklevel=2 attributes the warning to the code constructing the client.
            warnings.warn(
                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
                "Use `should_export_span` instead. Example: "
                "from langfuse.span_filter import is_default_export_span; "
                'blocked={"scope"}; should_export_span=lambda span: '
                "is_default_export_span(span) and (span.instrumentation_scope is None or "
                "span.instrumentation_scope.name not in blocked).",
                DeprecationWarning,
                stacklevel=2,
            )

        # Initialize api and tracer if requirements are met
        # NOTE(review): the raw `release` argument is forwarded here, not the
        # environment-resolved `self._release` -- confirm this is intended.
        self._resources = LangfuseResourceManager(
            public_key=public_key,
            secret_key=secret_key,
            base_url=self._base_url,
            timeout=timeout,
            environment=self._environment,
            release=release,
            flush_at=flush_at,
            flush_interval=flush_interval,
            httpx_client=httpx_client,
            media_upload_thread_count=media_upload_thread_count,
            sample_rate=sample_rate,
            mask=mask,
            mask_otel_spans=mask_otel_spans,
            tracing_enabled=self._tracing_enabled,
            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
            should_export_span=should_export_span,
            additional_headers=additional_headers,
            tracer_provider=tracer_provider,
            id_generator=id_generator,
            span_exporter=span_exporter,
        )
        self._mask = self._resources.mask

        # Fall back to a no-op tracer when tracing is off or the resource
        # manager did not provide a tracer.
        self._otel_tracer = (
            self._resources.tracer
            if self._tracing_enabled and self._resources.tracer is not None
            else otel_trace_api.NoOpTracer()
        )
        self.api = self._resources.api
        self.async_api = self._resources.async_api

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseGeneration: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseSpan: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseAgent: ...
466 467 @overload 468 def start_observation( 469 self, 470 *, 471 trace_context: Optional[TraceContext] = None, 472 name: str, 473 as_type: Literal["tool"], 474 input: Optional[Any] = None, 475 output: Optional[Any] = None, 476 metadata: Optional[Any] = None, 477 version: Optional[str] = None, 478 level: Optional[SpanLevel] = None, 479 status_message: Optional[str] = None, 480 ) -> LangfuseTool: ... 481 482 @overload 483 def start_observation( 484 self, 485 *, 486 trace_context: Optional[TraceContext] = None, 487 name: str, 488 as_type: Literal["chain"], 489 input: Optional[Any] = None, 490 output: Optional[Any] = None, 491 metadata: Optional[Any] = None, 492 version: Optional[str] = None, 493 level: Optional[SpanLevel] = None, 494 status_message: Optional[str] = None, 495 ) -> LangfuseChain: ... 496 497 @overload 498 def start_observation( 499 self, 500 *, 501 trace_context: Optional[TraceContext] = None, 502 name: str, 503 as_type: Literal["retriever"], 504 input: Optional[Any] = None, 505 output: Optional[Any] = None, 506 metadata: Optional[Any] = None, 507 version: Optional[str] = None, 508 level: Optional[SpanLevel] = None, 509 status_message: Optional[str] = None, 510 ) -> LangfuseRetriever: ... 511 512 @overload 513 def start_observation( 514 self, 515 *, 516 trace_context: Optional[TraceContext] = None, 517 name: str, 518 as_type: Literal["evaluator"], 519 input: Optional[Any] = None, 520 output: Optional[Any] = None, 521 metadata: Optional[Any] = None, 522 version: Optional[str] = None, 523 level: Optional[SpanLevel] = None, 524 status_message: Optional[str] = None, 525 ) -> LangfuseEvaluator: ... 
526 527 @overload 528 def start_observation( 529 self, 530 *, 531 trace_context: Optional[TraceContext] = None, 532 name: str, 533 as_type: Literal["embedding"], 534 input: Optional[Any] = None, 535 output: Optional[Any] = None, 536 metadata: Optional[Any] = None, 537 version: Optional[str] = None, 538 level: Optional[SpanLevel] = None, 539 status_message: Optional[str] = None, 540 completion_start_time: Optional[datetime] = None, 541 model: Optional[str] = None, 542 model_parameters: Optional[Dict[str, MapValue]] = None, 543 usage_details: Optional[Dict[str, int]] = None, 544 cost_details: Optional[Dict[str, float]] = None, 545 prompt: Optional[PromptClient] = None, 546 ) -> LangfuseEmbedding: ... 547 548 @overload 549 def start_observation( 550 self, 551 *, 552 trace_context: Optional[TraceContext] = None, 553 name: str, 554 as_type: Literal["guardrail"], 555 input: Optional[Any] = None, 556 output: Optional[Any] = None, 557 metadata: Optional[Any] = None, 558 version: Optional[str] = None, 559 level: Optional[SpanLevel] = None, 560 status_message: Optional[str] = None, 561 ) -> LangfuseGuardrail: ... 
562 563 def start_observation( 564 self, 565 *, 566 trace_context: Optional[TraceContext] = None, 567 name: str, 568 as_type: ObservationTypeLiteralNoEvent = "span", 569 input: Optional[Any] = None, 570 output: Optional[Any] = None, 571 metadata: Optional[Any] = None, 572 version: Optional[str] = None, 573 level: Optional[SpanLevel] = None, 574 status_message: Optional[str] = None, 575 completion_start_time: Optional[datetime] = None, 576 model: Optional[str] = None, 577 model_parameters: Optional[Dict[str, MapValue]] = None, 578 usage_details: Optional[Dict[str, int]] = None, 579 cost_details: Optional[Dict[str, float]] = None, 580 prompt: Optional[PromptClient] = None, 581 ) -> Union[ 582 LangfuseSpan, 583 LangfuseGeneration, 584 LangfuseAgent, 585 LangfuseTool, 586 LangfuseChain, 587 LangfuseRetriever, 588 LangfuseEvaluator, 589 LangfuseEmbedding, 590 LangfuseGuardrail, 591 ]: 592 """Create a new observation of the specified type. 593 594 This method creates a new observation but does not set it as the current span in the 595 context. To create and use an observation within a context, use start_as_current_observation(). 
596 597 Args: 598 trace_context: Optional context for connecting to an existing trace 599 name: Name of the observation 600 as_type: Type of observation to create (defaults to "span") 601 input: Input data for the operation 602 output: Output data from the operation 603 metadata: Additional metadata to associate with the observation 604 version: Version identifier for the code or component 605 level: Importance level of the observation 606 status_message: Optional status message for the observation 607 completion_start_time: When the model started generating (for generation types) 608 model: Name/identifier of the AI model used (for generation types) 609 model_parameters: Parameters used for the model (for generation types) 610 usage_details: Token usage information (for generation types) 611 cost_details: Cost information (for generation types) 612 prompt: Associated prompt template (for generation types) 613 614 Returns: 615 An observation object of the appropriate type that must be ended with .end() 616 """ 617 if trace_context: 618 trace_id = trace_context.get("trace_id", None) 619 parent_span_id = trace_context.get("parent_span_id", None) 620 621 if trace_id: 622 remote_parent_span = self._create_remote_parent_span( 623 trace_id=trace_id, parent_span_id=parent_span_id 624 ) 625 626 with otel_trace_api.use_span( 627 cast(otel_trace_api.Span, remote_parent_span) 628 ): 629 otel_span = self._otel_tracer.start_span(name=name) 630 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 631 632 return self._create_observation_from_otel_span( 633 otel_span=otel_span, 634 as_type=as_type, 635 input=input, 636 output=output, 637 metadata=metadata, 638 version=version, 639 level=level, 640 status_message=status_message, 641 completion_start_time=completion_start_time, 642 model=model, 643 model_parameters=model_parameters, 644 usage_details=usage_details, 645 cost_details=cost_details, 646 prompt=prompt, 647 ) 648 649 otel_span = 
self._otel_tracer.start_span(name=name) 650 651 return self._create_observation_from_otel_span( 652 otel_span=otel_span, 653 as_type=as_type, 654 input=input, 655 output=output, 656 metadata=metadata, 657 version=version, 658 level=level, 659 status_message=status_message, 660 completion_start_time=completion_start_time, 661 model=model, 662 model_parameters=model_parameters, 663 usage_details=usage_details, 664 cost_details=cost_details, 665 prompt=prompt, 666 ) 667 668 def _create_observation_from_otel_span( 669 self, 670 *, 671 otel_span: otel_trace_api.Span, 672 as_type: ObservationTypeLiteralNoEvent, 673 input: Optional[Any] = None, 674 output: Optional[Any] = None, 675 metadata: Optional[Any] = None, 676 version: Optional[str] = None, 677 level: Optional[SpanLevel] = None, 678 status_message: Optional[str] = None, 679 completion_start_time: Optional[datetime] = None, 680 model: Optional[str] = None, 681 model_parameters: Optional[Dict[str, MapValue]] = None, 682 usage_details: Optional[Dict[str, int]] = None, 683 cost_details: Optional[Dict[str, float]] = None, 684 prompt: Optional[PromptClient] = None, 685 ) -> Union[ 686 LangfuseSpan, 687 LangfuseGeneration, 688 LangfuseAgent, 689 LangfuseTool, 690 LangfuseChain, 691 LangfuseRetriever, 692 LangfuseEvaluator, 693 LangfuseEmbedding, 694 LangfuseGuardrail, 695 ]: 696 """Create the appropriate observation type from an OTEL span.""" 697 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 698 observation_class = self._get_span_class(as_type) 699 # Type ignore to prevent overloads of internal _get_span_class function, 700 # issue is that LangfuseEvent could be returned and that classes have diff. 
args 701 return observation_class( # type: ignore[return-value,call-arg] 702 otel_span=otel_span, 703 langfuse_client=self, 704 environment=self._environment, 705 release=self._release, 706 input=input, 707 output=output, 708 metadata=metadata, 709 version=version, 710 level=level, 711 status_message=status_message, 712 completion_start_time=completion_start_time, 713 model=model, 714 model_parameters=model_parameters, 715 usage_details=usage_details, 716 cost_details=cost_details, 717 prompt=prompt, 718 ) 719 else: 720 # For other types (e.g. span, guardrail), create appropriate class without generation properties 721 observation_class = self._get_span_class(as_type) 722 # Type ignore to prevent overloads of internal _get_span_class function, 723 # issue is that LangfuseEvent could be returned and that classes have diff. args 724 return observation_class( # type: ignore[return-value,call-arg] 725 otel_span=otel_span, 726 langfuse_client=self, 727 environment=self._environment, 728 release=self._release, 729 input=input, 730 output=output, 731 metadata=metadata, 732 version=version, 733 level=level, 734 status_message=status_message, 735 ) 736 # span._observation_type = as_type 737 # span._otel_span.set_attribute("langfuse.observation.type", as_type) 738 # return span 739 740 @overload 741 def start_as_current_observation( 742 self, 743 *, 744 trace_context: Optional[TraceContext] = None, 745 name: str, 746 as_type: Literal["generation"], 747 input: Optional[Any] = None, 748 output: Optional[Any] = None, 749 metadata: Optional[Any] = None, 750 version: Optional[str] = None, 751 level: Optional[SpanLevel] = None, 752 status_message: Optional[str] = None, 753 completion_start_time: Optional[datetime] = None, 754 model: Optional[str] = None, 755 model_parameters: Optional[Dict[str, MapValue]] = None, 756 usage_details: Optional[Dict[str, int]] = None, 757 cost_details: Optional[Dict[str, float]] = None, 758 prompt: Optional[PromptClient] = None, 759 end_on_exit: 
Optional[bool] = None, 760 ) -> _AgnosticContextManager[LangfuseGeneration]: ... 761 762 @overload 763 def start_as_current_observation( 764 self, 765 *, 766 trace_context: Optional[TraceContext] = None, 767 name: str, 768 as_type: Literal["span"] = "span", 769 input: Optional[Any] = None, 770 output: Optional[Any] = None, 771 metadata: Optional[Any] = None, 772 version: Optional[str] = None, 773 level: Optional[SpanLevel] = None, 774 status_message: Optional[str] = None, 775 end_on_exit: Optional[bool] = None, 776 ) -> _AgnosticContextManager[LangfuseSpan]: ... 777 778 @overload 779 def start_as_current_observation( 780 self, 781 *, 782 trace_context: Optional[TraceContext] = None, 783 name: str, 784 as_type: Literal["agent"], 785 input: Optional[Any] = None, 786 output: Optional[Any] = None, 787 metadata: Optional[Any] = None, 788 version: Optional[str] = None, 789 level: Optional[SpanLevel] = None, 790 status_message: Optional[str] = None, 791 end_on_exit: Optional[bool] = None, 792 ) -> _AgnosticContextManager[LangfuseAgent]: ... 793 794 @overload 795 def start_as_current_observation( 796 self, 797 *, 798 trace_context: Optional[TraceContext] = None, 799 name: str, 800 as_type: Literal["tool"], 801 input: Optional[Any] = None, 802 output: Optional[Any] = None, 803 metadata: Optional[Any] = None, 804 version: Optional[str] = None, 805 level: Optional[SpanLevel] = None, 806 status_message: Optional[str] = None, 807 end_on_exit: Optional[bool] = None, 808 ) -> _AgnosticContextManager[LangfuseTool]: ... 
809 810 @overload 811 def start_as_current_observation( 812 self, 813 *, 814 trace_context: Optional[TraceContext] = None, 815 name: str, 816 as_type: Literal["chain"], 817 input: Optional[Any] = None, 818 output: Optional[Any] = None, 819 metadata: Optional[Any] = None, 820 version: Optional[str] = None, 821 level: Optional[SpanLevel] = None, 822 status_message: Optional[str] = None, 823 end_on_exit: Optional[bool] = None, 824 ) -> _AgnosticContextManager[LangfuseChain]: ... 825 826 @overload 827 def start_as_current_observation( 828 self, 829 *, 830 trace_context: Optional[TraceContext] = None, 831 name: str, 832 as_type: Literal["retriever"], 833 input: Optional[Any] = None, 834 output: Optional[Any] = None, 835 metadata: Optional[Any] = None, 836 version: Optional[str] = None, 837 level: Optional[SpanLevel] = None, 838 status_message: Optional[str] = None, 839 end_on_exit: Optional[bool] = None, 840 ) -> _AgnosticContextManager[LangfuseRetriever]: ... 841 842 @overload 843 def start_as_current_observation( 844 self, 845 *, 846 trace_context: Optional[TraceContext] = None, 847 name: str, 848 as_type: Literal["evaluator"], 849 input: Optional[Any] = None, 850 output: Optional[Any] = None, 851 metadata: Optional[Any] = None, 852 version: Optional[str] = None, 853 level: Optional[SpanLevel] = None, 854 status_message: Optional[str] = None, 855 end_on_exit: Optional[bool] = None, 856 ) -> _AgnosticContextManager[LangfuseEvaluator]: ... 
857 858 @overload 859 def start_as_current_observation( 860 self, 861 *, 862 trace_context: Optional[TraceContext] = None, 863 name: str, 864 as_type: Literal["embedding"], 865 input: Optional[Any] = None, 866 output: Optional[Any] = None, 867 metadata: Optional[Any] = None, 868 version: Optional[str] = None, 869 level: Optional[SpanLevel] = None, 870 status_message: Optional[str] = None, 871 completion_start_time: Optional[datetime] = None, 872 model: Optional[str] = None, 873 model_parameters: Optional[Dict[str, MapValue]] = None, 874 usage_details: Optional[Dict[str, int]] = None, 875 cost_details: Optional[Dict[str, float]] = None, 876 prompt: Optional[PromptClient] = None, 877 end_on_exit: Optional[bool] = None, 878 ) -> _AgnosticContextManager[LangfuseEmbedding]: ... 879 880 @overload 881 def start_as_current_observation( 882 self, 883 *, 884 trace_context: Optional[TraceContext] = None, 885 name: str, 886 as_type: Literal["guardrail"], 887 input: Optional[Any] = None, 888 output: Optional[Any] = None, 889 metadata: Optional[Any] = None, 890 version: Optional[str] = None, 891 level: Optional[SpanLevel] = None, 892 status_message: Optional[str] = None, 893 end_on_exit: Optional[bool] = None, 894 ) -> _AgnosticContextManager[LangfuseGuardrail]: ... 
def start_as_current_observation(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    as_type: ObservationTypeLiteralNoEvent = "span",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
    end_on_exit: Optional[bool] = None,
) -> Union[
    _AgnosticContextManager[LangfuseGeneration],
    _AgnosticContextManager[LangfuseSpan],
    _AgnosticContextManager[LangfuseAgent],
    _AgnosticContextManager[LangfuseTool],
    _AgnosticContextManager[LangfuseChain],
    _AgnosticContextManager[LangfuseRetriever],
    _AgnosticContextManager[LangfuseEvaluator],
    _AgnosticContextManager[LangfuseEmbedding],
    _AgnosticContextManager[LangfuseGuardrail],
]:
    """Create a new observation and set it as the current span in a context manager.

    This method creates a new observation of the specified type and sets it as the
    current span within a context manager. Use this method with a 'with' statement to
    automatically handle the observation lifecycle within a code block.

    The created observation will be the child of the current span in the context.

    Args:
        trace_context: Optional context for connecting to an existing trace. The
            keys "trace_id" and "parent_span_id" are read; the remote parent is
            only used when "trace_id" is set.
        name: Name of the observation (e.g., function or operation name)
        as_type: Type of observation to create (defaults to "span")
        input: Input data for the operation (can be any JSON-serializable object)
        output: Output data from the operation (can be any JSON-serializable object)
        metadata: Additional metadata to associate with the observation
        version: Version identifier for the code or component
        level: Importance level of the observation (info, warning, error)
        status_message: Optional status message for the observation
        end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.

        The following parameters are available when as_type is: "generation" or "embedding".
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management

    Returns:
        A context manager that yields the appropriate observation type based on as_type

    Example:
        ```python
        # Create a span
        with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
            # Do work
            result = process_data()
            span.update(output=result)

            # Create a child span automatically
            with span.start_as_current_observation(name="sub-operation") as child_span:
                # Do sub-operation work
                child_span.update(output="sub-result")

        # Create a tool observation
        with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
            # Do tool work
            results = search_web(query)
            tool.update(output=results)

        # Create a generation observation
        with langfuse.start_as_current_observation(
            name="answer-generation",
            as_type="generation",
            model="gpt-4"
        ) as generation:
            # Generate answer
            response = llm.generate(...)
            generation.update(output=response)
        ```
    """
    if as_type in get_observation_types_list(ObservationTypeGenerationLike):
        if trace_context:
            trace_id = trace_context.get("trace_id", None)
            parent_span_id = trace_context.get("parent_span_id", None)

            if trace_id:
                # Attach the new observation to the externally supplied trace.
                remote_parent_span = self._create_remote_parent_span(
                    trace_id=trace_id, parent_span_id=parent_span_id
                )

                return cast(
                    Union[
                        _AgnosticContextManager[LangfuseGeneration],
                        _AgnosticContextManager[LangfuseEmbedding],
                    ],
                    self._create_span_with_parent_context(
                        as_type=as_type,
                        name=name,
                        remote_parent_span=remote_parent_span,
                        parent=None,
                        end_on_exit=end_on_exit,
                        input=input,
                        output=output,
                        metadata=metadata,
                        version=version,
                        level=level,
                        status_message=status_message,
                        completion_start_time=completion_start_time,
                        model=model,
                        model_parameters=model_parameters,
                        usage_details=usage_details,
                        cost_details=cost_details,
                        prompt=prompt,
                    ),
                )

        return cast(
            Union[
                _AgnosticContextManager[LangfuseGeneration],
                _AgnosticContextManager[LangfuseEmbedding],
            ],
            self._start_as_current_otel_span_with_processed_media(
                as_type=as_type,
                name=name,
                end_on_exit=end_on_exit,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
                completion_start_time=completion_start_time,
                model=model,
                model_parameters=model_parameters,
                usage_details=usage_details,
                cost_details=cost_details,
                prompt=prompt,
            ),
        )

    if as_type in get_observation_types_list(ObservationTypeSpanLike):
        if trace_context:
            trace_id = trace_context.get("trace_id", None)
            parent_span_id = trace_context.get("parent_span_id", None)

            if trace_id:
                # Attach the new observation to the externally supplied trace.
                remote_parent_span = self._create_remote_parent_span(
                    trace_id=trace_id, parent_span_id=parent_span_id
                )

                return cast(
                    Union[
                        _AgnosticContextManager[LangfuseSpan],
                        _AgnosticContextManager[LangfuseAgent],
                        _AgnosticContextManager[LangfuseTool],
                        _AgnosticContextManager[LangfuseChain],
                        _AgnosticContextManager[LangfuseRetriever],
                        _AgnosticContextManager[LangfuseEvaluator],
                        _AgnosticContextManager[LangfuseGuardrail],
                    ],
                    self._create_span_with_parent_context(
                        as_type=as_type,
                        name=name,
                        remote_parent_span=remote_parent_span,
                        parent=None,
                        end_on_exit=end_on_exit,
                        input=input,
                        output=output,
                        metadata=metadata,
                        version=version,
                        level=level,
                        status_message=status_message,
                    ),
                )

        return cast(
            Union[
                _AgnosticContextManager[LangfuseSpan],
                _AgnosticContextManager[LangfuseAgent],
                _AgnosticContextManager[LangfuseTool],
                _AgnosticContextManager[LangfuseChain],
                _AgnosticContextManager[LangfuseRetriever],
                _AgnosticContextManager[LangfuseEvaluator],
                _AgnosticContextManager[LangfuseGuardrail],
            ],
            self._start_as_current_otel_span_with_processed_media(
                as_type=as_type,
                name=name,
                end_on_exit=end_on_exit,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
            ),
        )

    # This should never be reached since all valid types are handled above
    langfuse_logger.warning(
        f"Unknown observation type: {as_type}, falling back to span"
    )
    return self._start_as_current_otel_span_with_processed_media(
        as_type="span",
        name=name,
        end_on_exit=end_on_exit,
        input=input,
        output=output,
        metadata=metadata,
        version=version,
        level=level,
        status_message=status_message,
    )


def _get_span_class(
    self,
    as_type: str,
) -> Union[
    Type[LangfuseAgent],
    Type[LangfuseTool],
    Type[LangfuseChain],
    Type[LangfuseRetriever],
    Type[LangfuseEvaluator],
    Type[LangfuseEmbedding],
    Type[LangfuseGuardrail],
    Type[LangfuseGeneration],
    Type[LangfuseEvent],
    Type[LangfuseSpan],
]:
    """Get the appropriate span class based on as_type.

    The lookup is case-insensitive. "span" and any unrecognized value both
    resolve to LangfuseSpan.
    """
    normalized_type = as_type.lower()

    if normalized_type == "agent":
        return LangfuseAgent
    elif normalized_type == "tool":
        return LangfuseTool
    elif normalized_type == "chain":
        return LangfuseChain
    elif normalized_type == "retriever":
        return LangfuseRetriever
    elif normalized_type == "evaluator":
        return LangfuseEvaluator
    elif normalized_type == "embedding":
        return LangfuseEmbedding
    elif normalized_type == "guardrail":
        return LangfuseGuardrail
    elif normalized_type == "generation":
        return LangfuseGeneration
    elif normalized_type == "event":
        return LangfuseEvent

    # "span" and every unknown type share the same fallback class.
    return LangfuseSpan


@staticmethod
def _get_observation_type_from_otel_span(otel_span: otel_trace_api.Span) -> str:
    """Read the Langfuse observation type stored on an OTel span.

    Returns "span" when the span is not recording, when it exposes no
    dict-like ``attributes``, when the observation-type attribute is absent,
    or when the stored value is not a string.
    """
    if not otel_span.is_recording():
        return "span"

    # Not every Span implementation exposes readable attributes.
    attributes = getattr(otel_span, "attributes", None)
    if attributes is None or not hasattr(attributes, "get"):
        return "span"

    observation_type = attributes.get(
        LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
    )

    return observation_type if isinstance(observation_type, str) else "span"


@_agnosticcontextmanager
def _create_span_with_parent_context(
    self,
    *,
    name: str,
    parent: Optional[otel_trace_api.Span] = None,
    remote_parent_span: Optional[otel_trace_api.Span] = None,
    as_type: ObservationTypeLiteralNoEvent,
    end_on_exit: Optional[bool] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> Any:
    """Start an observation as the child of an explicit parent span.

    ``parent`` takes precedence over ``remote_parent_span``. The chosen span
    is made current via ``use_span`` while the new observation is started.
    When ``remote_parent_span`` is given, the new observation's OTel span is
    additionally flagged with the AS_ROOT attribute.

    Yields:
        The Langfuse observation wrapper created for the new span.
    """
    parent_span = parent or cast(otel_trace_api.Span, remote_parent_span)

    with otel_trace_api.use_span(parent_span):
        with self._start_as_current_otel_span_with_processed_media(
            name=name,
            as_type=as_type,
            end_on_exit=end_on_exit,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
            completion_start_time=completion_start_time,
            model=model,
            model_parameters=model_parameters,
            usage_details=usage_details,
            cost_details=cost_details,
            prompt=prompt,
        ) as langfuse_span:
            if remote_parent_span is not None:
                langfuse_span._otel_span.set_attribute(
                    LangfuseOtelSpanAttributes.AS_ROOT, True
                )

            yield langfuse_span


@_agnosticcontextmanager
def _start_as_current_otel_span_with_processed_media(
    self,
    *,
    name: str,
    as_type: Optional[ObservationTypeLiteralNoEvent] = None,
    end_on_exit: Optional[bool] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> Any:
    """Start an OTel span as current and yield the matching Langfuse wrapper.

    ``end_on_exit`` defaults to True when None. While the span is recording,
    its trace id is stored in the OTel baggage context for the duration of
    the block; the context token is detached again on exit. The
    generation-specific arguments are only forwarded when the resolved
    wrapper class is LangfuseGeneration or LangfuseEmbedding.

    Yields:
        An instance of the wrapper class resolved from ``as_type``.
    """
    with self._otel_tracer.start_as_current_span(
        name=name,
        end_on_exit=end_on_exit if end_on_exit is not None else True,
    ) as otel_span:
        baggage_token = None

        if otel_span.is_recording():
            context_with_app_root_claim = _set_langfuse_trace_id_in_baggage(
                trace_id=self._get_otel_trace_id(otel_span),
                context=otel_context_api.get_current(),
            )
            baggage_token = otel_context_api.attach(context_with_app_root_claim)

        span_class = self._get_span_class(
            as_type or "generation"
        )  # default was "generation"

        try:
            common_args = {
                "otel_span": otel_span,
                "langfuse_client": self,
                "environment": self._environment,
                "release": self._release,
                "input": input,
                "output": output,
                "metadata": metadata,
                "version": version,
                "level": level,
                "status_message": status_message,
            }

            if span_class in [
                LangfuseGeneration,
                LangfuseEmbedding,
            ]:
                common_args.update(
                    {
                        "completion_start_time": completion_start_time,
                        "model": model,
                        "model_parameters": model_parameters,
                        "usage_details": usage_details,
                        "cost_details": cost_details,
                        "prompt": prompt,
                    }
                )
            # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed

            yield span_class(**common_args)  # type: ignore[arg-type]

        finally:
            # Always restore the previous context, even if the body raised.
            if baggage_token is not None:
                _detach_context_token_safely(baggage_token)


def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]:
    """Return the active OTel span, or None (with a warning) if there is none."""
    current_span = otel_trace_api.get_current_span()

    if current_span is otel_trace_api.INVALID_SPAN:
        langfuse_logger.warning(
            "Context error: No active span in current context. Operations that depend on an active span will be skipped. "
            "Ensure spans are created with start_as_current_observation() or that you're operating within an active span context."
        )
        return None

    return current_span


def update_current_generation(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> None:
    """Update the current active generation span with new information.

    This method updates the current generation span in the active context with
    additional information. It's useful for adding output, usage stats, or other
    details that become available during or after model generation.

    Nothing happens when tracing is disabled or no span is active.

    Args:
        name: The generation name
        input: Updated input data for the model
        output: Output from the model (e.g., completions)
        metadata: Additional metadata to associate with the generation
        version: Version identifier for the model or component
        level: Importance level of the generation (info, warning, error)
        status_message: Optional status message for the generation
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management

    Example:
        ```python
        with langfuse.start_as_current_observation(
            name="answer-query", as_type="generation"
        ) as generation:
            # Initial setup and API call
            response = llm.generate(...)

            # Update with results that weren't available at creation time
            langfuse.update_current_generation(
                output=response.text,
                usage_details={
                    "prompt_tokens": response.usage.prompt_tokens,
                    "completion_tokens": response.usage.completion_tokens
                }
            )
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
        )
        return

    current_otel_span = self._get_current_otel_span()

    if current_otel_span is not None:
        # Wrap the active OTel span so the generation update logic can be reused.
        generation = LangfuseGeneration(
            otel_span=current_otel_span, langfuse_client=self
        )

        if name:
            current_otel_span.update_name(name)

        generation.update(
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
            completion_start_time=completion_start_time,
            model=model,
            model_parameters=model_parameters,
            usage_details=usage_details,
            cost_details=cost_details,
            prompt=prompt,
        )


def update_current_span(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> None:
    """Update the current active span with new information.

    This method updates the current span in the active context with
    additional information. It's useful for adding outputs or metadata
    that become available during execution.

    Nothing happens when tracing is disabled or no span is active.

    Args:
        name: The span name
        input: Updated input data for the operation
        output: Output data from the operation
        metadata: Additional metadata to associate with the span
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-data") as span:
            # Initial processing
            result = process_first_part()

            # Update with intermediate results
            langfuse.update_current_span(metadata={"intermediate_result": result})

            # Continue processing
            final_result = process_second_part(result)

            # Final update
            langfuse.update_current_span(output=final_result)
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
        )
        return

    current_otel_span = self._get_current_otel_span()

    if current_otel_span is not None:
        # Use the wrapper class matching the observation type stored on the span.
        span_class = self._get_span_class(
            self._get_observation_type_from_otel_span(current_otel_span)
        )
        span = span_class(
            otel_span=current_otel_span,
            langfuse_client=self,
            environment=self._environment,
            release=self._release,
        )

        if name:
            current_otel_span.update_name(name)

        span.update(
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
        )


@deprecated(
    "Trace-level input/output is deprecated. "
    "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
    "This method will be removed in a future major version."
)
def set_current_trace_io(
    self,
    *,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
) -> None:
    """Set trace-level input and output for the current span's trace.

    .. deprecated::
        This is a legacy method for backward compatibility with Langfuse platform
        features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
        evaluators). It will be removed in a future major version.

        For setting other trace attributes (user_id, session_id, metadata, tags, version),
        use :meth:`propagate_attributes` instead.

    Nothing happens when tracing is disabled, no span is active, or the
    active span is not recording.

    Args:
        input: Input data to associate with the trace.
        output: Output data to associate with the trace.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
        )
        return

    current_otel_span = self._get_current_otel_span()

    if current_otel_span is not None and current_otel_span.is_recording():
        span_class = self._get_span_class(
            self._get_observation_type_from_otel_span(current_otel_span)
        )
        span = span_class(
            otel_span=current_otel_span,
            langfuse_client=self,
            environment=self._environment,
            release=self._release,
        )

        span.set_trace_io(
            input=input,
            output=output,
        )


def set_current_trace_as_public(self) -> None:
    """Make the current trace publicly accessible via its URL.

    When a trace is published, anyone with the trace link can view the full trace
    without needing to be logged in to Langfuse. This action cannot be undone
    programmatically - once published, the entire trace becomes public.

    This is a convenience method that publishes the trace from the currently
    active span context. Use this when you want to make a trace public from
    within a traced function without needing direct access to the span object.

    Nothing happens when tracing is disabled, no span is active, or the
    active span is not recording.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
        )
        return

    current_otel_span = self._get_current_otel_span()

    if current_otel_span is not None and current_otel_span.is_recording():
        span_class = self._get_span_class(
            self._get_observation_type_from_otel_span(current_otel_span)
        )
        span = span_class(
            otel_span=current_otel_span,
            langfuse_client=self,
            environment=self._environment,
        )

        span.set_trace_as_public()


def create_event(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> LangfuseEvent:
    """Create a new Langfuse observation of type 'EVENT'.

    The created Langfuse Event observation will be the child of the current span in the context.
    The event is started and ended at the same timestamp.

    Args:
        trace_context: Optional context for connecting to an existing trace
        name: Name of the span (e.g., function or operation name)
        input: Input data for the operation (can be any JSON-serializable object)
        output: Output data from the operation (can be any JSON-serializable object)
        metadata: Additional metadata to associate with the span
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span

    Returns:
        The Langfuse Event object

    Example:
        ```python
        event = langfuse.create_event(name="process-event")
        ```
    """
    # One timestamp is used for both start and end of the event.
    timestamp = time_ns()

    if trace_context:
        trace_id = trace_context.get("trace_id", None)
        parent_span_id = trace_context.get("parent_span_id", None)

        if trace_id:
            remote_parent_span = self._create_remote_parent_span(
                trace_id=trace_id, parent_span_id=parent_span_id
            )

            with otel_trace_api.use_span(
                cast(otel_trace_api.Span, remote_parent_span)
            ):
                otel_span = self._otel_tracer.start_span(
                    name=name, start_time=timestamp
                )
                otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                return cast(
                    LangfuseEvent,
                    LangfuseEvent(
                        otel_span=otel_span,
                        langfuse_client=self,
                        environment=self._environment,
                        release=self._release,
                        input=input,
                        output=output,
                        metadata=metadata,
                        version=version,
                        level=level,
                        status_message=status_message,
                    ).end(end_time=timestamp),
                )

    otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)

    return cast(
        LangfuseEvent,
        LangfuseEvent(
            otel_span=otel_span,
            langfuse_client=self,
            environment=self._environment,
            release=self._release,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
        ).end(end_time=timestamp),
    )


def _create_remote_parent_span(
    self, *, trace_id: str, parent_span_id: Optional[str]
) -> Any:
    """Build a non-recording parent span for a caller-supplied trace context.

    Args:
        trace_id: Hex trace id. A warning is logged when it is not 32
            lowercase hex characters.
        parent_span_id: Optional hex span id. A warning is logged when it is
            set but is not 16 lowercase hex characters.

    Returns:
        A ``NonRecordingSpan`` wrapping a sampled ``SpanContext``.

    An id that cannot be parsed as hexadecimal is replaced by a freshly
    generated one instead of raising ``ValueError`` into the caller. Ids that
    fail validation but still parse as hex are used as before.
    """
    if not self._is_valid_trace_id(trace_id):
        langfuse_logger.warning(
            f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID."
        )

    if parent_span_id and not self._is_valid_span_id(parent_span_id):
        langfuse_logger.warning(
            f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID."
        )

    try:
        int_trace_id = int(trace_id, 16)
    except ValueError:
        # Not hexadecimal at all: ignore it, as the warning above announced.
        int_trace_id = RandomIdGenerator().generate_trace_id()

    int_parent_span_id: Optional[int] = None
    if parent_span_id:
        try:
            int_parent_span_id = int(parent_span_id, 16)
        except ValueError:
            # Not hexadecimal at all: treat as if no parent span id was given.
            int_parent_span_id = None

    if int_parent_span_id is None:
        int_parent_span_id = RandomIdGenerator().generate_span_id()

    span_context = otel_trace_api.SpanContext(
        trace_id=int_trace_id,
        span_id=int_parent_span_id,
        trace_flags=otel_trace_api.TraceFlags(0x01),  # mark span as sampled
        is_remote=False,
    )

    return otel_trace_api.NonRecordingSpan(span_context)


def _is_valid_trace_id(self, trace_id: str) -> bool:
    """Return True if ``trace_id`` is exactly 32 lowercase hex characters."""
    # fullmatch, not match + "$": "$" would also accept a trailing newline.
    return bool(re.fullmatch(r"[0-9a-f]{32}", trace_id))


def _is_valid_span_id(self, span_id: str) -> bool:
    """Return True if ``span_id`` is exactly 16 lowercase hex characters."""
    # fullmatch, not match + "$": "$" would also accept a trailing newline.
    return bool(re.fullmatch(r"[0-9a-f]{16}", span_id))


def _create_observation_id(self, *, seed: Optional[str] = None) -> str:
    """Create a unique observation ID for use with Langfuse.

    This method generates a unique observation ID (span ID in OpenTelemetry terms)
    for use with various Langfuse APIs. It can either generate a random ID or
    create a deterministic ID based on a seed string.

    Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes.
    This method ensures the generated ID meets this requirement. If you need to
    correlate an external ID with a Langfuse observation ID, use the external ID as
    the seed to get a valid, deterministic observation ID.

    Args:
        seed: Optional string to use as a seed for deterministic ID generation.
             If provided, the same seed will always produce the same ID.
             If not provided, a random ID will be generated.

    Returns:
        A 16-character lowercase hexadecimal string representing the observation ID.

    Example:
        ```python
        # Generate a random observation ID
        obs_id = langfuse.create_observation_id()

        # Generate a deterministic ID based on a seed
        user_obs_id = langfuse.create_observation_id(seed="user-123-feedback")

        # Correlate an external item ID with a Langfuse observation ID
        item_id = "item-789012"
        correlated_obs_id = langfuse.create_observation_id(seed=item_id)

        # Use the ID with Langfuse APIs
        langfuse.create_score(
            name="relevance",
            value=0.95,
            trace_id=trace_id,
            observation_id=obs_id
        )
        ```
    """
    if not seed:
        span_id_int = RandomIdGenerator().generate_span_id()

        return self._format_otel_span_id(span_id_int)

    # First 8 bytes of the SHA-256 digest give 16 hex characters.
    return sha256(seed.encode("utf-8")).digest()[:8].hex()


@staticmethod
def create_trace_id(*, seed: Optional[str] = None) -> str:
    """Create a unique trace ID for use with Langfuse.

    This method generates a unique trace ID for use with various Langfuse APIs.
    It can either generate a random ID or create a deterministic ID based on
    a seed string.

    Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
    This method ensures the generated ID meets this requirement. If you need to
    correlate an external ID with a Langfuse trace ID, use the external ID as the
    seed to get a valid, deterministic Langfuse trace ID.

    Args:
        seed: Optional string to use as a seed for deterministic ID generation.
             If provided, the same seed will always produce the same ID.
             If not provided, a random ID will be generated.

    Returns:
        A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

    Example:
        ```python
        # Generate a random trace ID
        trace_id = langfuse.create_trace_id()

        # Generate a deterministic ID based on a seed
        session_trace_id = langfuse.create_trace_id(seed="session-456")

        # Correlate an external ID with a Langfuse trace ID
        external_id = "external-system-123456"
        correlated_trace_id = langfuse.create_trace_id(seed=external_id)

        # Use the ID with trace context
        with langfuse.start_as_current_observation(
            name="process-request",
            trace_context={"trace_id": trace_id}
        ) as span:
            # Operation will be part of the specific trace
            pass
        ```
    """
    if not seed:
        trace_id_int = RandomIdGenerator().generate_trace_id()

        return Langfuse._format_otel_trace_id(trace_id_int)

    # First 16 bytes of the SHA-256 digest give 32 hex characters.
    return sha256(seed.encode("utf-8")).digest()[:16].hex()


def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str:
    """Return the span's trace id as a 32-character lowercase hex string."""
    span_context = otel_span.get_span_context()

    return self._format_otel_trace_id(span_context.trace_id)


def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str:
    """Return the span's span id as a 16-character lowercase hex string."""
    span_context = otel_span.get_span_context()

    return self._format_otel_span_id(span_context.span_id)


@staticmethod
def _format_otel_span_id(span_id_int: int) -> str:
    """Format an integer span ID to a 16-character lowercase hex string.

    Internal method to convert an OpenTelemetry integer span ID to the standard
    W3C Trace Context format (16-character lowercase hex string).

    Args:
        span_id_int: 64-bit integer representing a span ID

    Returns:
        A 16-character lowercase hexadecimal string
    """
    return format(span_id_int, "016x")


@staticmethod
def _format_otel_trace_id(trace_id_int: int) -> str:
    """Format an integer trace ID to a 32-character lowercase hex string.

    Internal method to convert an OpenTelemetry integer trace ID to the standard
    W3C Trace Context format (32-character lowercase hex string).

    Args:
        trace_id_int: 128-bit integer representing a trace ID

    Returns:
        A 32-character lowercase hexadecimal string
    """
    return format(trace_id_int, "032x")


# Typing-only overload: numeric values pair with NUMERIC/BOOLEAN data types.
@overload
def create_score(
    self,
    *,
    name: str,
    value: float,
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    score_id: Optional[str] = None,
    data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
    environment: Optional[str] = None,
) -> None: ...


# Typing-only overload: string values pair with CATEGORICAL/TEXT/CORRECTION.
@overload
def create_score(
    self,
    *,
    name: str,
    value: str,
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    score_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    data_type: Optional[
        Literal["CATEGORICAL", "TEXT", "CORRECTION"]
    ] = "CATEGORICAL",
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
    environment: Optional[str] = None,
) -> None: ...
def create_score(
    self,
    *,
    name: str,
    value: Union[float, str],
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
    environment: Optional[str] = None,
) -> None:
    """Create a score for a trace, observation, session, or dataset run.

    This method creates a score for evaluating a Langfuse trace or observation. Scores can be
    used to track quality metrics, user feedback, or automated evaluations. The score is
    enqueued as a `score-create` ingestion event; nothing is sent when tracing is disabled.
    Any error raised while building or enqueueing the event is logged and swallowed, so this
    method never raises to the caller.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        session_id: ID of the Langfuse session to associate the score with
        dataset_run_id: ID of the Langfuse dataset run to associate the score with
        trace_id: ID of the Langfuse trace to associate the score with
        observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score
        timestamp: Optional timestamp for the score (defaults to current UTC time)
        environment: Optional environment override for this score. If omitted,
            the score uses the client-level environment from
            `Langfuse(environment=...)` or `LANGFUSE_TRACING_ENVIRONMENT`.
            Langfuse observation wrapper methods pass their resolved span
            environment here so scores created via `span.score()` or
            `span.score_trace()` stay grouped with the scored observation or
            trace, including request-scoped environments propagated with
            `propagate_attributes(environment=...)`.

    Example:
        ```python
        # Create a numeric score for accuracy
        langfuse.create_score(
            name="accuracy",
            value=0.92,
            trace_id="abcdef1234567890abcdef1234567890",
            data_type="NUMERIC",
            comment="High accuracy with minor irrelevant details"
        )

        # Create a categorical score for sentiment
        langfuse.create_score(
            name="sentiment",
            value="positive",
            trace_id="abcdef1234567890abcdef1234567890",
            observation_id="abcdef1234567890",
            data_type="CATEGORICAL"
        )
        ```
    """
    if not self._tracing_enabled:
        return

    score_id = score_id or self._create_observation_id()

    try:
        new_body = ScoreBody(
            id=score_id,
            sessionId=session_id,
            datasetRunId=dataset_run_id,
            traceId=trace_id,
            observationId=observation_id,
            name=name,
            value=value,
            dataType=data_type,  # type: ignore
            comment=comment,
            configId=config_id,
            # An explicit (non-empty) override wins over the client-level environment
            environment=environment or self._environment,
            metadata=metadata,
        )

        event = {
            "id": self.create_trace_id(),
            "type": "score-create",
            "timestamp": timestamp or _get_timestamp(),
            "body": new_body,
        }

        if self._resources is not None:
            # Force the score to be in sample if it was for a legacy trace ID,
            # i.e. non-32 hexchar. Scores without any trace ID (session or
            # dataset-run scores) are force-sampled as well.
            force_sample = (
                not self._is_valid_trace_id(trace_id) if trace_id else True
            )

            self._resources.add_score_task(
                event,
                force_sample=force_sample,
            )

    except Exception as e:
        # Scoring is best-effort: log with traceback, never propagate
        langfuse_logger.exception(
            f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
        )


def _create_trace_tags_via_ingestion(
    self,
    *,
    trace_id: str,
    tags: List[str],
) -> None:
    """Private helper to enqueue trace tag updates via ingestion API events.

    Builds a `trace-create` event carrying only the trace ID and its tags and
    hands it to the resource manager. Does nothing when tracing is disabled or
    when there are no tags to send. Errors are logged and swallowed.

    Args:
        trace_id: ID of the trace whose tags should be updated
        tags: Tags to attach to the trace; an empty (or missing) value is a no-op
    """
    if not self._tracing_enabled:
        return

    # Truthiness check instead of len(): also tolerates None, which would
    # otherwise raise outside the try block below.
    if not tags:
        return

    try:
        new_body = TraceBody(
            id=trace_id,
            tags=tags,
        )

        event = {
            "id": self.create_trace_id(),
            "type": "trace-create",
            "timestamp": _get_timestamp(),
            "body": new_body,
        }

        if self._resources is not None:
            self._resources.add_trace_task(event)
    except Exception as e:
        langfuse_logger.exception(
            f"Error updating trace tags: Failed to process trace update event for trace_id={trace_id}. Error: {e}"
        )


# Typing-only signature: numeric values pair with NUMERIC / BOOLEAN data types.
@overload
def score_current_span(
    self,
    *,
    name: str,
    value: float,
    score_id: Optional[str] = None,
    data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...


# Typing-only signature: string values pair with CATEGORICAL / TEXT / CORRECTION.
@overload
def score_current_span(
    self,
    *,
    name: str,
    value: str,
    score_id: Optional[str] = None,
    data_type: Optional[
        Literal["CATEGORICAL", "TEXT", "CORRECTION"]
    ] = "CATEGORICAL",
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...
def score_current_span(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Create a score for the current active span.

    This method scores the currently active span in the context. It's a convenient
    way to score the current operation without needing to know its trace and span IDs.
    If the active span has a `langfuse.environment` attribute, including one
    set by `propagate_attributes(environment=...)`, the score uses that
    environment. Otherwise it uses the client-level environment.

    Nothing happens when tracing is disabled or when there is no active span.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_generation(name="answer-query") as generation:
            # Generate answer
            response = generate_answer(...)
            generation.update(output=response)

            # Score the generation
            langfuse.score_current_span(
                name="relevance",
                value=0.85,
                data_type="NUMERIC",
                comment="Mostly relevant but contains some tangential information",
                metadata={"model": "gpt-4", "prompt_version": "v2"}
            )
        ```
    """
    # create_score() drops everything when tracing is disabled, so bail out
    # before resolving the span and logging a "Creating score" message that
    # would not be followed by an actual score.
    if not self._tracing_enabled:
        return

    current_span = self._get_current_otel_span()

    if current_span is None:
        return

    trace_id = self._get_otel_trace_id(current_span)
    observation_id = self._get_otel_span_id(current_span)

    langfuse_logger.info(
        f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
    )

    self.create_score(
        trace_id=trace_id,
        observation_id=observation_id,
        name=name,
        # The casts only steer the type checker to one create_score overload;
        # value and data_type are forwarded unchanged at runtime.
        value=cast(str, value),
        score_id=score_id,
        data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
        comment=comment,
        config_id=config_id,
        metadata=metadata,
        # Environment recorded on the span, if any, is passed as the override
        environment=get_string_span_attribute(
            current_span, LangfuseOtelSpanAttributes.ENVIRONMENT
        ),
    )


# Typing-only signature: numeric values pair with NUMERIC / BOOLEAN data types.
@overload
def score_current_trace(
    self,
    *,
    name: str,
    value: float,
    score_id: Optional[str] = None,
    data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...


# Typing-only signature: string values pair with CATEGORICAL / TEXT / CORRECTION.
@overload
def score_current_trace(
    self,
    *,
    name: str,
    value: str,
    score_id: Optional[str] = None,
    data_type: Optional[
        Literal["CATEGORICAL", "TEXT", "CORRECTION"]
    ] = "CATEGORICAL",
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...
2119 2120 def score_current_trace( 2121 self, 2122 *, 2123 name: str, 2124 value: Union[float, str], 2125 score_id: Optional[str] = None, 2126 data_type: Optional[ScoreDataType] = None, 2127 comment: Optional[str] = None, 2128 config_id: Optional[str] = None, 2129 metadata: Optional[Any] = None, 2130 ) -> None: 2131 """Create a score for the current trace. 2132 2133 This method scores the trace of the currently active span. Unlike score_current_span, 2134 this method associates the score with the entire trace rather than a specific span. 2135 It's useful for scoring overall performance or quality of the entire operation. 2136 If the active span has a `langfuse.environment` attribute, including one 2137 set by `propagate_attributes(environment=...)`, the score uses that 2138 environment. Otherwise it uses the client-level environment. 2139 2140 Args: 2141 name: Name of the score (e.g., "user_satisfaction", "overall_quality") 2142 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION) 2143 score_id: Optional custom ID for the score (auto-generated if not provided) 2144 data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION) 2145 comment: Optional comment or explanation for the score 2146 config_id: Optional ID of a score config defined in Langfuse 2147 metadata: Optional metadata to be attached to the score 2148 2149 Example: 2150 ```python 2151 with langfuse.start_as_current_observation(name="process-user-request") as span: 2152 # Process request 2153 result = process_complete_request() 2154 span.update(output=result) 2155 2156 # Score the overall trace 2157 langfuse.score_current_trace( 2158 name="overall_quality", 2159 value=0.95, 2160 data_type="NUMERIC", 2161 comment="High quality end-to-end response", 2162 metadata={"evaluator": "gpt-4", "criteria": "comprehensive"} 2163 ) 2164 ``` 2165 """ 2166 current_span = self._get_current_otel_span() 2167 2168 if current_span is not None: 2169 trace_id 
= self._get_otel_trace_id(current_span) 2170 2171 langfuse_logger.info( 2172 f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}" 2173 ) 2174 2175 self.create_score( 2176 trace_id=trace_id, 2177 name=name, 2178 value=cast(str, value), 2179 score_id=score_id, 2180 data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type), 2181 comment=comment, 2182 config_id=config_id, 2183 metadata=metadata, 2184 environment=get_string_span_attribute( 2185 current_span, LangfuseOtelSpanAttributes.ENVIRONMENT 2186 ), 2187 ) 2188 2189 def flush(self) -> None: 2190 """Force flush all pending spans and events to the Langfuse API. 2191 2192 This method manually flushes any pending spans, scores, and other events to the 2193 Langfuse API. It's useful in scenarios where you want to ensure all data is sent 2194 before proceeding, without waiting for the automatic flush interval. 2195 2196 Example: 2197 ```python 2198 # Record some spans and scores 2199 with langfuse.start_as_current_observation(name="operation") as span: 2200 # Do work... 2201 pass 2202 2203 # Ensure all data is sent to Langfuse before proceeding 2204 langfuse.flush() 2205 2206 # Continue with other work 2207 ``` 2208 """ 2209 if self._resources is not None: 2210 self._resources.flush() 2211 2212 def shutdown(self) -> None: 2213 """Shut down the Langfuse client and flush all pending data. 2214 2215 This method cleanly shuts down the Langfuse client, ensuring all pending data 2216 is flushed to the API and all background threads are properly terminated. 2217 2218 It's important to call this method when your application is shutting down to 2219 prevent data loss and resource leaks. For most applications, using the client 2220 as a context manager or relying on the automatic shutdown via atexit is sufficient. 
2221 2222 Example: 2223 ```python 2224 # Initialize Langfuse 2225 langfuse = Langfuse(public_key="...", secret_key="...") 2226 2227 # Use Langfuse throughout your application 2228 # ... 2229 2230 # When application is shutting down 2231 langfuse.shutdown() 2232 ``` 2233 """ 2234 if self._resources is not None: 2235 self._resources.shutdown() 2236 2237 def get_current_trace_id(self) -> Optional[str]: 2238 """Get the trace ID of the current active span. 2239 2240 This method retrieves the trace ID from the currently active span in the context. 2241 It can be used to get the trace ID for referencing in logs, external systems, 2242 or for creating related operations. 2243 2244 Returns: 2245 The current trace ID as a 32-character lowercase hexadecimal string, 2246 or None if there is no active span. 2247 2248 Example: 2249 ```python 2250 with langfuse.start_as_current_observation(name="process-request") as span: 2251 # Get the current trace ID for reference 2252 trace_id = langfuse.get_current_trace_id() 2253 2254 # Use it for external correlation 2255 log.info(f"Processing request with trace_id: {trace_id}") 2256 2257 # Or pass to another system 2258 external_system.process(data, trace_id=trace_id) 2259 ``` 2260 """ 2261 if not self._tracing_enabled: 2262 langfuse_logger.debug( 2263 "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode." 2264 ) 2265 return None 2266 2267 current_otel_span = self._get_current_otel_span() 2268 2269 return self._get_otel_trace_id(current_otel_span) if current_otel_span else None 2270 2271 def get_current_observation_id(self) -> Optional[str]: 2272 """Get the observation ID (span ID) of the current active span. 2273 2274 This method retrieves the observation ID from the currently active span in the context. 2275 It can be used to get the observation ID for referencing in logs, external systems, 2276 or for creating scores or other related operations. 
2277 2278 Returns: 2279 The current observation ID as a 16-character lowercase hexadecimal string, 2280 or None if there is no active span. 2281 2282 Example: 2283 ```python 2284 with langfuse.start_as_current_observation(name="process-user-query") as span: 2285 # Get the current observation ID 2286 observation_id = langfuse.get_current_observation_id() 2287 2288 # Store it for later reference 2289 cache.set(f"query_{query_id}_observation", observation_id) 2290 2291 # Process the query... 2292 ``` 2293 """ 2294 if not self._tracing_enabled: 2295 langfuse_logger.debug( 2296 "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode." 2297 ) 2298 return None 2299 2300 current_otel_span = self._get_current_otel_span() 2301 2302 return self._get_otel_span_id(current_otel_span) if current_otel_span else None 2303 2304 def _get_project_id(self) -> Optional[str]: 2305 """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for api keys.""" 2306 if not self._project_id: 2307 proj = self.api.projects.get() 2308 if not proj.data or not proj.data[0].id: 2309 return None 2310 2311 self._project_id = proj.data[0].id 2312 2313 return self._project_id 2314 2315 def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]: 2316 """Get the URL to view a trace in the Langfuse UI. 2317 2318 This method generates a URL that links directly to a trace in the Langfuse UI. 2319 It's useful for providing links in logs, notifications, or debugging tools. 2320 2321 Args: 2322 trace_id: Optional trace ID to generate a URL for. If not provided, 2323 the trace ID of the current active span will be used. 2324 2325 Returns: 2326 A URL string pointing to the trace in the Langfuse UI, 2327 or None if the project ID couldn't be retrieved or no trace ID is available. 
2328 2329 Example: 2330 ```python 2331 # Get URL for the current trace 2332 with langfuse.start_as_current_observation(name="process-request") as span: 2333 trace_url = langfuse.get_trace_url() 2334 log.info(f"Processing trace: {trace_url}") 2335 2336 # Get URL for a specific trace 2337 specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef") 2338 send_notification(f"Review needed for trace: {specific_trace_url}") 2339 ``` 2340 """ 2341 final_trace_id = trace_id or self.get_current_trace_id() 2342 if not final_trace_id: 2343 return None 2344 2345 project_id = self._get_project_id() 2346 2347 return ( 2348 f"{self._base_url}/project/{project_id}/traces/{final_trace_id}" 2349 if project_id and final_trace_id 2350 else None 2351 ) 2352 2353 def get_dataset( 2354 self, 2355 name: str, 2356 *, 2357 fetch_items_page_size: Optional[int] = 50, 2358 version: Optional[datetime] = None, 2359 ) -> "DatasetClient": 2360 """Fetch a dataset by its name. 2361 2362 Args: 2363 name: The name of the dataset to fetch. 2364 fetch_items_page_size: All items of the dataset will be fetched in chunks of this size. Defaults to 50. 2365 version: Retrieve dataset items as they existed at this specific point in time (UTC). 2366 If provided, returns the state of items at the specified UTC timestamp. 2367 If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC. 2368 2369 Returns: 2370 DatasetClient: The dataset with the given name. 
2371 """ 2372 try: 2373 langfuse_logger.debug(f"Getting datasets {name}") 2374 dataset = self.api.datasets.get(dataset_name=self._url_encode(name)) 2375 2376 dataset_items: List[DatasetItem] = [] 2377 page = 1 2378 2379 while True: 2380 new_items = self.api.dataset_items.list( 2381 dataset_name=self._url_encode(name, is_url_param=True), 2382 page=page, 2383 limit=fetch_items_page_size, 2384 version=version, 2385 ) 2386 dataset_items.extend( 2387 self._hydrate_dataset_item_media_references(item) 2388 for item in new_items.data 2389 ) 2390 2391 if new_items.meta.total_pages <= page: 2392 break 2393 2394 page += 1 2395 2396 return DatasetClient( 2397 dataset=dataset, 2398 items=dataset_items, 2399 version=version, 2400 langfuse_client=self, 2401 ) 2402 2403 except Error as e: 2404 handle_fern_exception(e) 2405 raise e 2406 2407 def get_dataset_run( 2408 self, *, dataset_name: str, run_name: str 2409 ) -> DatasetRunWithItems: 2410 """Fetch a dataset run by dataset name and run name. 2411 2412 Args: 2413 dataset_name (str): The name of the dataset. 2414 run_name (str): The name of the run. 2415 2416 Returns: 2417 DatasetRunWithItems: The dataset run with its items. 2418 """ 2419 try: 2420 return cast( 2421 DatasetRunWithItems, 2422 self.api.datasets.get_run( 2423 dataset_name=self._url_encode(dataset_name), 2424 run_name=self._url_encode(run_name), 2425 request_options=None, 2426 ), 2427 ) 2428 except Error as e: 2429 handle_fern_exception(e) 2430 raise e 2431 2432 def get_dataset_runs( 2433 self, 2434 *, 2435 dataset_name: str, 2436 page: Optional[int] = None, 2437 limit: Optional[int] = None, 2438 ) -> PaginatedDatasetRuns: 2439 """Fetch all runs for a dataset. 2440 2441 Args: 2442 dataset_name (str): The name of the dataset. 2443 page (Optional[int]): Page number, starts at 1. 2444 limit (Optional[int]): Limit of items per page. 2445 2446 Returns: 2447 PaginatedDatasetRuns: Paginated list of dataset runs. 
2448 """ 2449 try: 2450 return cast( 2451 PaginatedDatasetRuns, 2452 self.api.datasets.get_runs( 2453 dataset_name=self._url_encode(dataset_name), 2454 page=page, 2455 limit=limit, 2456 request_options=None, 2457 ), 2458 ) 2459 except Error as e: 2460 handle_fern_exception(e) 2461 raise e 2462 2463 def delete_dataset_run( 2464 self, *, dataset_name: str, run_name: str 2465 ) -> DeleteDatasetRunResponse: 2466 """Delete a dataset run and all its run items. This action is irreversible. 2467 2468 Args: 2469 dataset_name (str): The name of the dataset. 2470 run_name (str): The name of the run. 2471 2472 Returns: 2473 DeleteDatasetRunResponse: Confirmation of deletion. 2474 """ 2475 try: 2476 return cast( 2477 DeleteDatasetRunResponse, 2478 self.api.datasets.delete_run( 2479 dataset_name=self._url_encode(dataset_name), 2480 run_name=self._url_encode(run_name), 2481 request_options=None, 2482 ), 2483 ) 2484 except Error as e: 2485 handle_fern_exception(e) 2486 raise e 2487 2488 def run_experiment( 2489 self, 2490 *, 2491 name: str, 2492 run_name: Optional[str] = None, 2493 description: Optional[str] = None, 2494 data: ExperimentData, 2495 task: TaskFunction, 2496 evaluators: List[EvaluatorFunction] = [], 2497 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 2498 run_evaluators: List[RunEvaluatorFunction] = [], 2499 max_concurrency: int = 50, 2500 metadata: Optional[Dict[str, str]] = None, 2501 _dataset_version: Optional[datetime] = None, 2502 ) -> ExperimentResult: 2503 """Run an experiment on a dataset with automatic tracing and evaluation. 2504 2505 This method executes a task function on each item in the provided dataset, 2506 automatically traces all executions with Langfuse for observability, runs 2507 item-level and run-level evaluators on the outputs, and returns comprehensive 2508 results with evaluation metrics. 
2509 2510 The experiment system provides: 2511 - Automatic tracing of all task executions 2512 - Concurrent processing with configurable limits 2513 - Comprehensive error handling that isolates failures 2514 - Integration with Langfuse datasets for experiment tracking 2515 - Flexible evaluation framework supporting both sync and async evaluators 2516 2517 Args: 2518 name: Human-readable name for the experiment. Used for identification 2519 in the Langfuse UI. 2520 run_name: Optional exact name for the experiment run. If provided, this will be 2521 used as the exact dataset run name if the `data` contains Langfuse dataset items. 2522 If not provided, this will default to the experiment name appended with an ISO timestamp. 2523 description: Optional description explaining the experiment's purpose, 2524 methodology, or expected outcomes. 2525 data: Array of data items to process. Can be either: 2526 - List of dict-like items with 'input', 'expected_output', 'metadata' keys 2527 - List of Langfuse DatasetItem objects from dataset.items 2528 task: Function that processes each data item and returns output. 2529 Must accept 'item' as keyword argument and can return sync or async results. 2530 The task function signature should be: task(*, item, **kwargs) -> Any 2531 evaluators: List of functions to evaluate each item's output individually. 2532 Each evaluator receives input, output, expected_output, and metadata. 2533 Can return single Evaluation dict or list of Evaluation dicts. 2534 composite_evaluator: Optional function that creates composite scores from item-level evaluations. 2535 Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) 2536 plus the list of evaluations from item-level evaluators. Useful for weighted averages, 2537 pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics. 2538 run_evaluators: List of functions to evaluate the entire experiment run. 
2539 Each run evaluator receives all item_results and can compute aggregate metrics. 2540 Useful for calculating averages, distributions, or cross-item comparisons. 2541 max_concurrency: Maximum number of concurrent task executions (default: 50). 2542 Controls the number of items processed simultaneously. Adjust based on 2543 API rate limits and system resources. 2544 metadata: Optional metadata dictionary to attach to all experiment traces. 2545 This metadata will be included in every trace created during the experiment. 2546 If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too. 2547 2548 Returns: 2549 ExperimentResult containing: 2550 - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset. 2551 - item_results: List of results for each processed item with outputs and evaluations 2552 - run_evaluations: List of aggregate evaluation results for the entire run 2553 - experiment_id: Stable identifier for the experiment run across all items 2554 - dataset_run_id: ID of the dataset run (if using Langfuse datasets) 2555 - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable) 2556 2557 Raises: 2558 ValueError: If required parameters are missing or invalid 2559 Exception: If experiment setup fails (individual item failures are handled gracefully) 2560 2561 Examples: 2562 Basic experiment with local data: 2563 ```python 2564 def summarize_text(*, item, **kwargs): 2565 return f"Summary: {item['input'][:50]}..." 
2566 2567 def length_evaluator(*, input, output, expected_output=None, **kwargs): 2568 return { 2569 "name": "output_length", 2570 "value": len(output), 2571 "comment": f"Output contains {len(output)} characters" 2572 } 2573 2574 result = langfuse.run_experiment( 2575 name="Text Summarization Test", 2576 description="Evaluate summarization quality and length", 2577 data=[ 2578 {"input": "Long article text...", "expected_output": "Expected summary"}, 2579 {"input": "Another article...", "expected_output": "Another summary"} 2580 ], 2581 task=summarize_text, 2582 evaluators=[length_evaluator] 2583 ) 2584 2585 print(f"Processed {len(result.item_results)} items") 2586 for item_result in result.item_results: 2587 print(f"Input: {item_result.item['input']}") 2588 print(f"Output: {item_result.output}") 2589 print(f"Evaluations: {item_result.evaluations}") 2590 ``` 2591 2592 Advanced experiment with async task and multiple evaluators: 2593 ```python 2594 async def llm_task(*, item, **kwargs): 2595 # Simulate async LLM call 2596 response = await openai_client.chat.completions.create( 2597 model="gpt-4", 2598 messages=[{"role": "user", "content": item["input"]}] 2599 ) 2600 return response.choices[0].message.content 2601 2602 def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): 2603 if expected_output and expected_output.lower() in output.lower(): 2604 return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} 2605 return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} 2606 2607 def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): 2608 # Simulate toxicity check 2609 toxicity_score = check_toxicity(output) # Your toxicity checker 2610 return { 2611 "name": "toxicity", 2612 "value": toxicity_score, 2613 "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" 2614 } 2615 2616 def average_accuracy(*, item_results, **kwargs): 2617 accuracies = [ 2618 eval.value for result in item_results 
2619 for eval in result.evaluations 2620 if eval.name == "accuracy" 2621 ] 2622 return { 2623 "name": "average_accuracy", 2624 "value": sum(accuracies) / len(accuracies) if accuracies else 0, 2625 "comment": f"Average accuracy across {len(accuracies)} items" 2626 } 2627 2628 result = langfuse.run_experiment( 2629 name="LLM Safety and Accuracy Test", 2630 description="Evaluate model accuracy and safety across diverse prompts", 2631 data=test_dataset, # Your dataset items 2632 task=llm_task, 2633 evaluators=[accuracy_evaluator, toxicity_evaluator], 2634 run_evaluators=[average_accuracy], 2635 max_concurrency=5, # Limit concurrent API calls 2636 metadata={"model": "gpt-4", "temperature": 0.7} 2637 ) 2638 ``` 2639 2640 Using with Langfuse datasets: 2641 ```python 2642 # Get dataset from Langfuse 2643 dataset = langfuse.get_dataset("my-eval-dataset") 2644 2645 result = dataset.run_experiment( 2646 name="Production Model Evaluation", 2647 description="Monthly evaluation of production model performance", 2648 task=my_production_task, 2649 evaluators=[accuracy_evaluator, latency_evaluator] 2650 ) 2651 2652 # Results automatically linked to dataset in Langfuse UI 2653 print(f"View results: {result['dataset_run_url']}") 2654 ``` 2655 2656 Note: 2657 - Task and evaluator functions can be either synchronous or asynchronous 2658 - Individual item failures are logged but don't stop the experiment 2659 - All executions are automatically traced and visible in Langfuse UI 2660 - When using Langfuse datasets, results are automatically linked for easy comparison 2661 - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) 
2662 - Async execution is handled automatically with smart event loop detection 2663 """ 2664 return cast( 2665 ExperimentResult, 2666 run_async_safely( 2667 self._run_experiment_async( 2668 name=name, 2669 run_name=self._create_experiment_run_name( 2670 name=name, run_name=run_name 2671 ), 2672 description=description, 2673 data=data, 2674 task=task, 2675 evaluators=evaluators or [], 2676 composite_evaluator=composite_evaluator, 2677 run_evaluators=run_evaluators or [], 2678 max_concurrency=max_concurrency, 2679 metadata=metadata, 2680 dataset_version=_dataset_version, 2681 ), 2682 ), 2683 ) 2684 2685 async def _run_experiment_async( 2686 self, 2687 *, 2688 name: str, 2689 run_name: str, 2690 description: Optional[str], 2691 data: ExperimentData, 2692 task: TaskFunction, 2693 evaluators: List[EvaluatorFunction], 2694 composite_evaluator: Optional[CompositeEvaluatorFunction], 2695 run_evaluators: List[RunEvaluatorFunction], 2696 max_concurrency: int, 2697 metadata: Optional[Dict[str, Any]] = None, 2698 dataset_version: Optional[datetime] = None, 2699 ) -> ExperimentResult: 2700 langfuse_logger.debug( 2701 f"Starting experiment '{name}' run '{run_name}' with {len(data)} items" 2702 ) 2703 2704 shared_fallback_experiment_id = self._create_observation_id() 2705 2706 # Set up concurrency control 2707 semaphore = asyncio.Semaphore(max_concurrency) 2708 2709 # Process all items 2710 async def process_item(item: ExperimentItem) -> ExperimentItemResult: 2711 async with semaphore: 2712 return await self._process_experiment_item( 2713 item, 2714 task, 2715 evaluators, 2716 composite_evaluator, 2717 shared_fallback_experiment_id, 2718 name, 2719 run_name, 2720 description, 2721 metadata, 2722 dataset_version, 2723 ) 2724 2725 # Run all items concurrently 2726 tasks = [process_item(item) for item in data] 2727 item_results = await asyncio.gather(*tasks, return_exceptions=True) 2728 2729 # Filter out any exceptions and log errors 2730 valid_results: List[ExperimentItemResult] = 
[] 2731 for i, result in enumerate(item_results): 2732 if isinstance(result, Exception): 2733 langfuse_logger.error(f"Item {i} failed: {result}") 2734 elif isinstance(result, ExperimentItemResult): 2735 valid_results.append(result) # type: ignore 2736 2737 # Run experiment-level evaluators 2738 run_evaluations: List[Evaluation] = [] 2739 for run_evaluator in run_evaluators: 2740 try: 2741 evaluations = await _run_evaluator( 2742 run_evaluator, item_results=valid_results 2743 ) 2744 run_evaluations.extend(evaluations) 2745 except Exception as e: 2746 langfuse_logger.error(f"Run evaluator failed: {e}") 2747 2748 # Generate dataset run URL if applicable 2749 dataset_run_id = next( 2750 ( 2751 result.dataset_run_id 2752 for result in valid_results 2753 if result.dataset_run_id 2754 ), 2755 None, 2756 ) 2757 dataset_run_url = None 2758 if dataset_run_id and data: 2759 try: 2760 # Check if the first item has dataset_id (for DatasetItem objects) 2761 first_item = data[0] 2762 dataset_id = None 2763 2764 if hasattr(first_item, "dataset_id"): 2765 dataset_id = getattr(first_item, "dataset_id", None) 2766 2767 if dataset_id: 2768 project_id = self._get_project_id() 2769 2770 if project_id: 2771 dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}" 2772 2773 except Exception: 2774 pass # URL generation is optional 2775 2776 # Store run-level evaluations as scores 2777 for evaluation in run_evaluations: 2778 try: 2779 if dataset_run_id: 2780 self.create_score( 2781 dataset_run_id=dataset_run_id, 2782 name=evaluation.name or "<unknown>", 2783 value=evaluation.value, # type: ignore 2784 comment=evaluation.comment, 2785 metadata=evaluation.metadata, 2786 data_type=evaluation.data_type, # type: ignore 2787 config_id=evaluation.config_id, 2788 ) 2789 2790 except Exception as e: 2791 langfuse_logger.error(f"Failed to store run evaluation: {e}") 2792 2793 # Flush scores and traces 2794 self.flush() 2795 2796 return ExperimentResult( 
async def _process_experiment_item(
    self,
    item: ExperimentItem,
    task: Callable,
    evaluators: List[Callable],
    composite_evaluator: Optional[CompositeEvaluatorFunction],
    fallback_experiment_id: str,
    experiment_name: str,
    experiment_run_name: str,
    experiment_description: Optional[str],
    experiment_metadata: Optional[Dict[str, Any]] = None,
    dataset_version: Optional[datetime] = None,
) -> ExperimentItemResult:
    """Run the task and evaluators for one experiment item inside a single observation.

    Opens an ``"experiment-item-run"`` observation, runs ``task`` on the item,
    records input/output/metadata on the span, then runs every evaluator and
    the optional composite evaluator, storing each resulting evaluation via
    ``create_score`` against the span's trace and observation.

    Args:
        item: The experiment item; either a dict (fields read with ``.get``)
            or an object (fields read with ``getattr``). Objects exposing both
            ``id`` and ``dataset_id`` are additionally linked to a dataset run.
        task: Callable executed through ``_run_task(task, item)``.
        evaluators: Item-level evaluators, each run through ``_run_evaluator``.
        composite_evaluator: Optional callable invoked with the item fields
            plus the collected evaluations; may return a coroutine.
        fallback_experiment_id: Experiment id used when no dataset run id
            was obtained for this item.
        experiment_name: Stored under ``experiment_name`` in the span metadata.
        experiment_run_name: Stored in the span metadata and used as the
            dataset run name.
        experiment_description: Sent as the dataset run description and set
            as a span attribute when not None.
        experiment_metadata: Merged into the span metadata and sent with the
            dataset run item.
        dataset_version: Forwarded to the dataset run item creation call.

    Returns:
        ExperimentItemResult carrying the item, the task output, all
        evaluations (including composite ones), the trace id, and the dataset
        run id (None when the item was not linked to a dataset run).

    Raises:
        ValueError: If the item's input is None.
        Exception: Any error raised while preparing the item or running the
            task is recorded on the span (level ``ERROR``) and re-raised.
            Failures in dataset run item creation, evaluators, and the
            composite evaluator are logged and do not propagate.
    """
    span_name = "experiment-item-run"

    # The task, the evaluators, and the composite evaluator all run while this
    # observation is the current one.
    with self.start_as_current_observation(name=span_name) as span:
        try:
            # Items are either dicts or objects exposing the same fields as attributes.
            input_data = (
                item.get("input")
                if isinstance(item, dict)
                else getattr(item, "input", None)
            )

            if input_data is None:
                raise ValueError("Experiment Item is missing input. Skipping item.")

            expected_output = (
                item.get("expected_output")
                if isinstance(item, dict)
                else getattr(item, "expected_output", None)
            )

            item_metadata = (
                item.get("metadata")
                if isinstance(item, dict)
                else getattr(item, "metadata", None)
            )

            # Experiment-level metadata is spread last, so its keys override the
            # two name entries on collision.
            final_observation_metadata = {
                "experiment_name": experiment_name,
                "experiment_run_name": experiment_run_name,
                **(experiment_metadata or {}),
            }

            trace_id = span.trace_id
            dataset_id = None
            dataset_item_id = None
            dataset_run_id = None

            # Link to dataset run if this is a dataset item
            if hasattr(item, "id") and hasattr(item, "dataset_id"):
                try:
                    # Use sync API to avoid event loop issues when run_async_safely
                    # creates multiple event loops across different threads
                    dataset_run_item = await asyncio.to_thread(
                        self.api.dataset_run_items.create,
                        run_name=experiment_run_name,
                        run_description=experiment_description,
                        metadata=experiment_metadata,
                        dataset_item_id=item.id,  # type: ignore
                        trace_id=trace_id,
                        observation_id=span.id,
                        dataset_version=dataset_version,
                    )

                    dataset_run_id = dataset_run_item.dataset_run_id

                except Exception as e:
                    # Linking is best-effort: the item still runs, but without a
                    # dataset run id.
                    langfuse_logger.error(f"Failed to create dataset run item: {e}")

            if (
                not isinstance(item, dict)
                and hasattr(item, "dataset_id")
                and hasattr(item, "id")
            ):
                dataset_id = item.dataset_id
                dataset_item_id = item.id

                final_observation_metadata.update(
                    {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id}
                )

            # Item metadata is applied last and therefore wins over experiment
            # metadata on key collisions.
            if isinstance(item_metadata, dict):
                final_observation_metadata.update(item_metadata)

            experiment_id = dataset_run_id or fallback_experiment_id
            # Without a dataset item id, derive one from the first 16 hex chars of
            # the SHA-256 of the serialized input.
            experiment_item_id = (
                dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16]
            )
            # Only attributes with a non-None value are set on the OTel span.
            span._otel_span.set_attributes(
                {
                    k: v
                    for k, v in {
                        LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT,
                        LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description,
                        LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize(
                            expected_output
                        ),
                    }.items()
                    if v is not None
                }
            )

            # NOTE(review): `experiment_name` receives the run name here, not the
            # `experiment_name` argument — confirm this is intended.
            propagated_experiment_attributes = PropagatedExperimentAttributes(
                experiment_id=experiment_id,
                experiment_name=experiment_run_name,
                experiment_metadata=_flatten_and_serialize_metadata_values(
                    experiment_metadata
                ),
                experiment_dataset_id=dataset_id,
                experiment_item_id=experiment_item_id,
                experiment_item_metadata=_flatten_and_serialize_metadata_values(
                    item_metadata if isinstance(item_metadata, dict) else None
                ),
                experiment_item_root_observation_id=span.id,
            )

            with _propagate_attributes(experiment=propagated_experiment_attributes):
                output = await _run_task(task, item)

            span.update(
                input=input_data,
                output=output,
                metadata=final_observation_metadata,
            )

        except Exception as e:
            # Record the failure on the span, then let the caller decide how to
            # treat the failed item.
            span.update(
                output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
            )
            raise e

        # Run evaluators
        evaluations = []

        for evaluator in evaluators:
            try:
                # Same lookup as `item_metadata` above, resolved per evaluator.
                eval_metadata: Optional[Dict[str, Any]] = None

                if isinstance(item, dict):
                    eval_metadata = item.get("metadata")
                elif hasattr(item, "metadata"):
                    eval_metadata = item.metadata

                with _propagate_attributes(
                    experiment=propagated_experiment_attributes
                ):
                    eval_results = await _run_evaluator(
                        evaluator,
                        input=input_data,
                        output=output,
                        expected_output=expected_output,
                        metadata=eval_metadata,
                    )
                    # Collected before storing, so results stay in the returned
                    # list even if a create_score call below fails.
                    evaluations.extend(eval_results)

                    # Store evaluations as scores
                    for evaluation in eval_results:
                        self.create_score(
                            trace_id=trace_id,
                            observation_id=span.id,
                            name=evaluation.name,
                            value=evaluation.value,  # type: ignore
                            comment=evaluation.comment,
                            metadata=evaluation.metadata,
                            config_id=evaluation.config_id,
                            data_type=evaluation.data_type,  # type: ignore
                        )

            except Exception as e:
                # One failing evaluator (or score write) must not abort the others.
                langfuse_logger.error(f"Evaluator failed: {e}")

        # Run composite evaluator if provided and we have evaluations
        if composite_evaluator and evaluations:
            try:
                composite_eval_metadata: Optional[Dict[str, Any]] = None
                if isinstance(item, dict):
                    composite_eval_metadata = item.get("metadata")
                elif hasattr(item, "metadata"):
                    composite_eval_metadata = item.metadata

                with _propagate_attributes(
                    experiment=propagated_experiment_attributes
                ):
                    result = composite_evaluator(
                        input=input_data,
                        output=output,
                        expected_output=expected_output,
                        metadata=composite_eval_metadata,
                        evaluations=evaluations,
                    )

                    # Handle async composite evaluators
                    if asyncio.iscoroutine(result):
                        result = await result

                    # Normalize to list; any other return type leaves the list
                    # empty, so nothing is stored.
                    composite_evals: List[Evaluation] = []
                    if isinstance(result, (dict, Evaluation)):
                        composite_evals = [result]  # type: ignore
                    elif isinstance(result, list):
                        composite_evals = result  # type: ignore

                    # Store composite evaluations as scores and add to evaluations list
                    for composite_evaluation in composite_evals:
                        self.create_score(
                            trace_id=trace_id,
                            observation_id=span.id,
                            name=composite_evaluation.name,
                            value=composite_evaluation.value,  # type: ignore
                            comment=composite_evaluation.comment,
                            metadata=composite_evaluation.metadata,
                            config_id=composite_evaluation.config_id,
                            data_type=composite_evaluation.data_type,  # type: ignore
                        )
                        evaluations.append(composite_evaluation)

            except Exception as e:
                langfuse_logger.error(f"Composite evaluator failed: {e}")

        return ExperimentItemResult(
            item=item,
            output=output,
            evaluations=evaluations,
            trace_id=trace_id,
            dataset_run_id=dataset_run_id,
        )
It includes comprehensive error handling, 3078 retry logic, and resume capability for long-running evaluations. 3079 3080 Args: 3081 scope: The type of items to evaluate. Must be one of: 3082 - "traces": Evaluate complete traces with all their observations 3083 - "observations": Evaluate individual observations (spans, generations, events) 3084 mapper: Function that transforms API response objects into evaluator inputs. 3085 Receives a trace/observation object and returns an EvaluatorInputs 3086 instance with input, output, expected_output, and metadata fields. 3087 Can be sync or async. 3088 evaluators: List of evaluation functions to run on each item. Each evaluator 3089 receives the mapped inputs and returns Evaluation object(s). Evaluator 3090 failures are logged but don't stop the batch evaluation. 3091 filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples: 3092 - '{"tags": ["production"]}' 3093 - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' 3094 Default: None (fetches all items). 3095 fetch_batch_size: Number of items to fetch per API call and hold in memory. 3096 Larger values may be faster but use more memory. Default: 50. 3097 fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'. 3098 max_items: Maximum total number of items to process. If None, processes all 3099 items matching the filter. Useful for testing or limiting evaluation runs. 3100 Default: None (process all). 3101 max_concurrency: Maximum number of items to evaluate concurrently. Controls 3102 parallelism and resource usage. Default: 5. 
3103 composite_evaluator: Optional function that creates a composite score from 3104 item-level evaluations. Receives the original item and its evaluations, 3105 returns a single Evaluation. Useful for weighted averages or combined metrics. 3106 Default: None. 3107 metadata: Optional metadata dict to add to all created scores. Useful for 3108 tracking evaluation runs, versions, or other context. Default: None. 3109 max_retries: Maximum number of retry attempts for failed batch fetches. 3110 Uses exponential backoff (1s, 2s, 4s). Default: 3. 3111 verbose: If True, logs progress information to console. Useful for monitoring 3112 long-running evaluations. Default: False. 3113 resume_from: Optional resume token from a previous incomplete run. Allows 3114 continuing evaluation after interruption or failure. Default: None. 3115 3116 3117 Returns: 3118 BatchEvaluationResult containing: 3119 - total_items_fetched: Number of items fetched from API 3120 - total_items_processed: Number of items successfully evaluated 3121 - total_items_failed: Number of items that failed evaluation 3122 - total_scores_created: Scores created by item-level evaluators 3123 - total_composite_scores_created: Scores created by composite evaluator 3124 - total_evaluations_failed: Individual evaluator failures 3125 - evaluator_stats: Per-evaluator statistics (success rate, scores created) 3126 - resume_token: Token for resuming if incomplete (None if completed) 3127 - completed: True if all items processed 3128 - duration_seconds: Total execution time 3129 - failed_item_ids: IDs of items that failed 3130 - error_summary: Error types and counts 3131 - has_more_items: True if max_items reached but more exist 3132 3133 Raises: 3134 ValueError: If invalid scope is provided. 
3135 3136 Examples: 3137 Basic trace evaluation: 3138 ```python 3139 from langfuse import Langfuse, EvaluatorInputs, Evaluation 3140 3141 client = Langfuse() 3142 3143 # Define mapper to extract fields from traces 3144 def trace_mapper(trace): 3145 return EvaluatorInputs( 3146 input=trace.input, 3147 output=trace.output, 3148 expected_output=None, 3149 metadata={"trace_id": trace.id} 3150 ) 3151 3152 # Define evaluator 3153 def length_evaluator(*, input, output, expected_output, metadata): 3154 return Evaluation( 3155 name="output_length", 3156 value=len(output) if output else 0 3157 ) 3158 3159 # Run batch evaluation 3160 result = client.run_batched_evaluation( 3161 scope="traces", 3162 mapper=trace_mapper, 3163 evaluators=[length_evaluator], 3164 filter='{"tags": ["production"]}', 3165 max_items=1000, 3166 verbose=True 3167 ) 3168 3169 print(f"Processed {result.total_items_processed} traces") 3170 print(f"Created {result.total_scores_created} scores") 3171 ``` 3172 3173 Evaluation with composite scorer: 3174 ```python 3175 def accuracy_evaluator(*, input, output, expected_output, metadata): 3176 # ... evaluation logic 3177 return Evaluation(name="accuracy", value=0.85) 3178 3179 def relevance_evaluator(*, input, output, expected_output, metadata): 3180 # ... 
evaluation logic 3181 return Evaluation(name="relevance", value=0.92) 3182 3183 def composite_evaluator(*, item, evaluations): 3184 # Weighted average of evaluations 3185 weights = {"accuracy": 0.6, "relevance": 0.4} 3186 total = sum( 3187 e.value * weights.get(e.name, 0) 3188 for e in evaluations 3189 if isinstance(e.value, (int, float)) 3190 ) 3191 return Evaluation( 3192 name="composite_score", 3193 value=total, 3194 comment=f"Weighted average of {len(evaluations)} metrics" 3195 ) 3196 3197 result = client.run_batched_evaluation( 3198 scope="traces", 3199 mapper=trace_mapper, 3200 evaluators=[accuracy_evaluator, relevance_evaluator], 3201 composite_evaluator=composite_evaluator, 3202 filter='{"user_id": "important_user"}', 3203 verbose=True 3204 ) 3205 ``` 3206 3207 Handling incomplete runs with resume: 3208 ```python 3209 # Initial run that may fail or timeout 3210 result = client.run_batched_evaluation( 3211 scope="observations", 3212 mapper=obs_mapper, 3213 evaluators=[my_evaluator], 3214 max_items=10000, 3215 verbose=True 3216 ) 3217 3218 # Check if incomplete 3219 if not result.completed and result.resume_token: 3220 print(f"Processed {result.resume_token.items_processed} items before interruption") 3221 3222 # Resume from where it left off 3223 result = client.run_batched_evaluation( 3224 scope="observations", 3225 mapper=obs_mapper, 3226 evaluators=[my_evaluator], 3227 resume_from=result.resume_token, 3228 verbose=True 3229 ) 3230 3231 print(f"Total items processed: {result.total_items_processed}") 3232 ``` 3233 3234 Monitoring evaluator performance: 3235 ```python 3236 result = client.run_batched_evaluation(...) 
def auth_check(self) -> bool:
    """Verify that the configured public and secret key give access to a project.

    Returns:
        True when the projects endpoint returns at least one project.
        False when an ``AttributeError`` occurs during the check; it is
        logged as the client not being properly initialized.

    Raises:
        Exception: If no projects were found for the provided credentials.
        Error: API errors, after being handed to ``handle_fern_exception``.

    Note:
        This method is blocking. It is discouraged to use it in production code.
    """
    try:
        project_list = self.api.projects.get()
        project_count = len(project_list.data)
        langfuse_logger.debug(
            f"Auth check successful, found {project_count} projects"
        )
        if project_count == 0:
            raise Exception(
                "Auth check failed, no project found for the keys provided."
            )
    except AttributeError as init_error:
        langfuse_logger.warning(
            f"Auth check failed: Client not properly initialized. Error: {init_error}"
        )
        return False
    except Error as api_error:
        handle_fern_exception(api_error)
        raise api_error
    else:
        return True
3363 3364 Args: 3365 dataset_name: Name of the dataset in which the dataset item should be created. 3366 input: Input data. Defaults to None. Can contain any dict, list or scalar. 3367 expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar. 3368 metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar. 3369 source_trace_id: Id of the source trace. Defaults to None. 3370 source_observation_id: Id of the source observation. Defaults to None. 3371 status: Status of the dataset item. Defaults to ACTIVE for newly created items. 3372 id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets. 3373 3374 Returns: 3375 DatasetItem: The created dataset item as returned by the Langfuse API. 3376 3377 Example: 3378 ```python 3379 from langfuse import Langfuse 3380 3381 langfuse = Langfuse() 3382 3383 # Uploading items to the Langfuse dataset named "capital_cities" 3384 langfuse.create_dataset_item( 3385 dataset_name="capital_cities", 3386 input={"input": {"country": "Italy"}}, 3387 expected_output={"expected_output": "Rome"}, 3388 metadata={"foo": "bar"} 3389 ) 3390 ``` 3391 """ 3392 try: 3393 langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}") 3394 3395 # Media uploads must reference the (dataset, item) they belong to, and 3396 # the item need not exist yet — so settle on the item id up front and 3397 # reuse it for the create call below. 3398 item_id = id if id is not None else str(uuid.uuid4()) 3399 3400 # Single pass per field: swap each LangfuseMedia for its reference 3401 # string (derived from content, not the upload) and collect the media 3402 # still to upload, deduped by media id and tagged with its field. 
3403 pending_media: Dict[str, Tuple[LangfuseMedia, str]] = {} 3404 input = self._process_dataset_item_media( 3405 data=input, 3406 pending_media=pending_media, 3407 field=DatasetItemMediaReferenceField.INPUT.value, 3408 ) 3409 expected_output = self._process_dataset_item_media( 3410 data=expected_output, 3411 pending_media=pending_media, 3412 field=DatasetItemMediaReferenceField.EXPECTED_OUTPUT.value, 3413 ) 3414 metadata = self._process_dataset_item_media( 3415 data=metadata, 3416 pending_media=pending_media, 3417 field=DatasetItemMediaReferenceField.METADATA.value, 3418 ) 3419 3420 # The upload needs the dataset id, but the create API only takes the 3421 # name. Resolve it once, and only when there is actually media to 3422 # upload — a plain item pays no extra datasets.get round-trip. 3423 if pending_media: 3424 assert self._resources is not None 3425 dataset_id = self.api.datasets.get(self._url_encode(dataset_name)).id 3426 for media, field in pending_media.values(): 3427 self._resources._media_manager._upload_media_sync( 3428 media=media, 3429 dataset_id=dataset_id, 3430 dataset_item_id=item_id, 3431 field=field, 3432 ) 3433 3434 result = self.api.dataset_items.create( 3435 dataset_name=dataset_name, 3436 input=input, 3437 expected_output=expected_output, 3438 metadata=metadata, 3439 source_trace_id=source_trace_id, 3440 source_observation_id=source_observation_id, 3441 status=status, 3442 id=item_id, 3443 ) 3444 3445 return cast(DatasetItem, result) 3446 except Error as e: 3447 handle_fern_exception(e) 3448 raise e 3449 3450 def _process_dataset_item_media( 3451 self, 3452 *, 3453 data: Any, 3454 pending_media: Dict[str, Tuple[LangfuseMedia, str]], 3455 field: str, 3456 ) -> Any: 3457 """Swap each ``LangfuseMedia`` for its reference string in ``data``. 3458 3459 Each replaced media is recorded in ``pending_media`` (keyed by media id, 3460 so the same media across fields uploads once) for the caller to upload 3461 after the dataset id has been resolved. 
3462 """ 3463 if self._resources is None: 3464 return data 3465 3466 max_levels = 10 3467 3468 def _process_data_recursively( 3469 data: Any, level: int, ancestor_container_ids: set[int] 3470 ) -> Any: 3471 if isinstance(data, LangfuseMedia): 3472 reference_string = data._reference_string 3473 media_id = data._media_id 3474 if reference_string is None or media_id is None: 3475 raise ValueError( 3476 "Cannot create dataset item with invalid LangfuseMedia." 3477 ) 3478 # First field a media appears in wins; later duplicates dedupe. 3479 pending_media.setdefault(media_id, (data, field)) 3480 return reference_string 3481 3482 if isinstance(data, LangfuseMediaReference): 3483 return data.reference_string if data.reference_string else data 3484 3485 # Tuples are intentionally excluded: namedtuple subclasses can't be 3486 # rebuilt from an iterable, so media inside them is left untouched. 3487 if not isinstance(data, (list, set, frozenset, dict)): 3488 return data 3489 3490 # Container ids only protect against recursive cycles. 3491 data_id = id(data) 3492 if data_id in ancestor_container_ids or level > max_levels: 3493 return data 3494 3495 next_ancestor_container_ids = ancestor_container_ids | {data_id} 3496 3497 if isinstance(data, (list, set, frozenset)): 3498 processed = ( 3499 _process_data_recursively( 3500 item, level + 1, next_ancestor_container_ids 3501 ) 3502 for item in data 3503 ) 3504 return type(data)(processed) 3505 3506 return { 3507 key: _process_data_recursively( 3508 value, level + 1, next_ancestor_container_ids 3509 ) 3510 for key, value in data.items() 3511 } 3512 3513 return _process_data_recursively(data, 1, set()) 3514 3515 def _hydrate_dataset_item_media_references(self, item: DatasetItem) -> DatasetItem: 3516 media_references = item.media_references or [] 3517 if not media_references: 3518 return item 3519 3520 # Map the API enum member to the snake_case model attribute so this keeps 3521 # working regardless of the enum's wire value (e.g. 
def _replace_json_path_value(
    self, *, value: Any, path: str, replacement: LangfuseMediaReference
) -> Any:
    """Set ``replacement`` at ``path`` inside ``value``, falling back to ``value``.

    Returns whatever ``json_path.set_value_at_path`` returns. If that call
    raises, a warning including the path and the exception info is logged and
    the original ``value`` is returned.
    """
    try:
        hydrated = json_path.set_value_at_path(value, path, replacement)
    except Exception as hydrate_error:
        langfuse_logger.warning(
            f"Failed to hydrate dataset media reference at JSONPath {path}",
            exc_info=hydrate_error,
        )
        return value
    else:
        return hydrated
3583 3584 This method recursively traverses an object (up to max_depth) looking for media reference strings 3585 in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using 3586 the provided Langfuse client and replaces the reference string with a base64 data URI. 3587 3588 If fetching media content fails for a reference string, a warning is logged and the reference 3589 string is left unchanged. 3590 3591 Args: 3592 obj: The object to process. Can be a primitive value, array, or nested object. 3593 If the object has a __dict__ attribute, a dict will be returned instead of the original object type. 3594 resolve_with: The representation of the media content to replace the media reference string with. 3595 Currently only "base64_data_uri" is supported. 3596 max_depth: int: The maximum depth to traverse the object. Default is 10. 3597 content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5. 3598 3599 Returns: 3600 A deep copy of the input object with all media references replaced with base64 data URIs where possible. 3601 If the input object has a __dict__ attribute, a dict will be returned instead of the original object type. 3602 3603 Example: 3604 obj = { 3605 "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", 3606 "nested": { 3607 "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" 3608 } 3609 } 3610 3611 result = await LangfuseMedia.resolve_media_references(obj, langfuse_client) 3612 3613 # Result: 3614 # { 3615 # "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...", 3616 # "nested": { 3617 # "pdf": "data:application/pdf;base64,JVBERi0xLjcK..." 
# Typing-only overload: type="chat" returns a ChatPromptClient and takes a
# list of chat messages as fallback.
@overload
def get_prompt(
    self,
    name: str,
    *,
    version: Optional[int] = None,
    label: Optional[str] = None,
    type: Literal["chat"],
    cache_ttl_seconds: Optional[int] = None,
    fallback: Optional[List[ChatMessageDict]] = None,
    max_retries: Optional[int] = None,
    fetch_timeout_seconds: Optional[int] = None,
) -> ChatPromptClient: ...

# Typing-only overload: type="text" (the default) returns a TextPromptClient
# and takes a string as fallback.
@overload
def get_prompt(
    self,
    name: str,
    *,
    version: Optional[int] = None,
    label: Optional[str] = None,
    type: Literal["text"] = "text",
    cache_ttl_seconds: Optional[int] = None,
    fallback: Optional[str] = None,
    max_retries: Optional[int] = None,
    fetch_timeout_seconds: Optional[int] = None,
) -> TextPromptClient: ...

def get_prompt(
    self,
    name: str,
    *,
    version: Optional[int] = None,
    label: Optional[str] = None,
    type: Literal["chat", "text"] = "text",
    cache_ttl_seconds: Optional[int] = None,
    fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
    max_retries: Optional[int] = None,
    fetch_timeout_seconds: Optional[int] = None,
) -> PromptClient:
    """Get a prompt.

    This method first looks the prompt up in the local cache. Three cases follow:

    - No cached entry, or ``cache_ttl_seconds == 0``: the prompt is fetched from the server
      synchronously and the cache is updated. If that fetch fails and a truthy ``fallback`` was
      given, a fallback prompt client is returned; otherwise the fetch error is raised.
    - Cached entry that has expired: a background refresh is scheduled and the stale prompt is
      returned immediately. If scheduling the refresh fails, a warning is logged and the stale
      prompt is still returned.
    - Cached entry that is still fresh: it is returned as is.

    Args:
        name (str): The name of the prompt to retrieve. Must not be empty.

    Keyword Args:
        version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
        label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
        cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
        keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
        type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None. An empty string or empty list is treated like no fallback.
        max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. Values are clamped to the range 0 to 4. Retries are scheduled with `backoff.constant` (a constant wait between attempts, interval left at the library default), not exponential backoff.
        fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt (forwarded to the API client as `timeout_in_seconds`). Defaults to the default timeout set on the SDK, which is 5 seconds per default.

    Returns:
        The prompt object retrieved from the cache or directly fetched if not cached or expired of type
        - TextPromptClient, if type argument is 'text'.
        - ChatPromptClient, if type argument is 'chat'.

    Raises:
        Error: If the SDK resources are not initialized.
        ValueError: If both version and label are given, or if name is empty.
        Exception: Propagates any exception raised while fetching a prompt that is not in the cache
        (or when caching is disabled), unless a fallback is provided.
    """
    if self._resources is None:
        raise Error(
            "SDK is not correctly initialized. Check the init logs for more details."
        )
    if version is not None and label is not None:
        raise ValueError("Cannot specify both version and label at the same time.")

    if not name:
        raise ValueError("Prompt name cannot be empty.")

    cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
    # Clamp the caller's retry count to [0, 4]; None means the default of 2.
    bounded_max_retries = self._get_bounded_max_retries(
        max_retries, default_max_retries=2, max_retries_upper_bound=4
    )

    langfuse_logger.debug(f"Getting prompt '{cache_key}'")
    cached_prompt = self._resources.prompt_cache.get(cache_key)

    # A TTL of 0 forces a server fetch even when a cached entry exists.
    if cached_prompt is None or cache_ttl_seconds == 0:
        langfuse_logger.debug(
            f"Prompt '{cache_key}' not found in cache or caching disabled."
        )
        try:
            return self._fetch_prompt_and_update_cache(
                name,
                version=version,
                label=label,
                ttl_seconds=cache_ttl_seconds,
                max_retries=bounded_max_retries,
                fetch_timeout_seconds=fetch_timeout_seconds,
            )
        except Exception as e:
            # Truthiness check: an empty string or empty list does not count as
            # a fallback, so the error is re-raised below.
            if fallback:
                langfuse_logger.warning(
                    f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
                )

                # Fallback prompts report version 0 when no version was requested.
                fallback_client_args: Dict[str, Any] = {
                    "name": name,
                    "prompt": fallback,
                    "type": type,
                    "version": version or 0,
                    "config": {},
                    "labels": [label] if label else [],
                    "tags": [],
                }

                if type == "text":
                    return TextPromptClient(
                        prompt=Prompt_Text(**fallback_client_args),
                        is_fallback=True,
                    )

                if type == "chat":
                    return ChatPromptClient(
                        prompt=Prompt_Chat(**fallback_client_args),
                        is_fallback=True,
                    )

            # No usable fallback (or an unexpected `type`): surface the fetch error.
            raise e

    if cached_prompt.is_expired():
        langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
        try:
            # refresh prompt in background thread, refresh_prompt deduplicates tasks
            langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")

            def refresh_task() -> None:
                self._fetch_prompt_and_update_cache(
                    name,
                    version=version,
                    label=label,
                    ttl_seconds=cache_ttl_seconds,
                    max_retries=bounded_max_retries,
                    fetch_timeout_seconds=fetch_timeout_seconds,
                )

            # presumably schedules the task only if `cached_prompt` is still the
            # current cache entry — confirm against PromptCache
            self._resources.prompt_cache.add_refresh_prompt_task_if_current(
                cache_key,
                cached_prompt,
                refresh_task,
            )
            langfuse_logger.debug(
                f"Returning stale prompt '{cache_key}' from cache."
            )
            # return stale prompt
            return cached_prompt.value

        except Exception as e:
            langfuse_logger.warning(
                f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
            )
            # creation of refresh prompt task failed, return stale prompt
            return cached_prompt.value

    # Fresh cache hit.
    return cached_prompt.value
3836 return prompt 3837 3838 except NotFoundError as not_found_error: 3839 langfuse_logger.warning( 3840 f"Prompt '{cache_key}' not found during refresh, evicting from cache." 3841 ) 3842 if self._resources is not None: 3843 self._resources.prompt_cache.delete(cache_key) 3844 raise not_found_error 3845 3846 except Exception as e: 3847 langfuse_logger.error( 3848 f"Error while fetching prompt '{cache_key}': {str(e)}" 3849 ) 3850 raise e 3851 3852 def _get_bounded_max_retries( 3853 self, 3854 max_retries: Optional[int], 3855 *, 3856 default_max_retries: int = 2, 3857 max_retries_upper_bound: int = 4, 3858 ) -> int: 3859 if max_retries is None: 3860 return default_max_retries 3861 3862 bounded_max_retries = min( 3863 max(max_retries, 0), 3864 max_retries_upper_bound, 3865 ) 3866 3867 return bounded_max_retries 3868 3869 @overload 3870 def create_prompt( 3871 self, 3872 *, 3873 name: str, 3874 prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]], 3875 labels: List[str] = [], 3876 tags: Optional[List[str]] = None, 3877 type: Optional[Literal["chat"]], 3878 config: Optional[Any] = None, 3879 commit_message: Optional[str] = None, 3880 ) -> ChatPromptClient: ... 3881 3882 @overload 3883 def create_prompt( 3884 self, 3885 *, 3886 name: str, 3887 prompt: str, 3888 labels: List[str] = [], 3889 tags: Optional[List[str]] = None, 3890 type: Optional[Literal["text"]] = "text", 3891 config: Optional[Any] = None, 3892 commit_message: Optional[str] = None, 3893 ) -> TextPromptClient: ... 3894 3895 def create_prompt( 3896 self, 3897 *, 3898 name: str, 3899 prompt: Union[ 3900 str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]] 3901 ], 3902 labels: List[str] = [], 3903 tags: Optional[List[str]] = None, 3904 type: Optional[Literal["chat", "text"]] = "text", 3905 config: Optional[Any] = None, 3906 commit_message: Optional[str] = None, 3907 ) -> PromptClient: 3908 """Create a new prompt in Langfuse. 
3909 3910 Keyword Args: 3911 name : The name of the prompt to be created. 3912 prompt : The content of the prompt to be created. 3913 is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead. 3914 labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label. 3915 tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt. 3916 config: Additional structured data to be saved with the prompt. Defaults to None. 3917 type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text". 3918 commit_message: Optional string describing the change. 3919 3920 Returns: 3921 TextPromptClient: The prompt if type argument is 'text'. 3922 ChatPromptClient: The prompt if type argument is 'chat'. 3923 """ 3924 try: 3925 langfuse_logger.debug(f"Creating prompt {name=}, {labels=}") 3926 3927 if type == "chat": 3928 if not isinstance(prompt, list): 3929 raise ValueError( 3930 "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes." 
3931 ) 3932 request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = ( 3933 CreateChatPromptRequest( 3934 name=name, 3935 prompt=cast(Any, prompt), 3936 labels=labels, 3937 tags=tags, 3938 config=config or {}, 3939 commit_message=commit_message, 3940 type=CreateChatPromptType.CHAT, 3941 ) 3942 ) 3943 server_prompt = self.api.prompts.create(request=request) 3944 3945 if self._resources is not None: 3946 self._resources.prompt_cache.invalidate(name) 3947 3948 return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt)) 3949 3950 if not isinstance(prompt, str): 3951 raise ValueError("For 'text' type, 'prompt' must be a string.") 3952 3953 request = CreateTextPromptRequest( 3954 name=name, 3955 prompt=prompt, 3956 labels=labels, 3957 tags=tags, 3958 config=config or {}, 3959 commit_message=commit_message, 3960 ) 3961 3962 server_prompt = self.api.prompts.create(request=request) 3963 3964 if self._resources is not None: 3965 self._resources.prompt_cache.invalidate(name) 3966 3967 return TextPromptClient(prompt=cast(Prompt_Text, server_prompt)) 3968 3969 except Error as e: 3970 handle_fern_exception(e) 3971 raise e 3972 3973 def update_prompt( 3974 self, 3975 *, 3976 name: str, 3977 version: int, 3978 new_labels: List[str] = [], 3979 ) -> Any: 3980 """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name. 3981 3982 Args: 3983 name (str): The name of the prompt to update. 3984 version (int): The version number of the prompt to update. 3985 new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to []. 3986 3987 Returns: 3988 Prompt: The updated prompt from the Langfuse API. 
3989 3990 """ 3991 updated_prompt = self.api.prompt_version.update( 3992 name=self._url_encode(name), 3993 version=version, 3994 new_labels=new_labels, 3995 ) 3996 3997 if self._resources is not None: 3998 self._resources.prompt_cache.invalidate(name) 3999 4000 return updated_prompt 4001 4002 def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str: 4003 # httpx ≥ 0.28 does its own WHATWG-compliant quoting (eg. encodes bare 4004 # “%”, “?”, “#”, “|”, … in query/path parts). Re-quoting here would 4005 # double-encode, so we skip when the value is about to be sent straight 4006 # to httpx (`is_url_param=True`) and the installed version is ≥ 0.28. 4007 if is_url_param and Version(httpx.__version__) >= Version("0.28.0"): 4008 return url 4009 4010 # urllib.parse.quote does not escape slashes "/" by default; we need to add safe="" to force escaping 4011 # we need add safe="" to force escaping of slashes 4012 # This is necessary for prompts in prompt folders 4013 return urllib.parse.quote(url, safe="") 4014 4015 def clear_prompt_cache(self) -> None: 4016 """Clear the entire prompt cache, removing all cached prompts. 4017 4018 This method is useful when you want to force a complete refresh of all 4019 cached prompts, for example after major updates or when you need to 4020 ensure the latest versions are fetched from the server. 4021 """ 4022 if self._resources is not None: 4023 self._resources.prompt_cache.clear()
Main client for Langfuse tracing and platform features.
This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.
The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.
Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.
Attributes:
- api: Synchronous API client for Langfuse backend communication
- async_api: Asynchronous API client for Langfuse backend communication
- _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
- public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
- secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
- base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
- host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
- timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
- httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
- debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
- tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
- flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
- flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
- environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
- release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
- media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
- sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
- mask (Optional[MaskFunction]): Function to mask sensitive data synchronously when Langfuse SDK attributes are created. This applies only to data set through Langfuse SDK APIs such as `start_observation()`, `update()`, and `set_trace_io()`.
- mask_otel_spans (Optional[MaskOtelSpansFunction]): Synchronous export-stage hook for masking raw OpenTelemetry span attributes before this Langfuse client sends them to Langfuse. Use this for spans created by third-party OpenTelemetry instrumentations, or when you need to inspect final span attributes after export filtering and Langfuse media handling. It does not modify spans already exported through other OpenTelemetry exporters.
The hook receives one OpenTelemetry export batch. A batch is not guaranteed to contain a complete trace, request, or Langfuse observation tree. The hook usually runs on the OpenTelemetry batch span processor worker thread; during `flush()` and shutdown it may run on the caller thread. Keep it synchronous, deterministic, and fast.
Return `None` to leave the batch unchanged. Return `MaskOtelSpansResult` with `OtelSpanPatch` values to delete or replace attributes on selected spans. If the hook raises or returns an invalid batch result, Langfuse drops the whole export batch. If one returned span patch is invalid, Langfuse drops only that span from the Langfuse export.
Example:
```python
from typing import Optional

from langfuse import Langfuse
from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)


def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}
    for identifier, span in params.spans.items():
        if "gen_ai.prompt.0.content" in span.attributes:
            patches[identifier] = OtelSpanPatch(
                delete_attributes=("gen_ai.prompt.0.content",),
                set_attributes={"masking.applied": True},
            )
    return MaskOtelSpansResult(span_patches=patches)


langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
```
- blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:
```python
from langfuse.span_filter import is_default_export_span

blocked = {"sqlite", "requests"}
should_export_span = lambda span: (
    is_default_export_span(span)
    and (
        span.instrumentation_scope is None
        or span.instrumentation_scope.name not in blocked
    )
)
```
- should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with
`gen_ai.*` attributes, and known LLM instrumentation scopes).
- additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If `span_exporter` is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
- tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
- id_generator (Optional[IdGenerator]): OpenTelemetry ID generator to use when Langfuse creates its own TracerProvider. If omitted, the OpenTelemetry SDK default is used. If `tracer_provider` is provided, or an OpenTelemetry TracerProvider is already registered globally, configure the ID generator on that provider instead.
- span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire `base_url`, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include `x-langfuse-ingestion-version=4` on the exporter to enable real time processing of exported spans.
Example:
```python
from langfuse import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    base_url="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_observation(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
```
def __init__(
    self,
    *,
    public_key: Optional[str] = None,
    secret_key: Optional[str] = None,
    base_url: Optional[str] = None,
    host: Optional[str] = None,
    timeout: Optional[int] = None,
    httpx_client: Optional[httpx.Client] = None,
    debug: bool = False,
    tracing_enabled: Optional[bool] = True,
    flush_at: Optional[int] = None,
    flush_interval: Optional[float] = None,
    environment: Optional[str] = None,
    release: Optional[str] = None,
    media_upload_thread_count: Optional[int] = None,
    sample_rate: Optional[float] = None,
    mask: Optional[MaskFunction] = None,
    mask_otel_spans: Optional[MaskOtelSpansFunction] = None,
    blocked_instrumentation_scopes: Optional[List[str]] = None,
    should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
    additional_headers: Optional[Dict[str, str]] = None,
    tracer_provider: Optional[TracerProvider] = None,
    id_generator: Optional[IdGenerator] = None,
    span_exporter: Optional[SpanExporter] = None,
):
    """Resolve the configuration and wire up the API clients and tracer.

    Settings are resolved from the keyword arguments and the ``LANGFUSE_*``
    environment variables. When no public key or no secret key can be
    resolved, a warning is logged, a no-op tracer is installed and the
    method returns before the resource manager and API clients are created.

    Raises:
        ValueError: If the resolved sample rate is outside ``[0.0, 1.0]``.
    """
    # Resolution order: `base_url` argument, LANGFUSE_BASE_URL, the deprecated
    # `host` argument, LANGFUSE_HOST, and finally the Langfuse Cloud URL.
    self._base_url = (
        base_url
        or os.environ.get(LANGFUSE_BASE_URL)
        or host
        or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
    )
    self._environment = environment or cast(
        str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
    )
    self._release = (
        release
        or os.environ.get(LANGFUSE_RELEASE, None)
        or get_common_release_envs()
    )
    self._project_id: Optional[str] = None

    # Compare against None rather than relying on truthiness: an explicit
    # `sample_rate=0.0` is inside the accepted range and must not be replaced
    # by the environment value or the 1.0 default.
    if sample_rate is None:
        sample_rate = float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
    if not 0.0 <= sample_rate <= 1.0:
        raise ValueError(
            f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
        )

    timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))

    # Tracing stays on only if neither the argument nor the environment
    # variable turns it off.
    self._tracing_enabled = (
        tracing_enabled
        and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
    )
    if not self._tracing_enabled:
        langfuse_logger.info(
            "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
        )

    # Debug logging is enabled by either the argument or the environment.
    debug = (
        debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
    )
    if debug:
        logging.basicConfig(
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )
        langfuse_logger.setLevel(logging.DEBUG)

    # Without credentials the client is left disabled: install a no-op tracer
    # and stop before any resources are created.
    public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
    if public_key is None:
        langfuse_logger.warning(
            "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
            "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
        )
        self._otel_tracer = otel_trace_api.NoOpTracer()
        return

    secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
    if secret_key is None:
        langfuse_logger.warning(
            "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
            "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
        )
        self._otel_tracer = otel_trace_api.NoOpTracer()
        return

    if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
        langfuse_logger.warning(
            "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
        )

    if blocked_instrumentation_scopes is not None:
        # stacklevel=2 attributes the warning to the code constructing the
        # client rather than to this method.
        warnings.warn(
            "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
            "Use `should_export_span` instead. Example: "
            "from langfuse.span_filter import is_default_export_span; "
            'blocked={"scope"}; should_export_span=lambda span: '
            "is_default_export_span(span) and (span.instrumentation_scope is None or "
            "span.instrumentation_scope.name not in blocked).",
            DeprecationWarning,
            stacklevel=2,
        )

    # Initialize api and tracer if requirements are met
    # NOTE(review): the raw `release` argument is forwarded here, not the
    # resolved `self._release` -- confirm the resource manager applies the
    # same environment fallbacks.
    self._resources = LangfuseResourceManager(
        public_key=public_key,
        secret_key=secret_key,
        base_url=self._base_url,
        timeout=timeout,
        environment=self._environment,
        release=release,
        flush_at=flush_at,
        flush_interval=flush_interval,
        httpx_client=httpx_client,
        media_upload_thread_count=media_upload_thread_count,
        sample_rate=sample_rate,
        mask=mask,
        mask_otel_spans=mask_otel_spans,
        tracing_enabled=self._tracing_enabled,
        blocked_instrumentation_scopes=blocked_instrumentation_scopes,
        should_export_span=should_export_span,
        additional_headers=additional_headers,
        tracer_provider=tracer_provider,
        id_generator=id_generator,
        span_exporter=span_exporter,
    )
    self._mask = self._resources.mask

    # Fall back to a no-op tracer when tracing is disabled or the resource
    # manager did not provide a tracer.
    self._otel_tracer = (
        self._resources.tracer
        if self._tracing_enabled and self._resources.tracer is not None
        else otel_trace_api.NoOpTracer()
    )
    self.api = self._resources.api
    self.async_api = self._resources.async_api
def start_observation(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    as_type: ObservationTypeLiteralNoEvent = "span",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> Union[
    LangfuseSpan,
    LangfuseGeneration,
    LangfuseAgent,
    LangfuseTool,
    LangfuseChain,
    LangfuseRetriever,
    LangfuseEvaluator,
    LangfuseEmbedding,
    LangfuseGuardrail,
]:
    """Start an observation of the requested type without activating it.

    The observation is created but not set as the current span in the
    context. Use start_as_current_observation() when the observation should
    become the active span for a block of code.

    Args:
        trace_context: Optional context for connecting to an existing trace.
            Only used when it carries a ``trace_id``.
        name: Name of the observation
        as_type: Type of observation to create (defaults to "span")
        input: Input data for the operation
        output: Output data from the operation
        metadata: Additional metadata to associate with the observation
        version: Version identifier for the code or component
        level: Importance level of the observation
        status_message: Optional status message for the observation
        completion_start_time: When the model started generating (for generation types)
        model: Name/identifier of the AI model used (for generation types)
        model_parameters: Parameters used for the model (for generation types)
        usage_details: Token usage information (for generation types)
        cost_details: Cost information (for generation types)
        prompt: Associated prompt template (for generation types)

    Returns:
        An observation object of the appropriate type that must be ended with .end()
    """
    # Everything except the OTel span itself is forwarded unchanged, so the
    # keyword set is assembled once and reused by both code paths.
    observation_kwargs: Dict[str, Any] = {
        "as_type": as_type,
        "input": input,
        "output": output,
        "metadata": metadata,
        "version": version,
        "level": level,
        "status_message": status_message,
        "completion_start_time": completion_start_time,
        "model": model,
        "model_parameters": model_parameters,
        "usage_details": usage_details,
        "cost_details": cost_details,
        "prompt": prompt,
    }

    if trace_context:
        linked_trace_id = trace_context.get("trace_id", None)
        linked_parent_id = trace_context.get("parent_span_id", None)

        if linked_trace_id:
            remote_parent = self._create_remote_parent_span(
                trace_id=linked_trace_id, parent_span_id=linked_parent_id
            )

            # Start the span while the remote parent is the active span and
            # flag it with the AS_ROOT attribute.
            with otel_trace_api.use_span(
                cast(otel_trace_api.Span, remote_parent)
            ):
                root_span = self._otel_tracer.start_span(name=name)
                root_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                return self._create_observation_from_otel_span(
                    otel_span=root_span,
                    **observation_kwargs,
                )

    # No trace context, or one without a trace id: start the span in
    # whatever context is currently active.
    plain_span = self._otel_tracer.start_span(name=name)

    return self._create_observation_from_otel_span(
        otel_span=plain_span,
        **observation_kwargs,
    )
Create a new observation of the specified type.
This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation
- status_message: Optional status message for the observation
- completion_start_time: When the model started generating (for generation types)
- model: Name/identifier of the AI model used (for generation types)
- model_parameters: Parameters used for the model (for generation types)
- usage_details: Token usage information (for generation types)
- cost_details: Cost information (for generation types)
- prompt: Associated prompt template (for generation types)
Returns:
An observation object of the appropriate type that must be ended with .end()
def start_as_current_observation(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    as_type: ObservationTypeLiteralNoEvent = "span",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
    end_on_exit: Optional[bool] = None,
) -> Union[
    _AgnosticContextManager[LangfuseGeneration],
    _AgnosticContextManager[LangfuseSpan],
    _AgnosticContextManager[LangfuseAgent],
    _AgnosticContextManager[LangfuseTool],
    _AgnosticContextManager[LangfuseChain],
    _AgnosticContextManager[LangfuseRetriever],
    _AgnosticContextManager[LangfuseEvaluator],
    _AgnosticContextManager[LangfuseEmbedding],
    _AgnosticContextManager[LangfuseGuardrail],
]:
    """Open an observation as the current span for the duration of a `with` block.

    An observation of the requested type is created and made the current
    span inside the returned context manager, so the observation lifecycle
    is tied to the code block it wraps.

    The created observation will be the child of the current span in the
    context, unless `trace_context` supplies a trace id, in which case it is
    attached to a remote parent built from that context.

    Args:
        trace_context: Optional context for connecting to an existing trace
        name: Name of the observation (e.g., function or operation name)
        as_type: Type of observation to create (defaults to "span")
        input: Input data for the operation (can be any JSON-serializable object)
        output: Output data from the operation (can be any JSON-serializable object)
        metadata: Additional metadata to associate with the observation
        version: Version identifier for the code or component
        level: Importance level of the observation (info, warning, error)
        status_message: Optional status message for the observation
        end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.

        The following parameters are only forwarded for generation-like types
        (e.g. "generation" or "embedding"):
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management

    Returns:
        A context manager that yields the appropriate observation type based on as_type

    Example:
        ```python
        # Create a span
        with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
            # Do work
            result = process_data()
            span.update(output=result)

            # Create a child span automatically
            with span.start_as_current_observation(name="sub-operation") as child_span:
                # Do sub-operation work
                child_span.update(output="sub-result")

        # Create a tool observation
        with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
            # Do tool work
            results = search_web(query)
            tool.update(output=results)

        # Create a generation observation
        with langfuse.start_as_current_observation(
            name="answer-generation",
            as_type="generation",
            model="gpt-4"
        ) as generation:
            # Generate answer
            response = llm.generate(...)
            generation.update(output=response)
        ```
    """
    # Fields accepted by every observation type.
    shared_fields: Dict[str, Any] = {
        "name": name,
        "end_on_exit": end_on_exit,
        "input": input,
        "output": output,
        "metadata": metadata,
        "version": version,
        "level": level,
        "status_message": status_message,
    }
    # Fields that are only forwarded for generation-like observation types.
    generation_fields: Dict[str, Any] = {
        "completion_start_time": completion_start_time,
        "model": model,
        "model_parameters": model_parameters,
        "usage_details": usage_details,
        "cost_details": cost_details,
        "prompt": prompt,
    }

    def _open(extra_fields: Dict[str, Any]) -> Any:
        # Attach to a remote parent when the trace context carries a trace
        # id; otherwise start under whatever span is currently active.
        if trace_context:
            linked_trace_id = trace_context.get("trace_id", None)
            linked_parent_id = trace_context.get("parent_span_id", None)

            if linked_trace_id:
                remote_parent = self._create_remote_parent_span(
                    trace_id=linked_trace_id, parent_span_id=linked_parent_id
                )
                return self._create_span_with_parent_context(
                    as_type=as_type,
                    remote_parent_span=remote_parent,
                    parent=None,
                    **shared_fields,
                    **extra_fields,
                )

        return self._start_as_current_otel_span_with_processed_media(
            as_type=as_type,
            **shared_fields,
            **extra_fields,
        )

    if as_type in get_observation_types_list(ObservationTypeGenerationLike):
        return cast(
            Union[
                _AgnosticContextManager[LangfuseGeneration],
                _AgnosticContextManager[LangfuseEmbedding],
            ],
            _open(generation_fields),
        )

    if as_type in get_observation_types_list(ObservationTypeSpanLike):
        return cast(
            Union[
                _AgnosticContextManager[LangfuseSpan],
                _AgnosticContextManager[LangfuseAgent],
                _AgnosticContextManager[LangfuseTool],
                _AgnosticContextManager[LangfuseChain],
                _AgnosticContextManager[LangfuseRetriever],
                _AgnosticContextManager[LangfuseEvaluator],
                _AgnosticContextManager[LangfuseGuardrail],
            ],
            _open({}),
        )

    # This should never be reached since all valid types are handled above
    langfuse_logger.warning(
        f"Unknown observation type: {as_type}, falling back to span"
    )
    return self._start_as_current_otel_span_with_processed_media(
        as_type="span",
        **shared_fields,
    )
Create a new observation and set it as the current span in a context manager.
This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.
The created observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation (e.g., function or operation name)
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation (info, warning, error)
- status_message: Optional status message for the observation
- end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
- The following parameters are available when as_type is: "generation" or "embedding".
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Returns:
A context manager that yields the appropriate observation type based on as_type
Example:
# Create a span with langfuse.start_as_current_observation(name="process-query", as_type="span") as span: # Do work result = process_data() span.update(output=result) # Create a child span automatically with span.start_as_current_observation(name="sub-operation") as child_span: # Do sub-operation work child_span.update(output="sub-result") # Create a tool observation with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool: # Do tool work results = search_web(query) tool.update(output=results) # Create a generation observation with langfuse.start_as_current_observation( name="answer-generation", as_type="generation", model="gpt-4" ) as generation: # Generate answer response = llm.generate(...) generation.update(output=response)
def update_current_generation(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> None:
    """Attach new information to the generation that is currently active.

    The span returned by ``_get_current_otel_span()`` is wrapped in a
    ``LangfuseGeneration`` and updated with the supplied fields. Typical use
    is recording output, usage, or cost once the model call has finished.
    The call is a no-op when tracing is disabled or when there is no
    current span.

    Args:
        name: The generation name. Only applied when truthy.
        input: Updated input data for the model
        output: Output from the model (e.g., completions)
        metadata: Additional metadata to associate with the generation
        version: Version identifier for the model or component
        level: Importance level of the generation (info, warning, error)
        status_message: Optional status message for the generation
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management

    Example:
        ```python
        with langfuse.start_as_current_generation(name="answer-query") as generation:
            # Initial setup and API call
            response = llm.generate(...)

            # Update with results that weren't available at creation time
            langfuse.update_current_generation(
                output=response.text,
                usage_details={
                    "prompt_tokens": response.usage.prompt_tokens,
                    "completion_tokens": response.usage.completion_tokens
                }
            )
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None:
        # Nothing is active in this context, so there is nothing to update.
        return

    # The active span is wrapped as a generation without inspecting its
    # observation type.
    wrapper = LangfuseGeneration(otel_span=active_span, langfuse_client=self)

    # Renaming happens on the raw OTel span, not through the wrapper.
    if name:
        active_span.update_name(name)

    wrapper.update(
        input=input,
        output=output,
        metadata=metadata,
        version=version,
        level=level,
        status_message=status_message,
        completion_start_time=completion_start_time,
        model=model,
        model_parameters=model_parameters,
        usage_details=usage_details,
        cost_details=cost_details,
        prompt=prompt,
    )
Update the current active generation span with new information.
This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.
Arguments:
- name: The generation name
- input: Updated input data for the model
- output: Output from the model (e.g., completions)
- metadata: Additional metadata to associate with the generation
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation: # Initial setup and API call response = llm.generate(...) # Update with results that weren't available at creation time langfuse.update_current_generation( output=response.text, usage_details={ "prompt_tokens": response.usage.prompt_tokens, "completion_tokens": response.usage.completion_tokens } )
def update_current_span(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> None:
    """Attach new information to the span that is currently active.

    The current OTel span is wrapped in the class returned by
    ``_get_span_class`` for the span's observation type, then updated with
    the supplied fields. The call is a no-op when tracing is disabled or
    when there is no current span.

    Args:
        name: The span name. Only applied when truthy.
        input: Updated input data for the operation
        output: Output data from the operation
        metadata: Additional metadata to associate with the span
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-data") as span:
            # Initial processing
            result = process_first_part()

            # Update with intermediate results
            langfuse.update_current_span(metadata={"intermediate_result": result})

            # Continue processing
            final_result = process_second_part(result)

            # Final update
            langfuse.update_current_span(output=final_result)
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None:
        return

    # Pick the wrapper class from the observation type recorded on the span.
    observation_type = self._get_observation_type_from_otel_span(active_span)
    wrapper_cls = self._get_span_class(observation_type)
    wrapper = wrapper_cls(
        otel_span=active_span,
        langfuse_client=self,
        environment=self._environment,
        release=self._release,
    )

    # Renaming happens on the raw OTel span, not through the wrapper.
    if name:
        active_span.update_name(name)

    wrapper.update(
        input=input,
        output=output,
        metadata=metadata,
        version=version,
        level=level,
        status_message=status_message,
    )
Update the current active span with new information.
This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.
Arguments:
- name: The span name
- input: Updated input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Example:
with langfuse.start_as_current_observation(name="process-data") as span: # Initial processing result = process_first_part() # Update with intermediate results langfuse.update_current_span(metadata={"intermediate_result": result}) # Continue processing final_result = process_second_part(result) # Final update langfuse.update_current_span(output=final_result)
@deprecated(
    "Trace-level input/output is deprecated. "
    "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
    "This method will be removed in a future major version."
)
def set_current_trace_io(
    self,
    *,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
) -> None:
    """Set trace-level input and output via the currently active span.

    .. deprecated::
        Legacy method kept for backward compatibility with Langfuse platform
        features that still rely on trace-level input/output (e.g., legacy
        LLM-as-a-judge evaluators). It will be removed in a future major version.

        For setting other trace attributes (user_id, session_id, metadata, tags, version),
        use :meth:`propagate_attributes` instead.

    The call is a no-op when tracing is disabled, when there is no current
    span, or when the current span is not recording.

    Args:
        input: Input data to associate with the trace.
        output: Output data to associate with the trace.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None or not active_span.is_recording():
        return

    # Wrap the raw span in the class matching its observation type so the
    # wrapper's trace-level setter can be used.
    observation_type = self._get_observation_type_from_otel_span(active_span)
    wrapper_cls = self._get_span_class(observation_type)
    wrapper = wrapper_cls(
        otel_span=active_span,
        langfuse_client=self,
        environment=self._environment,
        release=self._release,
    )
    wrapper.set_trace_io(input=input, output=output)
Set trace-level input and output for the current span's trace.
Deprecated: This is a legacy method for backward compatibility with Langfuse platform features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future major version.
For setting other trace attributes (user_id, session_id, metadata, tags, version),
use propagate_attributes() instead.
Arguments:
- input: Input data to associate with the trace.
- output: Output data to associate with the trace.
def set_current_trace_as_public(self) -> None:
    """Publish the trace of the currently active span.

    Once a trace is published, anyone with the trace link can view the full
    trace without being logged in to Langfuse. This cannot be undone
    programmatically: once published, the entire trace is public.

    This is a convenience for publishing from inside a traced function
    without holding a reference to the span object. The call is a no-op when
    tracing is disabled, when there is no current span, or when the current
    span is not recording.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None or not active_span.is_recording():
        return

    observation_type = self._get_observation_type_from_otel_span(active_span)
    wrapper_cls = self._get_span_class(observation_type)
    # NOTE(review): unlike set_current_trace_io / update_current_span, no
    # `release` is forwarded to the wrapper here - confirm this is intentional.
    wrapper = wrapper_cls(
        otel_span=active_span,
        langfuse_client=self,
        environment=self._environment,
    )
    wrapper.set_trace_as_public()
Make the current trace publicly accessible via its URL.
When a trace is published, anyone with the trace link can view the full trace without needing to be logged in to Langfuse. This action cannot be undone programmatically - once published, the entire trace becomes public.
This is a convenience method that publishes the trace from the currently active span context. Use this when you want to make a trace public from within a traced function without needing direct access to the span object.
def create_event(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> LangfuseEvent:
    """Create a new Langfuse observation of type 'EVENT'.

    The event's span is started and ended with the same timestamp. When
    ``trace_context`` carries a truthy ``trace_id``, the span is started
    under a remote parent built from that context and flagged with the
    ``AS_ROOT`` attribute. Otherwise it is started from the tracer directly,
    so it becomes the child of the current span in the context.

    Args:
        trace_context: Optional context for connecting to an existing trace
        name: Name of the span (e.g., function or operation name)
        input: Input data for the operation (can be any JSON-serializable object)
        output: Output data from the operation (can be any JSON-serializable object)
        metadata: Additional metadata to associate with the span
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span

    Returns:
        The Langfuse Event object

    Example:
        ```python
        event = langfuse.create_event(name="process-event")
        ```
    """
    # One timestamp serves as both start and end time of the event span.
    now_ns = time_ns()

    # Constructor arguments common to both code paths below.
    event_fields = dict(
        langfuse_client=self,
        environment=self._environment,
        release=self._release,
        input=input,
        output=output,
        metadata=metadata,
        version=version,
        level=level,
        status_message=status_message,
    )

    if trace_context:
        trace_id = trace_context.get("trace_id")
        parent_span_id = trace_context.get("parent_span_id")

        if trace_id:
            remote_parent = self._create_remote_parent_span(
                trace_id=trace_id, parent_span_id=parent_span_id
            )

            # The event is built and ended while the remote parent is the
            # current span, matching where the span itself was started.
            with otel_trace_api.use_span(cast(otel_trace_api.Span, remote_parent)):
                rooted_span = self._otel_tracer.start_span(
                    name=name, start_time=now_ns
                )
                rooted_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                rooted_event = LangfuseEvent(otel_span=rooted_span, **event_fields)
                return cast(LangfuseEvent, rooted_event.end(end_time=now_ns))

    plain_span = self._otel_tracer.start_span(name=name, start_time=now_ns)
    plain_event = LangfuseEvent(otel_span=plain_span, **event_fields)
    return cast(LangfuseEvent, plain_event.end(end_time=now_ns))
Create a new Langfuse observation of type 'EVENT'.
The created Langfuse Event observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the span (e.g., function or operation name)
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Returns:
The Langfuse Event object
Example:
event = langfuse.create_event(name="process-event")
1723 @staticmethod 1724 def create_trace_id(*, seed: Optional[str] = None) -> str: 1725 """Create a unique trace ID for use with Langfuse. 1726 1727 This method generates a unique trace ID for use with various Langfuse APIs. 1728 It can either generate a random ID or create a deterministic ID based on 1729 a seed string. 1730 1731 Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. 1732 This method ensures the generated ID meets this requirement. If you need to 1733 correlate an external ID with a Langfuse trace ID, use the external ID as the 1734 seed to get a valid, deterministic Langfuse trace ID. 1735 1736 Args: 1737 seed: Optional string to use as a seed for deterministic ID generation. 1738 If provided, the same seed will always produce the same ID. 1739 If not provided, a random ID will be generated. 1740 1741 Returns: 1742 A 32-character lowercase hexadecimal string representing the Langfuse trace ID. 1743 1744 Example: 1745 ```python 1746 # Generate a random trace ID 1747 trace_id = langfuse.create_trace_id() 1748 1749 # Generate a deterministic ID based on a seed 1750 session_trace_id = langfuse.create_trace_id(seed="session-456") 1751 1752 # Correlate an external ID with a Langfuse trace ID 1753 external_id = "external-system-123456" 1754 correlated_trace_id = langfuse.create_trace_id(seed=external_id) 1755 1756 # Use the ID with trace context 1757 with langfuse.start_as_current_observation( 1758 name="process-request", 1759 trace_context={"trace_id": trace_id} 1760 ) as span: 1761 # Operation will be part of the specific trace 1762 pass 1763 ``` 1764 """ 1765 if not seed: 1766 trace_id_int = RandomIdGenerator().generate_trace_id() 1767 1768 return Langfuse._format_otel_trace_id(trace_id_int) 1769 1770 return sha256(seed.encode("utf-8")).digest()[:16].hex()
Create a unique trace ID for use with Langfuse.
This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.
Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.
Arguments:
- seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:
A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
Example:
# Generate a random trace ID trace_id = langfuse.create_trace_id() # Generate a deterministic ID based on a seed session_trace_id = langfuse.create_trace_id(seed="session-456") # Correlate an external ID with a Langfuse trace ID external_id = "external-system-123456" correlated_trace_id = langfuse.create_trace_id(seed=external_id) # Use the ID with trace context with langfuse.start_as_current_observation( name="process-request", trace_context={"trace_id": trace_id} ) as span: # Operation will be part of the specific trace pass
def create_score(
    self,
    *,
    name: str,
    value: Union[float, str],
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
    environment: Optional[str] = None,
) -> None:
    """Create a score for a specific trace or observation.

    Scores can be used to track quality metrics, user feedback, or automated
    evaluations. The score is packaged as a ``score-create`` event and handed
    to the client's resource manager. Any exception raised while building or
    queuing it is logged and not propagated. Nothing happens when tracing is
    disabled.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        session_id: ID of the Langfuse session to associate the score with
        dataset_run_id: ID of the Langfuse dataset run to associate the score with
        trace_id: ID of the Langfuse trace to associate the score with
        observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score
        timestamp: Optional timestamp for the score (defaults to current UTC time)
        environment: Optional environment override for this score. If omitted,
            the score uses the client-level environment from
            `Langfuse(environment=...)` or `LANGFUSE_TRACING_ENVIRONMENT`.
            Langfuse observation wrapper methods pass their resolved span
            environment here so scores created via `span.score()` or
            `span.score_trace()` stay grouped with the scored observation or
            trace, including request-scoped environments propagated with
            `propagate_attributes(environment=...)`.

    Example:
        ```python
        # Create a numeric score for accuracy
        langfuse.create_score(
            name="accuracy",
            value=0.92,
            trace_id="abcdef1234567890abcdef1234567890",
            data_type="NUMERIC",
            comment="High accuracy with minor irrelevant details"
        )

        # Create a categorical score for sentiment
        langfuse.create_score(
            name="sentiment",
            value="positive",
            trace_id="abcdef1234567890abcdef1234567890",
            observation_id="abcdef1234567890",
            data_type="CATEGORICAL"
        )
        ```
    """
    if not self._tracing_enabled:
        return

    resolved_score_id = score_id or self._create_observation_id()

    try:
        score_body = ScoreBody(
            id=resolved_score_id,
            sessionId=session_id,
            datasetRunId=dataset_run_id,
            traceId=trace_id,
            observationId=observation_id,
            name=name,
            value=value,
            dataType=data_type,  # type: ignore
            comment=comment,
            configId=config_id,
            environment=environment or self._environment,
            metadata=metadata,
        )

        score_event = {
            "id": self.create_trace_id(),
            "type": "score-create",
            "timestamp": timestamp or _get_timestamp(),
            "body": score_body,
        }

        resources = self._resources
        if resources is None:
            return

        # Always sample the score when no trace ID was given or when the
        # trace ID is a legacy one (i.e. not 32 hex characters).
        if trace_id:
            force_sample = not self._is_valid_trace_id(trace_id)
        else:
            force_sample = True

        resources.add_score_task(score_event, force_sample=force_sample)

    except Exception as e:
        langfuse_logger.exception(
            f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
        )
Create a score for a specific trace or observation.
This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
- session_id: ID of the Langfuse session to associate the score with
- dataset_run_id: ID of the Langfuse dataset run to associate the score with
- trace_id: ID of the Langfuse trace to associate the score with
- observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
- timestamp: Optional timestamp for the score (defaults to current UTC time)
- environment: Optional environment override for this score. If omitted,
the score uses the client-level environment from
Langfuse(environment=...) or LANGFUSE_TRACING_ENVIRONMENT. Langfuse observation wrapper methods pass their resolved span environment here so scores created via span.score() or span.score_trace() stay grouped with the scored observation or trace, including request-scoped environments propagated with propagate_attributes(environment=...).
Example:
# Create a numeric score for accuracy langfuse.create_score( name="accuracy", value=0.92, trace_id="abcdef1234567890abcdef1234567890", data_type="NUMERIC", comment="High accuracy with minor irrelevant details" ) # Create a categorical score for sentiment langfuse.create_score( name="sentiment", value="positive", trace_id="abcdef1234567890abcdef1234567890", observation_id="abcdef1234567890", data_type="CATEGORICAL" )
def score_current_span(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Create a score for the current active span.

    Scores the span that is active in the current context, so the caller
    does not need to know its trace and span IDs. The environment passed to
    ``create_score`` is the `langfuse.environment` attribute read from the
    active span, including one set by `propagate_attributes(environment=...)`.
    When that is not set, ``create_score`` uses the client-level environment.
    Nothing happens when there is no current span.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_generation(name="answer-query") as generation:
            # Generate answer
            response = generate_answer(...)
            generation.update(output=response)

            # Score the generation
            langfuse.score_current_span(
                name="relevance",
                value=0.85,
                data_type="NUMERIC",
                comment="Mostly relevant but contains some tangential information",
                metadata={"model": "gpt-4", "prompt_version": "v2"}
            )
        ```
    """
    active_span = self._get_current_otel_span()
    if active_span is None:
        return

    trace_id = self._get_otel_trace_id(active_span)
    observation_id = self._get_otel_span_id(active_span)

    langfuse_logger.info(
        f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
    )

    span_environment = get_string_span_attribute(
        active_span, LangfuseOtelSpanAttributes.ENVIRONMENT
    )

    # The casts only narrow the static types; runtime values are unchanged.
    self.create_score(
        trace_id=trace_id,
        observation_id=observation_id,
        name=name,
        value=cast(str, value),
        score_id=score_id,
        data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
        comment=comment,
        config_id=config_id,
        metadata=metadata,
        environment=span_environment,
    )
Create a score for the current active span.
This method scores the currently active span in the context. It's a convenient
way to score the current operation without needing to know its trace and span IDs.
If the active span has a langfuse.environment attribute, including one
set by propagate_attributes(environment=...), the score uses that
environment. Otherwise it uses the client-level environment.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation: # Generate answer response = generate_answer(...) generation.update(output=response) # Score the generation langfuse.score_current_span( name="relevance", value=0.85, data_type="NUMERIC", comment="Mostly relevant but contains some tangential information", metadata={"model": "gpt-4", "prompt_version": "v2"} )
def score_current_trace(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Create a score for the current trace.

    Scores the trace that the currently active span belongs to. Unlike
    score_current_span, no observation ID is passed on, so the score is
    associated with the entire trace rather than a specific span. This is
    useful for scoring the overall quality of an operation. The environment
    passed to ``create_score`` is the `langfuse.environment` attribute read
    from the active span, including one set by
    `propagate_attributes(environment=...)`. When that is not set,
    ``create_score`` uses the client-level environment. Nothing happens when
    there is no current span.

    Args:
        name: Name of the score (e.g., "user_satisfaction", "overall_quality")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-user-request") as span:
            # Process request
            result = process_complete_request()
            span.update(output=result)

            # Score the overall trace
            langfuse.score_current_trace(
                name="overall_quality",
                value=0.95,
                data_type="NUMERIC",
                comment="High quality end-to-end response",
                metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
            )
        ```
    """
    active_span = self._get_current_otel_span()
    if active_span is None:
        return

    trace_id = self._get_otel_trace_id(active_span)

    langfuse_logger.info(
        f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
    )

    span_environment = get_string_span_attribute(
        active_span, LangfuseOtelSpanAttributes.ENVIRONMENT
    )

    # The casts only narrow the static types; runtime values are unchanged.
    self.create_score(
        trace_id=trace_id,
        name=name,
        value=cast(str, value),
        score_id=score_id,
        data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
        comment=comment,
        config_id=config_id,
        metadata=metadata,
        environment=span_environment,
    )
Create a score for the current trace.
This method scores the trace of the currently active span. Unlike score_current_span,
this method associates the score with the entire trace rather than a specific span.
It's useful for scoring overall performance or quality of the entire operation.
If the active span has a langfuse.environment attribute, including one
set by propagate_attributes(environment=...), the score uses that
environment. Otherwise it uses the client-level environment.
Arguments:
- name: Name of the score (e.g., "user_satisfaction", "overall_quality")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_observation(name="process-user-request") as span: # Process request result = process_complete_request() span.update(output=result) # Score the overall trace langfuse.score_current_trace( name="overall_quality", value=0.95, data_type="NUMERIC", comment="High quality end-to-end response", metadata={"evaluator": "gpt-4", "criteria": "comprehensive"} )
def flush(self) -> None:
    """Push every pending span, score, and event to the Langfuse API now.

    Use this when all recorded data must be sent before continuing, instead
    of waiting for the automatic flush interval. The call is a no-op when the
    client holds no resources (`self._resources` is None).

    Example:
        ```python
        # Record some spans and scores
        with langfuse.start_as_current_observation(name="operation") as span:
            # Do work...
            pass

        # Ensure all data is sent to Langfuse before proceeding
        langfuse.flush()

        # Continue with other work
        ```
    """
    resources = self._resources
    if resources is None:
        return

    resources.flush()
Force flush all pending spans and events to the Langfuse API.
This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.
Example:
# Record some spans and scores with langfuse.start_as_current_observation(name="operation") as span: # Do work... pass # Ensure all data is sent to Langfuse before proceeding langfuse.flush() # Continue with other work
def shutdown(self) -> None:
    """Stop the Langfuse client, flushing whatever data is still pending.

    A clean shutdown sends all pending data to the API and terminates the
    background threads. Call it while your application is exiting to avoid
    data loss and resource leaks; for most applications, using the client as
    a context manager or relying on the automatic atexit shutdown is enough.
    The call is a no-op when the client holds no resources
    (`self._resources` is None).

    Example:
        ```python
        # Initialize Langfuse
        langfuse = Langfuse(public_key="...", secret_key="...")

        # Use Langfuse throughout your application
        # ...

        # When application is shutting down
        langfuse.shutdown()
        ```
    """
    resources = self._resources
    if resources is None:
        return

    resources.shutdown()
Shut down the Langfuse client and flush all pending data.
This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.
It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.
Example:
# Initialize Langfuse langfuse = Langfuse(public_key="...", secret_key="...") # Use Langfuse throughout your application # ... # When application is shutting down langfuse.shutdown()
def get_current_trace_id(self) -> Optional[str]:
    """Return the trace ID of the span that is currently active.

    Handy for correlating logs or external systems with the trace, or for
    creating related operations.

    Returns:
        The current trace ID as a 32-character lowercase hexadecimal string,
        or None if tracing is disabled or there is no active span.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-request") as span:
            # Get the current trace ID for reference
            trace_id = langfuse.get_current_trace_id()

            # Use it for external correlation
            log.info(f"Processing request with trace_id: {trace_id}")

            # Or pass to another system
            external_system.process(data, trace_id=trace_id)
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
        )
        return None

    active_span = self._get_current_otel_span()
    if not active_span:
        return None

    return self._get_otel_trace_id(active_span)
Get the trace ID of the current active span.
This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.
Returns:
The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.
Example:
with langfuse.start_as_current_observation(name="process-request") as span: # Get the current trace ID for reference trace_id = langfuse.get_current_trace_id() # Use it for external correlation log.info(f"Processing request with trace_id: {trace_id}") # Or pass to another system external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
    """Return the observation ID (span ID) of the span that is currently active.

    Handy for referencing the observation in logs or external systems, or for
    creating scores and other related operations.

    Returns:
        The current observation ID as a 16-character lowercase hexadecimal
        string, or None if tracing is disabled or there is no active span.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-user-query") as span:
            # Get the current observation ID
            observation_id = langfuse.get_current_observation_id()

            # Store it for later reference
            cache.set(f"query_{query_id}_observation", observation_id)

            # Process the query...
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
        )
        return None

    active_span = self._get_current_otel_span()
    if not active_span:
        return None

    return self._get_otel_span_id(active_span)
Get the observation ID (span ID) of the current active span.
This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.
Returns:
The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.
Example:
with langfuse.start_as_current_observation(name="process-user-query") as span: # Get the current observation ID observation_id = langfuse.get_current_observation_id() # Store it for later reference cache.set(f"query_{query_id}_observation", observation_id) # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
    """Build the Langfuse UI URL for a trace.

    The resulting link points straight at the trace in the Langfuse UI, which
    is useful in logs, notifications, or debugging tools.

    Args:
        trace_id: Optional trace ID to generate a URL for. If not provided,
            the trace ID of the current active span will be used.

    Returns:
        A URL string pointing to the trace in the Langfuse UI,
        or None if the project ID couldn't be retrieved or no trace ID is available.

    Example:
        ```python
        # Get URL for the current trace
        with langfuse.start_as_current_observation(name="process-request") as span:
            trace_url = langfuse.get_trace_url()
            log.info(f"Processing trace: {trace_url}")

        # Get URL for a specific trace
        specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
        send_notification(f"Review needed for trace: {specific_trace_url}")
        ```
    """
    resolved_trace_id = trace_id or self.get_current_trace_id()
    if not resolved_trace_id:
        # Bail out before the project ID lookup; there is nothing to link to.
        return None

    project_id = self._get_project_id()
    if not project_id:
        return None

    return f"{self._base_url}/project/{project_id}/traces/{resolved_trace_id}"
Get the URL to view a trace in the Langfuse UI.
This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.
Arguments:
- trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:
A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.
Example:
# Get URL for the current trace with langfuse.start_as_current_observation(name="process-request") as span: trace_url = langfuse.get_trace_url() log.info(f"Processing trace: {trace_url}") # Get URL for a specific trace specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef") send_notification(f"Review needed for trace: {specific_trace_url}")
def get_dataset(
    self,
    name: str,
    *,
    fetch_items_page_size: Optional[int] = 50,
    version: Optional[datetime] = None,
) -> "DatasetClient":
    """Load a dataset and all of its items by dataset name.

    The dataset is fetched first; its items are then requested page by page
    until the reported total page count is reached. Every item is passed
    through `_hydrate_dataset_item_media_references` before being collected.

    Args:
        name: The name of the dataset to fetch.
        fetch_items_page_size: All items of the dataset will be fetched in chunks of this size. Defaults to 50.
        version: Retrieve dataset items as they existed at this specific point in time (UTC).
            If provided, returns the state of items at the specified UTC timestamp.
            If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.

    Returns:
        DatasetClient: The dataset with the given name.

    Raises:
        Error: API errors are passed to `handle_fern_exception` and then re-raised.
    """
    try:
        langfuse_logger.debug(f"Getting datasets {name}")
        dataset = self.api.datasets.get(dataset_name=self._url_encode(name))

        collected_items: List[DatasetItem] = []
        current_page = 1
        has_more_pages = True

        while has_more_pages:
            page_response = self.api.dataset_items.list(
                dataset_name=self._url_encode(name, is_url_param=True),
                page=current_page,
                limit=fetch_items_page_size,
                version=version,
            )

            for raw_item in page_response.data:
                collected_items.append(
                    self._hydrate_dataset_item_media_references(raw_item)
                )

            # Keep going only while the API reports pages beyond this one.
            has_more_pages = page_response.meta.total_pages > current_page
            current_page += 1

        return DatasetClient(
            dataset=dataset,
            items=collected_items,
            version=version,
            langfuse_client=self,
        )

    except Error as e:
        handle_fern_exception(e)
        raise e
Fetch a dataset by its name.
Arguments:
- name: The name of the dataset to fetch.
- fetch_items_page_size: All items of the dataset will be fetched in chunks of this size. Defaults to 50.
- version: Retrieve dataset items as they existed at this specific point in time (UTC). If provided, returns the state of items at the specified UTC timestamp. If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
Returns:
DatasetClient: The dataset with the given name.
def get_dataset_run(
    self, *, dataset_name: str, run_name: str
) -> DatasetRunWithItems:
    """Look up a single dataset run by dataset name and run name.

    Both names are URL-encoded before being sent to the API.

    Args:
        dataset_name (str): The name of the dataset.
        run_name (str): The name of the run.

    Returns:
        DatasetRunWithItems: The dataset run with its items.

    Raises:
        Error: API errors are passed to `handle_fern_exception` and then re-raised.
    """
    try:
        run = self.api.datasets.get_run(
            dataset_name=self._url_encode(dataset_name),
            run_name=self._url_encode(run_name),
            request_options=None,
        )
        # cast() only informs the type checker; the response is returned as-is.
        return cast(DatasetRunWithItems, run)
    except Error as e:
        handle_fern_exception(e)
        raise e
Fetch a dataset run by dataset name and run name.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DatasetRunWithItems: The dataset run with its items.
def get_dataset_runs(
    self,
    *,
    dataset_name: str,
    page: Optional[int] = None,
    limit: Optional[int] = None,
) -> PaginatedDatasetRuns:
    """List the runs of a dataset, one page at a time.

    The dataset name is URL-encoded before being sent to the API; `page` and
    `limit` are forwarded unchanged.

    Args:
        dataset_name (str): The name of the dataset.
        page (Optional[int]): Page number, starts at 1.
        limit (Optional[int]): Limit of items per page.

    Returns:
        PaginatedDatasetRuns: Paginated list of dataset runs.

    Raises:
        Error: API errors are passed to `handle_fern_exception` and then re-raised.
    """
    try:
        runs_page = self.api.datasets.get_runs(
            dataset_name=self._url_encode(dataset_name),
            page=page,
            limit=limit,
            request_options=None,
        )
        # cast() only informs the type checker; the response is returned as-is.
        return cast(PaginatedDatasetRuns, runs_page)
    except Error as e:
        handle_fern_exception(e)
        raise e
Fetch all runs for a dataset.
Arguments:
- dataset_name (str): The name of the dataset.
- page (Optional[int]): Page number, starts at 1.
- limit (Optional[int]): Limit of items per page.
Returns:
PaginatedDatasetRuns: Paginated list of dataset runs.
def delete_dataset_run(
    self, *, dataset_name: str, run_name: str
) -> DeleteDatasetRunResponse:
    """Remove a dataset run together with all of its run items.

    This action is irreversible. Both names are URL-encoded before being sent
    to the API.

    Args:
        dataset_name (str): The name of the dataset.
        run_name (str): The name of the run.

    Returns:
        DeleteDatasetRunResponse: Confirmation of deletion.

    Raises:
        Error: API errors are passed to `handle_fern_exception` and then re-raised.
    """
    try:
        deletion = self.api.datasets.delete_run(
            dataset_name=self._url_encode(dataset_name),
            run_name=self._url_encode(run_name),
            request_options=None,
        )
        # cast() only informs the type checker; the response is returned as-is.
        return cast(DeleteDatasetRunResponse, deletion)
    except Error as e:
        handle_fern_exception(e)
        raise e
Delete a dataset run and all its run items. This action is irreversible.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DeleteDatasetRunResponse: Confirmation of deletion.
def run_experiment(
    self,
    *,
    name: str,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    data: ExperimentData,
    task: TaskFunction,
    evaluators: Optional[List[EvaluatorFunction]] = None,
    composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
    run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
    max_concurrency: int = 50,
    metadata: Optional[Dict[str, str]] = None,
    _dataset_version: Optional[datetime] = None,
) -> ExperimentResult:
    """Run an experiment on a dataset with automatic tracing and evaluation.

    This method executes a task function on each item in the provided dataset,
    automatically traces all executions with Langfuse for observability, runs
    item-level and run-level evaluators on the outputs, and returns comprehensive
    results with evaluation metrics.

    The experiment system provides:
    - Automatic tracing of all task executions
    - Concurrent processing with configurable limits
    - Comprehensive error handling that isolates failures
    - Integration with Langfuse datasets for experiment tracking
    - Flexible evaluation framework supporting both sync and async evaluators

    Args:
        name: Human-readable name for the experiment. Used for identification
            in the Langfuse UI.
        run_name: Optional exact name for the experiment run. If provided, this will be
            used as the exact dataset run name if the `data` contains Langfuse dataset items.
            If not provided, this will default to the experiment name appended with an ISO timestamp.
        description: Optional description explaining the experiment's purpose,
            methodology, or expected outcomes.
        data: Array of data items to process. Can be either:
            - List of dict-like items with 'input', 'expected_output', 'metadata' keys
            - List of Langfuse DatasetItem objects from dataset.items
        task: Function that processes each data item and returns output.
            Must accept 'item' as keyword argument and can return sync or async results.
            The task function signature should be: task(*, item, **kwargs) -> Any
        evaluators: List of functions to evaluate each item's output individually.
            Each evaluator receives input, output, expected_output, and metadata.
            Can return single Evaluation dict or list of Evaluation dicts.
            None (the default) or an empty list means no item-level evaluators.
        composite_evaluator: Optional function that creates composite scores from item-level evaluations.
            Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
            plus the list of evaluations from item-level evaluators. Useful for weighted averages,
            pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
        run_evaluators: List of functions to evaluate the entire experiment run.
            Each run evaluator receives all item_results and can compute aggregate metrics.
            Useful for calculating averages, distributions, or cross-item comparisons.
            None (the default) or an empty list means no run-level evaluators.
        max_concurrency: Maximum number of concurrent task executions (default: 50).
            Controls the number of items processed simultaneously. Adjust based on
            API rate limits and system resources.
        metadata: Optional metadata dictionary to attach to all experiment traces.
            This metadata will be included in every trace created during the experiment.
            If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
        _dataset_version: Internal parameter, forwarded unchanged to the async
            runner as `dataset_version`. Not intended to be set by callers.

    Returns:
        ExperimentResult containing:
        - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
        - item_results: List of results for each processed item with outputs and evaluations
        - run_evaluations: List of aggregate evaluation results for the entire run
        - experiment_id: Stable identifier for the experiment run across all items
        - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
        - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)

    Raises:
        ValueError: If required parameters are missing or invalid
        Exception: If experiment setup fails (individual item failures are handled gracefully)

    Examples:
        Basic experiment with local data:
        ```python
        def summarize_text(*, item, **kwargs):
            return f"Summary: {item['input'][:50]}..."

        def length_evaluator(*, input, output, expected_output=None, **kwargs):
            return {
                "name": "output_length",
                "value": len(output),
                "comment": f"Output contains {len(output)} characters"
            }

        result = langfuse.run_experiment(
            name="Text Summarization Test",
            description="Evaluate summarization quality and length",
            data=[
                {"input": "Long article text...", "expected_output": "Expected summary"},
                {"input": "Another article...", "expected_output": "Another summary"}
            ],
            task=summarize_text,
            evaluators=[length_evaluator]
        )

        print(f"Processed {len(result.item_results)} items")
        for item_result in result.item_results:
            print(f"Input: {item_result.item['input']}")
            print(f"Output: {item_result.output}")
            print(f"Evaluations: {item_result.evaluations}")
        ```

        Advanced experiment with async task and multiple evaluators:
        ```python
        async def llm_task(*, item, **kwargs):
            # Simulate async LLM call
            response = await openai_client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": item["input"]}]
            )
            return response.choices[0].message.content

        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
            if expected_output and expected_output.lower() in output.lower():
                return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
            return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

        def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
            # Simulate toxicity check
            toxicity_score = check_toxicity(output)  # Your toxicity checker
            return {
                "name": "toxicity",
                "value": toxicity_score,
                "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
            }

        def average_accuracy(*, item_results, **kwargs):
            accuracies = [
                evaluation.value for result in item_results
                for evaluation in result.evaluations
                if evaluation.name == "accuracy"
            ]
            return {
                "name": "average_accuracy",
                "value": sum(accuracies) / len(accuracies) if accuracies else 0,
                "comment": f"Average accuracy across {len(accuracies)} items"
            }

        result = langfuse.run_experiment(
            name="LLM Safety and Accuracy Test",
            description="Evaluate model accuracy and safety across diverse prompts",
            data=test_dataset,  # Your dataset items
            task=llm_task,
            evaluators=[accuracy_evaluator, toxicity_evaluator],
            run_evaluators=[average_accuracy],
            max_concurrency=5,  # Limit concurrent API calls
            metadata={"model": "gpt-4", "temperature": 0.7}
        )
        ```

        Using with Langfuse datasets:
        ```python
        # Get dataset from Langfuse
        dataset = langfuse.get_dataset("my-eval-dataset")

        result = dataset.run_experiment(
            name="Production Model Evaluation",
            description="Monthly evaluation of production model performance",
            task=my_production_task,
            evaluators=[accuracy_evaluator, latency_evaluator]
        )

        # Results automatically linked to dataset in Langfuse UI
        print(f"View results: {result.dataset_run_url}")
        ```

    Note:
        - Task and evaluator functions can be either synchronous or asynchronous
        - Individual item failures are logged but don't stop the experiment
        - All executions are automatically traced and visible in Langfuse UI
        - When using Langfuse datasets, results are automatically linked for easy comparison
        - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
        - Async execution is handled automatically with smart event loop detection
    """
    # Resolve the final run name up front so the async runner always receives
    # a concrete value.
    resolved_run_name = self._create_experiment_run_name(
        name=name, run_name=run_name
    )

    return cast(
        ExperimentResult,
        run_async_safely(
            self._run_experiment_async(
                name=name,
                run_name=resolved_run_name,
                description=description,
                data=data,
                task=task,
                # None and empty lists both become a fresh empty list, so no
                # list object is ever shared between calls.
                evaluators=evaluators or [],
                composite_evaluator=composite_evaluator,
                run_evaluators=run_evaluators or [],
                max_concurrency=max_concurrency,
                metadata=metadata,
                dataset_version=_dataset_version,
            ),
        ),
    )
Run an experiment on a dataset with automatic tracing and evaluation.
This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.
The experiment system provides:
- Automatic tracing of all task executions
- Concurrent processing with configurable limits
- Comprehensive error handling that isolates failures
- Integration with Langfuse datasets for experiment tracking
- Flexible evaluation framework supporting both sync and async evaluators
Arguments:
- name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
- run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the `data` contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
- description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
- data: Array of data items to process. Can be either:
- List of dict-like items with 'input', 'expected_output', 'metadata' keys
- List of Langfuse DatasetItem objects from dataset.items
- task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
- evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
- composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
- run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
- max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
- metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:
ExperimentResult containing:
- run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
- item_results: List of results for each processed item with outputs and evaluations
- run_evaluations: List of aggregate evaluation results for the entire run
- experiment_id: Stable identifier for the experiment run across all items
- dataset_run_id: ID of the dataset run (if using Langfuse datasets)
- dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
- ValueError: If required parameters are missing or invalid
- Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:
Basic experiment with local data:
```python
def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")
```
Advanced experiment with async task and multiple evaluators:
async def llm_task(*, item, **kwargs): # Simulate async LLM call response = await openai_client.chat.completions.create( model="gpt-4", messages=[{"role": "user", "content": item["input"]}] ) return response.choices[0].message.content def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): if expected_output and expected_output.lower() in output.lower(): return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): # Simulate toxicity check toxicity_score = check_toxicity(output) # Your toxicity checker return { "name": "toxicity", "value": toxicity_score, "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" } def average_accuracy(*, item_results, **kwargs): accuracies = [ eval.value for result in item_results for eval in result.evaluations if eval.name == "accuracy" ] return { "name": "average_accuracy", "value": sum(accuracies) / len(accuracies) if accuracies else 0, "comment": f"Average accuracy across {len(accuracies)} items" } result = langfuse.run_experiment( name="LLM Safety and Accuracy Test", description="Evaluate model accuracy and safety across diverse prompts", data=test_dataset, # Your dataset items task=llm_task, evaluators=[accuracy_evaluator, toxicity_evaluator], run_evaluators=[average_accuracy], max_concurrency=5, # Limit concurrent API calls metadata={"model": "gpt-4", "temperature": 0.7} )Using with Langfuse datasets:
# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result.dataset_run_url}")
Note:
- Task and evaluator functions can be either synchronous or asynchronous
- Individual item failures are logged but don't stop the experiment
- All executions are automatically traced and visible in Langfuse UI
- When using Langfuse datasets, results are automatically linked for easy comparison
- This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
- Async execution is handled automatically with smart event loop detection
def run_batched_evaluation(
    self,
    *,
    scope: Literal["traces", "observations"],
    mapper: MapperFunction,
    filter: Optional[str] = None,
    fetch_batch_size: int = 50,
    fetch_trace_fields: Optional[str] = None,
    max_items: Optional[int] = None,
    max_retries: int = 3,
    evaluators: List[EvaluatorFunction],
    composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
    max_concurrency: int = 5,
    metadata: Optional[Dict[str, Any]] = None,
    _add_observation_scores_to_trace: bool = False,
    _additional_trace_tags: Optional[List[str]] = None,
    resume_from: Optional[BatchEvaluationResumeToken] = None,
    verbose: bool = False,
) -> BatchEvaluationResult:
    """Evaluate existing traces or observations in batches and score them.

    Items are fetched from Langfuse according to ``filter``, converted into
    evaluator inputs by ``mapper``, passed to every function in
    ``evaluators``, and the resulting scores are linked back to the entity
    they were computed for. Typical uses:

    - Scoring production traces after a deployment
    - Backtesting a new metric against historical data
    - Batch scoring of observations for quality monitoring
    - Periodic evaluation runs over recent data

    Processing is streamed batch by batch, which keeps memory usage bounded
    for large result sets. Fetches are retried, item and evaluator failures
    are recorded rather than raised, and an interrupted run can be continued
    via a resume token.

    Args:
        scope: Kind of entity to evaluate. One of:
            - "traces": complete traces with all their observations
            - "observations": individual observations (spans, generations, events)
        mapper: Callable turning an API trace/observation object into an
            EvaluatorInputs instance (input, output, expected_output,
            metadata). May be sync or async.
        evaluators: Evaluation functions run on every item. Each receives the
            mapped inputs and returns Evaluation object(s). A failing
            evaluator is logged and does not stop the run.
        filter: Optional JSON filter string in the Langfuse API format, e.g.
            - '{"tags": ["production"]}'
            - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
            Default: None (all items are fetched).
        fetch_batch_size: Items fetched per API call and held in memory at
            once. Larger values may be faster but use more memory. Default: 50.
        fetch_trace_fields: Comma-separated field groups to include when
            fetching traces: 'core' (always included), 'io' (input, output,
            metadata), 'scores', 'observations', 'metrics'. When omitted, all
            fields are returned. Example: 'core,scores,metrics'. Excluded
            'observations' or 'scores' come back as empty arrays; excluded
            'metrics' yields -1 for 'totalCost' and 'latency'. Only relevant
            when scope is 'traces'.
        max_items: Upper bound on the number of items processed. None means
            every item matching the filter. Default: None.
        max_concurrency: Maximum number of items evaluated concurrently.
            Default: 5.
        composite_evaluator: Optional function that derives a single
            Evaluation from an item and its item-level evaluations, e.g. a
            weighted average. Default: None.
        metadata: Optional dict attached to every created score, useful for
            tagging a run or version. Default: None.
        max_retries: Retry attempts for a failed batch fetch, with
            exponential backoff (1s, 2s, 4s). Default: 3.
        verbose: When True, progress information is logged to the console.
            Default: False.
        resume_from: Resume token of a previous incomplete run; processing
            continues from there. Default: None.
        _add_observation_scores_to_trace: Internal option (leading
            underscore); forwarded unchanged to the runner.
        _additional_trace_tags: Internal option (leading underscore);
            forwarded unchanged to the runner.

    Returns:
        BatchEvaluationResult containing:
            - total_items_fetched: Number of items fetched from API
            - total_items_processed: Number of items successfully evaluated
            - total_items_failed: Number of items that failed evaluation
            - total_scores_created: Scores created by item-level evaluators
            - total_composite_scores_created: Scores created by composite evaluator
            - total_evaluations_failed: Individual evaluator failures
            - evaluator_stats: Per-evaluator statistics (success rate, scores created)
            - resume_token: Token for resuming if incomplete (None if completed)
            - completed: True if all items processed
            - duration_seconds: Total execution time
            - failed_item_ids: IDs of items that failed
            - error_summary: Error types and counts
            - has_more_items: True if max_items reached but more exist

    Raises:
        ValueError: If invalid scope is provided.

    Examples:
        Basic trace evaluation:
        ```python
        from langfuse import Langfuse, EvaluatorInputs, Evaluation

        client = Langfuse()

        # Define mapper to extract fields from traces
        def trace_mapper(trace):
            return EvaluatorInputs(
                input=trace.input,
                output=trace.output,
                expected_output=None,
                metadata={"trace_id": trace.id}
            )

        # Define evaluator
        def length_evaluator(*, input, output, expected_output, metadata):
            return Evaluation(
                name="output_length",
                value=len(output) if output else 0
            )

        # Run batch evaluation
        result = client.run_batched_evaluation(
            scope="traces",
            mapper=trace_mapper,
            evaluators=[length_evaluator],
            filter='{"tags": ["production"]}',
            max_items=1000,
            verbose=True
        )

        print(f"Processed {result.total_items_processed} traces")
        print(f"Created {result.total_scores_created} scores")
        ```

        Evaluation with composite scorer:
        ```python
        def accuracy_evaluator(*, input, output, expected_output, metadata):
            # ... evaluation logic
            return Evaluation(name="accuracy", value=0.85)

        def relevance_evaluator(*, input, output, expected_output, metadata):
            # ... evaluation logic
            return Evaluation(name="relevance", value=0.92)

        def composite_evaluator(*, item, evaluations):
            # Weighted average of evaluations
            weights = {"accuracy": 0.6, "relevance": 0.4}
            total = sum(
                e.value * weights.get(e.name, 0)
                for e in evaluations
                if isinstance(e.value, (int, float))
            )
            return Evaluation(
                name="composite_score",
                value=total,
                comment=f"Weighted average of {len(evaluations)} metrics"
            )

        result = client.run_batched_evaluation(
            scope="traces",
            mapper=trace_mapper,
            evaluators=[accuracy_evaluator, relevance_evaluator],
            composite_evaluator=composite_evaluator,
            filter='{"user_id": "important_user"}',
            verbose=True
        )
        ```

        Handling incomplete runs with resume:
        ```python
        # Initial run that may fail or timeout
        result = client.run_batched_evaluation(
            scope="observations",
            mapper=obs_mapper,
            evaluators=[my_evaluator],
            max_items=10000,
            verbose=True
        )

        # Check if incomplete
        if not result.completed and result.resume_token:
            print(f"Processed {result.resume_token.items_processed} items before interruption")

            # Resume from where it left off
            result = client.run_batched_evaluation(
                scope="observations",
                mapper=obs_mapper,
                evaluators=[my_evaluator],
                resume_from=result.resume_token,
                verbose=True
            )

        print(f"Total items processed: {result.total_items_processed}")
        ```

        Monitoring evaluator performance:
        ```python
        result = client.run_batched_evaluation(...)

        for stats in result.evaluator_stats:
            # Guard against evaluators that never ran
            success_rate = (
                stats.successful_runs / stats.total_runs if stats.total_runs else 0.0
            )
            print(f"{stats.name}:")
            print(f"  Success rate: {success_rate:.1%}")
            print(f"  Scores created: {stats.total_scores_created}")

            if stats.failed_runs > 0:
                print(f"  Failed {stats.failed_runs} times")
        ```

    Note:
        - Evaluator failures are logged but don't stop the batch evaluation
        - Individual item failures are tracked but don't stop processing
        - Fetch failures are retried with exponential backoff
        - All scores are automatically flushed to Langfuse at the end
        - The resume mechanism uses timestamp-based filtering to avoid duplicates
    """
    # All pipeline logic lives in the runner; this method only forwards its
    # arguments and drives the runner's coroutine from synchronous code.
    evaluation_coro = BatchEvaluationRunner(self).run_async(
        scope=scope,
        mapper=mapper,
        evaluators=evaluators,
        filter=filter,
        fetch_batch_size=fetch_batch_size,
        fetch_trace_fields=fetch_trace_fields,
        max_items=max_items,
        max_concurrency=max_concurrency,
        composite_evaluator=composite_evaluator,
        metadata=metadata,
        _add_observation_scores_to_trace=_add_observation_scores_to_trace,
        _additional_trace_tags=_additional_trace_tags,
        max_retries=max_retries,
        verbose=verbose,
        resume_from=resume_from,
    )
    outcome = run_async_safely(evaluation_coro)

    # cast() is a typing-only narrowing of the helper's return value.
    return cast(BatchEvaluationResult, outcome)
Fetch traces or observations and run evaluations on each item.
This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:
- Running evaluations on production traces after deployment
- Backtesting new evaluation metrics on historical data
- Batch scoring of observations for quality monitoring
- Periodic evaluation runs on recent data
The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.
Arguments:
- scope: The type of items to evaluate. Must be one of:
- "traces": Evaluate complete traces with all their observations
- "observations": Evaluate individual observations (spans, generations, events)
- mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
- evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
- filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
- '{"tags": ["production"]}'
- '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' Default: None (fetches all items).
- fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
- fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
- max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
- max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 5.
- composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
- metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
- max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
- verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
- resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:
BatchEvaluationResult containing:
- total_items_fetched: Number of items fetched from API
- total_items_processed: Number of items successfully evaluated
- total_items_failed: Number of items that failed evaluation
- total_scores_created: Scores created by item-level evaluators
- total_composite_scores_created: Scores created by composite evaluator
- total_evaluations_failed: Individual evaluator failures
- evaluator_stats: Per-evaluator statistics (success rate, scores created)
- resume_token: Token for resuming if incomplete (None if completed)
- completed: True if all items processed
- duration_seconds: Total execution time
- failed_item_ids: IDs of items that failed
- error_summary: Error types and counts
- has_more_items: True if max_items reached but more exist
Raises:
- ValueError: If invalid scope is provided.
Examples:
Basic trace evaluation:
from langfuse import Langfuse, EvaluatorInputs, Evaluation client = Langfuse() # Define mapper to extract fields from traces def trace_mapper(trace): return EvaluatorInputs( input=trace.input, output=trace.output, expected_output=None, metadata={"trace_id": trace.id} ) # Define evaluator def length_evaluator(*, input, output, expected_output, metadata): return Evaluation( name="output_length", value=len(output) if output else 0 ) # Run batch evaluation result = client.run_batched_evaluation( scope="traces", mapper=trace_mapper, evaluators=[length_evaluator], filter='{"tags": ["production"]}', max_items=1000, verbose=True ) print(f"Processed {result.total_items_processed} traces") print(f"Created {result.total_scores_created} scores")Evaluation with composite scorer:
def accuracy_evaluator(*, input, output, expected_output, metadata): # ... evaluation logic return Evaluation(name="accuracy", value=0.85) def relevance_evaluator(*, input, output, expected_output, metadata): # ... evaluation logic return Evaluation(name="relevance", value=0.92) def composite_evaluator(*, item, evaluations): # Weighted average of evaluations weights = {"accuracy": 0.6, "relevance": 0.4} total = sum( e.value * weights.get(e.name, 0) for e in evaluations if isinstance(e.value, (int, float)) ) return Evaluation( name="composite_score", value=total, comment=f"Weighted average of {len(evaluations)} metrics" ) result = client.run_batched_evaluation( scope="traces", mapper=trace_mapper, evaluators=[accuracy_evaluator, relevance_evaluator], composite_evaluator=composite_evaluator, filter='{"user_id": "important_user"}', verbose=True )Handling incomplete runs with resume:
# Initial run that may fail or timeout result = client.run_batched_evaluation( scope="observations", mapper=obs_mapper, evaluators=[my_evaluator], max_items=10000, verbose=True ) # Check if incomplete if not result.completed and result.resume_token: print(f"Processed {result.resume_token.items_processed} items before interruption") # Resume from where it left off result = client.run_batched_evaluation( scope="observations", mapper=obs_mapper, evaluators=[my_evaluator], resume_from=result.resume_token, verbose=True ) print(f"Total items processed: {result.total_items_processed}")Monitoring evaluator performance:
result = client.run_batched_evaluation(...) for stats in result.evaluator_stats: success_rate = stats.successful_runs / stats.total_runs print(f"{stats.name}:") print(f" Success rate: {success_rate:.1%}") print(f" Scores created: {stats.total_scores_created}") if stats.failed_runs > 0: print(f" ⚠️ Failed {stats.failed_runs} times")
Note:
- Evaluator failures are logged but don't stop the batch evaluation
- Individual item failures are tracked but don't stop processing
- Fetch failures are retried with exponential backoff
- All scores are automatically flushed to Langfuse at the end
- The resume mechanism uses timestamp-based filtering to avoid duplicates
def auth_check(self) -> bool:
    """Verify that the configured public/secret key pair is accepted by the API.

    The check lists the projects visible to the credentials.

    Returns:
        True when at least one project is returned. False when an
        AttributeError occurs during the check, which is logged as the client
        not being properly initialized.

    Raises:
        Exception: If no projects were found for the provided credentials.
        Error: Re-raised after being passed to ``handle_fern_exception`` when
            the API call fails.

    Note:
        This method is blocking. It is discouraged to use it in production code.
    """
    try:
        response = self.api.projects.get()
        project_count = len(response.data)
        langfuse_logger.debug(
            f"Auth check successful, found {project_count} projects"
        )
        if project_count == 0:
            # Valid request but nothing visible: treat as a failed check.
            raise Exception(
                "Auth check failed, no project found for the keys provided."
            )
    except AttributeError as exc:
        langfuse_logger.warning(
            f"Auth check failed: Client not properly initialized. Error: {exc}"
        )
        return False
    except Error as exc:
        handle_fern_exception(exc)
        raise exc
    else:
        return True
Check if the provided credentials (public and secret key) are valid.
Raises:
- Exception: If no projects were found for the provided credentials.
Note:
This method is blocking. It is discouraged to use it in production code.
def create_dataset(
    self,
    *,
    name: str,
    description: Optional[str] = None,
    metadata: Optional[Any] = None,
    input_schema: Optional[Any] = None,
    expected_output_schema: Optional[Any] = None,
) -> Dataset:
    """Create a new Langfuse dataset called ``name``.

    Args:
        name: Name of the dataset to create.
        description: Description of the dataset. Defaults to None.
        metadata: Additional metadata. Defaults to None.
        input_schema: JSON Schema for validating dataset item inputs. When
            set, all new items will be validated against this schema.
        expected_output_schema: JSON Schema for validating dataset item
            expected outputs. When set, all new items will be validated
            against this schema.

    Returns:
        Dataset: The created dataset as returned by the Langfuse API.

    Raises:
        Error: Re-raised after being passed to ``handle_fern_exception`` when
            the API call fails.
    """
    try:
        langfuse_logger.debug(f"Creating datasets {name}")

        created = self.api.datasets.create(
            name=name,
            description=description,
            metadata=metadata,
            input_schema=input_schema,
            expected_output_schema=expected_output_schema,
        )
    except Error as exc:
        handle_fern_exception(exc)
        raise exc

    # cast() only narrows the static type of the API response.
    return cast(Dataset, created)
Create a dataset with the given name on Langfuse.
Arguments:
- name: Name of the dataset to create.
- description: Description of the dataset. Defaults to None.
- metadata: Additional metadata. Defaults to None.
- input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
- expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:
Dataset: The created dataset as returned by the Langfuse API.
def create_dataset_item(
    self,
    *,
    dataset_name: str,
    input: Optional[Any] = None,
    expected_output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    source_trace_id: Optional[str] = None,
    source_observation_id: Optional[str] = None,
    status: Optional[DatasetStatus] = None,
    id: Optional[str] = None,
) -> DatasetItem:
    """Create a dataset item.

    Upserts if an item with id already exists.

    ``input``, ``expected_output`` and ``metadata`` are each passed through
    ``_process_dataset_item_media`` before the item is created; any media
    collected by that step is uploaded first, associated with the dataset and
    the item id.

    Args:
        dataset_name: Name of the dataset in which the dataset item should be created.
        input: Input data. Defaults to None. Can contain any dict, list or scalar.
        expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
        metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
        source_trace_id: Id of the source trace. Defaults to None.
        source_observation_id: Id of the source observation. Defaults to None.
        status: Status of the dataset item. Defaults to ACTIVE for newly created items.
        id: Id of the dataset item. Defaults to None, in which case a random
            UUID4 string is generated. Provide your own id if you want to
            dedupe dataset items. Id needs to be globally unique and cannot
            be reused across datasets.

    Returns:
        DatasetItem: The created dataset item as returned by the Langfuse API.

    Raises:
        Error: If media has to be uploaded but the SDK resources were not
            initialized, or if an API call fails. In both cases the error is
            passed to ``handle_fern_exception`` and re-raised.

    Example:
        ```python
        from langfuse import Langfuse

        langfuse = Langfuse()

        # Uploading items to the Langfuse dataset named "capital_cities"
        langfuse.create_dataset_item(
            dataset_name="capital_cities",
            input={"input": {"country": "Italy"}},
            expected_output={"expected_output": "Rome"},
            metadata={"foo": "bar"}
        )
        ```
    """
    try:
        langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")

        # Media uploads must reference the (dataset, item) they belong to, and
        # the item need not exist yet — so settle on the item id up front and
        # reuse it for the create call below.
        item_id = id if id is not None else str(uuid.uuid4())

        # Single pass per field: swap each LangfuseMedia for its reference
        # string (derived from content, not the upload) and collect the media
        # still to upload, deduped by media id and tagged with its field.
        pending_media: Dict[str, Tuple[LangfuseMedia, str]] = {}
        input = self._process_dataset_item_media(
            data=input,
            pending_media=pending_media,
            field=DatasetItemMediaReferenceField.INPUT.value,
        )
        expected_output = self._process_dataset_item_media(
            data=expected_output,
            pending_media=pending_media,
            field=DatasetItemMediaReferenceField.EXPECTED_OUTPUT.value,
        )
        metadata = self._process_dataset_item_media(
            data=metadata,
            pending_media=pending_media,
            field=DatasetItemMediaReferenceField.METADATA.value,
        )

        # The upload needs the dataset id, but the create API only takes the
        # name. Resolve it once, and only when there is actually media to
        # upload — a plain item pays no extra datasets.get round-trip.
        if pending_media:
            # Explicit check instead of `assert`: asserts are removed under
            # `python -O`, which would turn an uninitialized client into an
            # opaque AttributeError on None. Checked only on the media path
            # so plain items keep working exactly as before.
            if self._resources is None:
                raise Error(
                    "SDK is not correctly initialized. Check the init logs for more details."
                )

            dataset_id = self.api.datasets.get(self._url_encode(dataset_name)).id
            for media, field in pending_media.values():
                self._resources._media_manager._upload_media_sync(
                    media=media,
                    dataset_id=dataset_id,
                    dataset_item_id=item_id,
                    field=field,
                )

        result = self.api.dataset_items.create(
            dataset_name=dataset_name,
            input=input,
            expected_output=expected_output,
            metadata=metadata,
            source_trace_id=source_trace_id,
            source_observation_id=source_observation_id,
            status=status,
            id=item_id,
        )

        return cast(DatasetItem, result)
    except Error as e:
        handle_fern_exception(e)
        raise e
Create a dataset item.
Upserts if an item with id already exists.
Arguments:
- dataset_name: Name of the dataset in which the dataset item should be created.
- input: Input data. Defaults to None. Can contain any dict, list or scalar.
- expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
- metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
- source_trace_id: Id of the source trace. Defaults to None.
- source_observation_id: Id of the source observation. Defaults to None.
- status: Status of the dataset item. Defaults to ACTIVE for newly created items.
- id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:
DatasetItem: The created dataset item as returned by the Langfuse API.
Example:
from langfuse import Langfuse langfuse = Langfuse() # Uploading items to the Langfuse dataset named "capital_cities" langfuse.create_dataset_item( dataset_name="capital_cities", input={"input": {"country": "Italy"}}, expected_output={"expected_output": "Rome"}, metadata={"foo": "bar"} )
def resolve_media_references(
    self,
    *,
    obj: Any,
    resolve_with: Literal["base64_data_uri"],
    max_depth: int = 10,
    content_fetch_timeout_seconds: int = 5,
) -> Any:
    """Replace media reference strings in an object with base64 data URIs.

    The object is traversed recursively (up to ``max_depth``) looking for
    media reference strings of the form "@@@langfuseMedia:...@@@". For each
    one found, the media content is fetched synchronously through this client
    and the reference string is replaced with a base64 data URI.

    If fetching the content for a reference fails, a warning is logged and
    that reference string is left unchanged.

    Args:
        obj: The object to process. Can be a primitive value, array, or nested object.
            If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
        resolve_with: The representation of the media content to replace the media reference string with.
            Currently only "base64_data_uri" is supported.
        max_depth: The maximum depth to traverse the object. Default is 10.
        content_fetch_timeout_seconds: The timeout in seconds for fetching media content. Default is 5.

    Returns:
        A deep copy of the input object with all media references replaced with base64 data URIs where possible.
        If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

    Example:
        obj = {
            "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
            "nested": {
                "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
            }
        }

        result = langfuse_client.resolve_media_references(
            obj=obj, resolve_with="base64_data_uri"
        )

        # Result:
        # {
        #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
        #     "nested": {
        #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
        #     }
        # }
    """
    # Pure delegation: LangfuseMedia does the traversal, with this client
    # supplied as the one used to fetch media content.
    resolved = LangfuseMedia.resolve_media_references(
        obj=obj,
        langfuse_client=self,
        resolve_with=resolve_with,
        max_depth=max_depth,
        content_fetch_timeout_seconds=content_fetch_timeout_seconds,
    )
    return resolved
Replace media reference strings in an object with base64 data URIs.
This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.
If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.
Arguments:
- obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
- resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
- max_depth: int: The maximum depth to traverse the object. Default is 10.
- content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:
A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
Example:
obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }
result = langfuse_client.resolve_media_references(obj=obj, resolve_with="base64_data_uri")
Result:
{
"image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
"nested": {
"pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
}
}
def get_prompt(
    self,
    name: str,
    *,
    version: Optional[int] = None,
    label: Optional[str] = None,
    type: Literal["chat", "text"] = "text",
    cache_ttl_seconds: Optional[int] = None,
    fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
    max_retries: Optional[int] = None,
    fetch_timeout_seconds: Optional[int] = None,
) -> PromptClient:
    """Get a prompt, serving it from the local prompt cache when possible.

    Lookup proceeds in three steps:

    1. If nothing is cached for the requested name/version/label, or
       ``cache_ttl_seconds`` is 0, the prompt is fetched synchronously and the
       cache is updated. If that fetch fails and a ``fallback`` is given, a
       fallback prompt client is returned; otherwise the error propagates.
    2. If a cached entry exists but has expired, the stale entry is returned
       immediately and a refresh task is handed to the prompt cache so the
       entry is refreshed in the background.
    3. Otherwise the cached entry is returned as-is.

    Args:
        name (str): The name of the prompt to retrieve. Must not be empty.

    Keyword Args:
        version (Optional[int]): The version of the prompt to retrieve. If neither label nor version is specified, the `production` label is returned. Specify either version or label, not both.
        label: Optional[str]: The label of the prompt to retrieve. If neither label nor version is specified, the `production` label is returned. Specify either version or label, not both.
        cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
        type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text". Also selects the
            client class used when the fallback is returned.
        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None. An empty fallback is treated as "no fallback".
        max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
        fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.

    Returns:
        The prompt object retrieved from the cache or directly fetched if not cached, of type
        - TextPromptClient, if type argument is 'text'.
        - ChatPromptClient, if type argument is 'chat'.
        When the fallback is used, the returned client is created with ``is_fallback=True``.

    Raises:
        Error: If the SDK resources were not initialized.
        ValueError: If both ``version`` and ``label`` are given, or ``name`` is empty.
        Exception: Any exception raised while fetching an uncached prompt when no usable
            fallback was supplied. Failures while scheduling a refresh of an expired entry
            are logged as a warning and the stale entry is returned instead.
    """
    if self._resources is None:
        raise Error(
            "SDK is not correctly initialized. Check the init logs for more details."
        )
    if version is not None and label is not None:
        raise ValueError("Cannot specify both version and label at the same time.")

    if not name:
        raise ValueError("Prompt name cannot be empty.")

    cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
    bounded_max_retries = self._get_bounded_max_retries(
        max_retries, default_max_retries=2, max_retries_upper_bound=4
    )

    langfuse_logger.debug(f"Getting prompt '{cache_key}'")
    cached_prompt = self._resources.prompt_cache.get(cache_key)

    if cached_prompt is None or cache_ttl_seconds == 0:
        langfuse_logger.debug(
            f"Prompt '{cache_key}' not found in cache or caching disabled."
        )
        try:
            return self._fetch_prompt_and_update_cache(
                name,
                version=version,
                label=label,
                ttl_seconds=cache_ttl_seconds,
                max_retries=bounded_max_retries,
                fetch_timeout_seconds=fetch_timeout_seconds,
            )
        except Exception as e:
            if fallback:
                langfuse_logger.warning(
                    f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
                )

                fallback_client_args: Dict[str, Any] = {
                    "name": name,
                    "prompt": fallback,
                    "type": type,
                    "version": version or 0,
                    "config": {},
                    "labels": [label] if label else [],
                    "tags": [],
                }

                if type == "text":
                    return TextPromptClient(
                        prompt=Prompt_Text(**fallback_client_args),
                        is_fallback=True,
                    )

                if type == "chat":
                    return ChatPromptClient(
                        prompt=Prompt_Chat(**fallback_client_args),
                        is_fallback=True,
                    )

            # No usable fallback (none given, or an unknown prompt type):
            # surface the original fetch error to the caller.
            raise

    if cached_prompt.is_expired():
        langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
        try:
            # refresh prompt in background thread, refresh_prompt deduplicates tasks
            langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")

            def refresh_task() -> None:
                self._fetch_prompt_and_update_cache(
                    name,
                    version=version,
                    label=label,
                    ttl_seconds=cache_ttl_seconds,
                    max_retries=bounded_max_retries,
                    fetch_timeout_seconds=fetch_timeout_seconds,
                )

            self._resources.prompt_cache.add_refresh_prompt_task_if_current(
                cache_key,
                cached_prompt,
                refresh_task,
            )
            langfuse_logger.debug(
                f"Returning stale prompt '{cache_key}' from cache."
            )
            # return stale prompt
            return cached_prompt.value

        except Exception as e:
            langfuse_logger.warning(
                f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
            )
            # creation of refresh prompt task failed, return stale prompt
            return cached_prompt.value

    return cached_prompt.value
Get a prompt.
This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.
Arguments:
- name (str): The name of the prompt to retrieve.
Keyword Args:
- version (Optional[int]): The version of the prompt to retrieve. If neither label nor version is specified, the `production` label is returned. Specify either version or label, not both.
- label: Optional[str]: The label of the prompt to retrieve. If neither label nor version is specified, the `production` label is returned. Specify either version or label, not both.
- cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
- type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
- fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
- max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
- fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
Returns:
The prompt object retrieved from the cache or directly fetched if not cached or expired of type
- TextPromptClient, if type argument is 'text'.
- ChatPromptClient, if type argument is 'chat'.
Raises:
- Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
def create_prompt(
    self,
    *,
    name: str,
    prompt: Union[
        str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
    ],
    labels: List[str] = [],
    tags: Optional[List[str]] = None,
    type: Optional[Literal["chat", "text"]] = "text",
    config: Optional[Any] = None,
    commit_message: Optional[str] = None,
) -> PromptClient:
    """Create a new prompt in Langfuse.

    After the prompt has been created, the SDK prompt cache is invalidated
    for ``name`` (when SDK resources are available).

    Keyword Args:
        name : The name of the prompt to be created.
        prompt : The content of the prompt to be created. Must be a list of chat messages for
            type "chat" and a string otherwise.
        labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
        tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
        config: Additional structured data to be saved with the prompt. Defaults to None, which is
            sent as an empty dict.
        type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text". Any value
            other than "chat" is handled as a text prompt.
        commit_message: Optional string describing the change.

    Returns:
        TextPromptClient: The prompt if type argument is 'text'.
        ChatPromptClient: The prompt if type argument is 'chat'.

    Raises:
        ValueError: If ``prompt`` does not have the shape required by ``type``.
        Error: API errors are passed to ``handle_fern_exception`` and then re-raised.
    """
    try:
        langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")

        is_chat = type == "chat"
        request: Union[CreateChatPromptRequest, CreateTextPromptRequest]

        if is_chat:
            if not isinstance(prompt, list):
                raise ValueError(
                    "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
                )
            request = CreateChatPromptRequest(
                name=name,
                prompt=cast(Any, prompt),
                labels=labels,
                tags=tags,
                config=config or {},
                commit_message=commit_message,
                type=CreateChatPromptType.CHAT,
            )
        else:
            if not isinstance(prompt, str):
                raise ValueError("For 'text' type, 'prompt' must be a string.")
            request = CreateTextPromptRequest(
                name=name,
                prompt=prompt,
                labels=labels,
                tags=tags,
                config=config or {},
                commit_message=commit_message,
            )

        server_prompt = self.api.prompts.create(request=request)

        # Cached entries for this name may now be outdated, so drop them
        # before handing the new prompt back.
        if self._resources is not None:
            self._resources.prompt_cache.invalidate(name)

        if is_chat:
            return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
        return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))

    except Error as e:
        handle_fern_exception(e)
        raise
Create a new prompt in Langfuse.
Keyword Args:
- name : The name of the prompt to be created.
- prompt : The content of the prompt to be created.
- is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
- labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
- tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
- config: Additional structured data to be saved with the prompt. Defaults to None.
- type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
- commit_message: Optional string describing the change.
Returns:
TextPromptClient: The prompt if type argument is 'text'. ChatPromptClient: The prompt if type argument is 'chat'.
def update_prompt(
    self,
    *,
    name: str,
    version: int,
    new_labels: List[str] = [],
) -> Any:
    """Update an existing prompt version in Langfuse.

    The Langfuse SDK prompt cache is invalidated for all prompts with the
    specified name once the update call has returned.

    Args:
        name (str): The name of the prompt to update. It is URL-encoded before being sent to the API.
        version (int): The version number of the prompt to update.
        new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].

    Returns:
        Prompt: The updated prompt from the Langfuse API.

    """
    api_response = self.api.prompt_version.update(
        name=self._url_encode(name),
        version=version,
        new_labels=new_labels,
    )

    resources = self._resources
    if resources is None:
        # No SDK resources means there is no prompt cache to invalidate.
        return api_response

    resources.prompt_cache.invalidate(name)
    return api_response
Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
Arguments:
- name (str): The name of the prompt to update.
- version (int): The version number of the prompt to update.
- new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:
Prompt: The updated prompt from the Langfuse API.
def clear_prompt_cache(self) -> None:
    """Remove every entry from the prompt cache.

    Use this to force all prompts to be fetched from the server again, for
    example after major prompt updates. Does nothing when the SDK resources
    are not initialized.
    """
    resources = self._resources
    if resources is None:
        return

    resources.prompt_cache.clear()
Clear the entire prompt cache, removing all cached prompts.
This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.
class LangfuseMedia:
    """A class for wrapping media objects for upload to Langfuse.

    This class handles the preparation and formatting of media content for Langfuse,
    supporting base64 data URIs, raw content bytes, and files read from disk.

    Args:
        obj (Optional[object]): The source object to be wrapped. Can be accessed via the `obj` attribute.
        base64_data_uri (Optional[str]): A base64-encoded data URI containing the media content
            and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
        content_type (Optional[str]): The MIME type of the media content when providing raw bytes
            or a file path.
        content_bytes (Optional[bytes]): Raw bytes of the media content.
        file_path (Optional[str]): The path to the file containing the media content. For relative paths,
            the current working directory is used.

    Note:
        The constructor does not raise when no usable input is given. It logs an
        error and leaves the content bytes, content type, source and media id as None.
    """

    obj: object

    _content_bytes: Optional[bytes]
    _content_type: Optional[MediaContentType]
    _source: Optional[str]
    _media_id: Optional[str]

    def __init__(
        self,
        *,
        obj: Optional[object] = None,
        base64_data_uri: Optional[str] = None,
        content_type: Optional[MediaContentType] = None,
        content_bytes: Optional[bytes] = None,
        file_path: Optional[str] = None,
    ):
        """Initialize a LangfuseMedia object.

        The inputs are tried in this order: ``base64_data_uri``, then
        ``content_bytes`` with ``content_type``, then ``file_path`` with
        ``content_type`` (the file must exist). If none applies, an error is
        logged and the media is left empty.

        Args:
            obj: The object to wrap.

            base64_data_uri: A base64-encoded data URI containing the media content
                and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
            content_type: The MIME type of the media content when providing raw bytes or reading from a file.
            content_bytes: Raw bytes of the media content.
            file_path: The path to the file containing the media content. For relative paths,
                the current working directory is used.
        """
        self.obj = obj

        if base64_data_uri is not None:
            parsed_data = self._parse_base64_data_uri(base64_data_uri)
            self._content_bytes, self._content_type = parsed_data
            self._source = "base64_data_uri"

        elif content_bytes is not None and content_type is not None:
            self._content_type = content_type
            self._content_bytes = content_bytes
            self._source = "bytes"
        elif (
            file_path is not None
            and content_type is not None
            and os.path.exists(file_path)
        ):
            self._content_bytes = self._read_file(file_path)
            # An unreadable or empty file leaves type and source unset.
            self._content_type = content_type if self._content_bytes else None
            self._source = "file" if self._content_bytes else None
        else:
            logger.error(
                "base64_data_uri, or content_bytes and content_type, or file_path must be provided to LangfuseMedia"
            )

            self._content_bytes = None
            self._content_type = None
            self._source = None

        self._media_id = self._get_media_id()

    def _read_file(self, file_path: str) -> Optional[bytes]:
        """Return the bytes of ``file_path``, or None (after logging) on any error."""
        try:
            with open(file_path, "rb") as file:
                return file.read()
        except Exception as e:
            logger.error(f"Error reading file at path {file_path}", exc_info=e)

            return None

    def _get_media_id(self) -> Optional[str]:
        """Derive the media id from the content hash, or None without content."""
        content_hash = self._content_sha256_hash

        if content_hash is None:
            return None

        # Convert hash to base64Url
        url_safe_content_hash = content_hash.replace("+", "-").replace("/", "_")

        return url_safe_content_hash[:22]

    @property
    def _content_length(self) -> Optional[int]:
        """Number of content bytes; None when the content is missing or empty."""
        return len(self._content_bytes) if self._content_bytes else None

    @property
    def _content_sha256_hash(self) -> Optional[str]:
        """Base64-encoded SHA-256 digest of the content, or None without content."""
        if self._content_bytes is None:
            return None

        sha256_hash_bytes = hashlib.sha256(self._content_bytes).digest()

        return base64.b64encode(sha256_hash_bytes).decode("utf-8")

    @property
    def _reference_string(self) -> Optional[str]:
        """Reference string for this media, or None if any component is missing."""
        if self._content_type is None or self._source is None or self._media_id is None:
            return None

        return f"@@@langfuseMedia:type={self._content_type}|id={self._media_id}|source={self._source}@@@"

    @staticmethod
    def parse_reference_string(reference_string: str) -> ParsedMediaReference:
        """Parse a media reference string into a ParsedMediaReference.

        Example reference string:
            "@@@langfuseMedia:type=image/jpeg|id=some-uuid|source=base64_data_uri@@@"

        Args:
            reference_string: The reference string to parse.

        Returns:
            A TypedDict with the media_id, source, and content_type.

        Raises:
            ValueError: If the reference string is empty or not a string.
            ValueError: If the reference string does not start with "@@@langfuseMedia:type=".
            ValueError: If the reference string does not end with "@@@".
            ValueError: If a "|"-separated part is not a key=value pair.
            ValueError: If the reference string is missing required fields.
        """
        prefix = "@@@langfuseMedia:"
        suffix = "@@@"

        if not reference_string:
            raise ValueError("Reference string is empty")

        if not isinstance(reference_string, str):
            raise ValueError("Reference string is not a string")

        if not reference_string.startswith("@@@langfuseMedia:type="):
            raise ValueError(
                "Reference string does not start with '@@@langfuseMedia:type='"
            )

        if not reference_string.endswith(suffix):
            raise ValueError("Reference string does not end with '@@@'")

        # Slice off exactly one trailing "@@@". str.rstrip("@@@") treats its
        # argument as a set of characters and would also remove any "@" that
        # happens to end the last field value.
        content = reference_string[len(prefix) : -len(suffix)]

        # Split into key-value pairs
        parsed_data = {}

        for pair in content.split("|"):
            key, separator, value = pair.partition("=")
            if not separator:
                raise ValueError(
                    f"Malformed key-value pair in reference string: '{pair}'"
                )
            parsed_data[key] = value

        # Verify all required fields are present
        if not all(key in parsed_data for key in ["type", "id", "source"]):
            raise ValueError("Missing required fields in reference string")

        return ParsedMediaReference(
            media_id=parsed_data["id"],
            source=parsed_data["source"],
            content_type=cast(MediaContentType, parsed_data["type"]),
        )

    def _parse_base64_data_uri(
        self, data: str
    ) -> Tuple[Optional[bytes], Optional[MediaContentType]]:
        """Decode a base64 data URI into (content bytes, content type).

        Any parsing or decoding problem is logged and reported as (None, None).
        """
        # Example data URI: data:image/jpeg;base64,/9j/4AAQ...
        try:
            if not data or not isinstance(data, str):
                raise ValueError("Data URI is not a string")

            if not data.startswith("data:"):
                raise ValueError("Data URI does not start with 'data:'")

            header, actual_data = data[5:].split(",", 1)
            if not header or not actual_data:
                raise ValueError("Invalid URI")

            # Split header into parts and check for base64
            header_parts = header.split(";")
            if "base64" not in header_parts:
                raise ValueError("Data is not base64 encoded")

            # Content type is the first part
            content_type = header_parts[0]
            if not content_type:
                raise ValueError("Content type is empty")

            return base64.b64decode(actual_data), cast(MediaContentType, content_type)

        except Exception as e:
            logger.error("Error parsing base64 data URI", exc_info=e)

            return None, None

    @staticmethod
    def resolve_media_references(
        *,
        obj: T,
        langfuse_client: "Langfuse",
        resolve_with: Literal["base64_data_uri"],
        max_depth: int = 10,
        content_fetch_timeout_seconds: int = 10,
    ) -> T:
        """Replace media reference strings in an object with base64 data URIs.

        This method recursively traverses an object (up to max_depth) looking for media reference strings
        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
        the provided Langfuse client and replaces the reference string with a base64 data URI.

        If fetching media content fails for a reference string, a warning is logged and the reference
        string is left unchanged.

        Args:
            obj: The object to process. Can be a primitive value, array, or nested object.
                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
            langfuse_client: Langfuse client instance used to fetch media content.
            resolve_with: The representation of the media content to replace the media reference string with.
                Currently only "base64_data_uri" is supported.
            max_depth: Optional. Default is 10. The maximum depth to traverse the object. Values nested
                deeper than this are returned unchanged.
            content_fetch_timeout_seconds: Optional. Default is 10. Timeout in seconds for each
                download of media content.

        Returns:
            The input with lists and dicts rebuilt and all media references replaced with base64 data URIs where possible.
            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

        Example:
            obj = {
                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
                "nested": {
                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
                }
            }

            result = LangfuseMedia.resolve_media_references(
                obj=obj,
                langfuse_client=langfuse_client,
                resolve_with="base64_data_uri",
            )

            # Result:
            # {
            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
            #     "nested": {
            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
            #     }
            # }
        """
        # Built once per call rather than once per visited string.
        reference_pattern = re.compile(r"@@@langfuseMedia:.+?@@@")

        def traverse(obj: Any, depth: int) -> Any:
            if depth > max_depth:
                return obj

            # Handle string
            if isinstance(obj, str):
                reference_string_matches = reference_pattern.findall(obj)
                if not reference_string_matches:
                    return obj

                result = obj
                reference_string_to_media_content = {}
                # Prefer the client's own httpx client when it has one.
                httpx_client = (
                    langfuse_client._resources.httpx_client
                    if langfuse_client._resources is not None
                    else None
                )

                for reference_string in reference_string_matches:
                    try:
                        parsed_media_reference = LangfuseMedia.parse_reference_string(
                            reference_string
                        )
                        media_data = langfuse_client.api.media.get(
                            parsed_media_reference["media_id"]
                        )
                        media_content = (
                            httpx_client.get(
                                media_data.url,
                                timeout=content_fetch_timeout_seconds,
                            )
                            if httpx_client is not None
                            else httpx.get(
                                media_data.url, timeout=content_fetch_timeout_seconds
                            )
                        )
                        media_content.raise_for_status()

                        base64_media_content = base64.b64encode(
                            media_content.content
                        ).decode()
                        base64_data_uri = f"data:{media_data.content_type};base64,{base64_media_content}"

                        reference_string_to_media_content[reference_string] = (
                            base64_data_uri
                        )
                    except Exception as e:
                        logger.warning(
                            f"Error fetching media content for reference string {reference_string}: {e}"
                        )
                        # Do not replace the reference string if there's an error
                        continue

                for (
                    ref_str,
                    media_content_str,
                ) in reference_string_to_media_content.items():
                    result = result.replace(ref_str, media_content_str)

                return result

            # Handle arrays
            if isinstance(obj, list):
                return [traverse(item, depth + 1) for item in obj]

            # Handle dictionaries
            if isinstance(obj, dict):
                return {key: traverse(value, depth + 1) for key, value in obj.items()}

            # Handle objects:
            if hasattr(obj, "__dict__"):
                return {
                    key: traverse(value, depth + 1)
                    for key, value in obj.__dict__.items()
                }

            return obj

        return cast(T, traverse(obj, 0))
A class for wrapping media objects for upload to Langfuse.
This class handles the preparation and formatting of media content for Langfuse, supporting both base64 data URIs and raw content bytes.
Arguments:
- obj (Optional[object]): The source object to be wrapped. Can be accessed via the `obj` attribute.
- base64_data_uri (Optional[str]): A base64-encoded data URI containing the media content and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
- content_type (Optional[str]): The MIME type of the media content when providing raw bytes.
- content_bytes (Optional[bytes]): Raw bytes of the media content.
- file_path (Optional[str]): The path to the file containing the media content. For relative paths, the current working directory is used.
Raises:
- ValueError: If neither base64_data_uri or the combination of content_bytes and content_type is provided.
126 def __init__( 127 self, 128 *, 129 obj: Optional[object] = None, 130 base64_data_uri: Optional[str] = None, 131 content_type: Optional[MediaContentType] = None, 132 content_bytes: Optional[bytes] = None, 133 file_path: Optional[str] = None, 134 ): 135 """Initialize a LangfuseMedia object. 136 137 Args: 138 obj: The object to wrap. 139 140 base64_data_uri: A base64-encoded data URI containing the media content 141 and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ..."). 142 content_type: The MIME type of the media content when providing raw bytes or reading from a file. 143 content_bytes: Raw bytes of the media content. 144 file_path: The path to the file containing the media content. For relative paths, 145 the current working directory is used. 146 """ 147 self.obj = obj 148 149 if base64_data_uri is not None: 150 parsed_data = self._parse_base64_data_uri(base64_data_uri) 151 self._content_bytes, self._content_type = parsed_data 152 self._source = "base64_data_uri" 153 154 elif content_bytes is not None and content_type is not None: 155 self._content_type = content_type 156 self._content_bytes = content_bytes 157 self._source = "bytes" 158 elif ( 159 file_path is not None 160 and content_type is not None 161 and os.path.exists(file_path) 162 ): 163 self._content_bytes = self._read_file(file_path) 164 self._content_type = content_type if self._content_bytes else None 165 self._source = "file" if self._content_bytes else None 166 else: 167 logger.error( 168 "base64_data_uri, or content_bytes and content_type, or file_path must be provided to LangfuseMedia" 169 ) 170 171 self._content_bytes = None 172 self._content_type = None 173 self._source = None 174 175 self._media_id = self._get_media_id()
Initialize a LangfuseMedia object.
Arguments:
- obj: The object to wrap.
- base64_data_uri: A base64-encoded data URI containing the media content and content type (e.g., "data:image/jpeg;base64,/9j/4AAQ...").
- content_type: The MIME type of the media content when providing raw bytes or reading from a file.
- content_bytes: Raw bytes of the media content.
- file_path: The path to the file containing the media content. For relative paths, the current working directory is used.
@staticmethod
def parse_reference_string(reference_string: str) -> ParsedMediaReference:
    """Parse a media reference string into a ParsedMediaReference.

    Example reference string:
        "@@@langfuseMedia:type=image/jpeg|id=some-uuid|source=base64_data_uri@@@"

    Args:
        reference_string: The reference string to parse.

    Returns:
        A TypedDict with the media_id, source, and content_type.

    Raises:
        ValueError: If the reference string is empty or not a string.
        ValueError: If the reference string does not start with "@@@langfuseMedia:type=".
        ValueError: If the reference string does not end with "@@@".
        ValueError: If a "|"-separated part is not a key=value pair.
        ValueError: If the reference string is missing required fields.
    """
    prefix = "@@@langfuseMedia:"
    suffix = "@@@"

    if not reference_string:
        raise ValueError("Reference string is empty")

    if not isinstance(reference_string, str):
        raise ValueError("Reference string is not a string")

    if not reference_string.startswith("@@@langfuseMedia:type="):
        raise ValueError(
            "Reference string does not start with '@@@langfuseMedia:type='"
        )

    if not reference_string.endswith(suffix):
        raise ValueError("Reference string does not end with '@@@'")

    # Slice off exactly one trailing "@@@". str.rstrip("@@@") treats its
    # argument as a set of characters and would also remove any "@" that
    # happens to end the last field value.
    content = reference_string[len(prefix) : -len(suffix)]

    # Split into key-value pairs
    parsed_data = {}

    for pair in content.split("|"):
        key, separator, value = pair.partition("=")
        if not separator:
            raise ValueError(
                f"Malformed key-value pair in reference string: '{pair}'"
            )
        parsed_data[key] = value

    # Verify all required fields are present
    if not all(key in parsed_data for key in ["type", "id", "source"]):
        raise ValueError("Missing required fields in reference string")

    return ParsedMediaReference(
        media_id=parsed_data["id"],
        source=parsed_data["source"],
        content_type=cast(MediaContentType, parsed_data["type"]),
    )
Parse a media reference string into a ParsedMediaReference.
Example reference string:
"@@@langfuseMedia:type=image/jpeg|id=some-uuid|source=base64_data_uri@@@"
Arguments:
- reference_string: The reference string to parse.
Returns:
A TypedDict with the media_id, source, and content_type.
Raises:
- ValueError: If the reference string is empty or not a string.
- ValueError: If the reference string does not start with "@@@langfuseMedia:type=".
- ValueError: If the reference string does not end with "@@@".
- ValueError: If the reference string is missing required fields.
302 @staticmethod 303 def resolve_media_references( 304 *, 305 obj: T, 306 langfuse_client: "Langfuse", 307 resolve_with: Literal["base64_data_uri"], 308 max_depth: int = 10, 309 content_fetch_timeout_seconds: int = 10, 310 ) -> T: 311 """Replace media reference strings in an object with base64 data URIs. 312 313 This method recursively traverses an object (up to max_depth) looking for media reference strings 314 in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using 315 the provided Langfuse client and replaces the reference string with a base64 data URI. 316 317 If fetching media content fails for a reference string, a warning is logged and the reference 318 string is left unchanged. 319 320 Args: 321 obj: The object to process. Can be a primitive value, array, or nested object. 322 If the object has a __dict__ attribute, a dict will be returned instead of the original object type. 323 langfuse_client: Langfuse client instance used to fetch media content. 324 resolve_with: The representation of the media content to replace the media reference string with. 325 Currently only "base64_data_uri" is supported. 326 max_depth: Optional. Default is 10. The maximum depth to traverse the object. 327 328 Returns: 329 A deep copy of the input object with all media references replaced with base64 data URIs where possible. 330 If the input object has a __dict__ attribute, a dict will be returned instead of the original object type. 331 332 Example: 333 obj = { 334 "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", 335 "nested": { 336 "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" 337 } 338 } 339 340 result = await LangfuseMedia.resolve_media_references(obj, langfuse_client) 341 342 # Result: 343 # { 344 # "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...", 345 # "nested": { 346 # "pdf": "data:application/pdf;base64,JVBERi0xLjcK..." 
347 # } 348 # } 349 """ 350 351 def traverse(obj: Any, depth: int) -> Any: 352 if depth > max_depth: 353 return obj 354 355 # Handle string 356 if isinstance(obj, str): 357 regex = r"@@@langfuseMedia:.+?@@@" 358 reference_string_matches = re.findall(regex, obj) 359 if len(reference_string_matches) == 0: 360 return obj 361 362 result = obj 363 reference_string_to_media_content = {} 364 httpx_client = ( 365 langfuse_client._resources.httpx_client 366 if langfuse_client._resources is not None 367 else None 368 ) 369 370 for reference_string in reference_string_matches: 371 try: 372 parsed_media_reference = LangfuseMedia.parse_reference_string( 373 reference_string 374 ) 375 media_data = langfuse_client.api.media.get( 376 parsed_media_reference["media_id"] 377 ) 378 media_content = ( 379 httpx_client.get( 380 media_data.url, 381 timeout=content_fetch_timeout_seconds, 382 ) 383 if httpx_client is not None 384 else httpx.get( 385 media_data.url, timeout=content_fetch_timeout_seconds 386 ) 387 ) 388 media_content.raise_for_status() 389 390 base64_media_content = base64.b64encode( 391 media_content.content 392 ).decode() 393 base64_data_uri = f"data:{media_data.content_type};base64,{base64_media_content}" 394 395 reference_string_to_media_content[reference_string] = ( 396 base64_data_uri 397 ) 398 except Exception as e: 399 logger.warning( 400 f"Error fetching media content for reference string {reference_string}: {e}" 401 ) 402 # Do not replace the reference string if there's an error 403 continue 404 405 for ( 406 ref_str, 407 media_content_str, 408 ) in reference_string_to_media_content.items(): 409 result = result.replace(ref_str, media_content_str) 410 411 return result 412 413 # Handle arrays 414 if isinstance(obj, list): 415 return [traverse(item, depth + 1) for item in obj] 416 417 # Handle dictionaries 418 if isinstance(obj, dict): 419 return {key: traverse(value, depth + 1) for key, value in obj.items()} 420 421 # Handle objects: 422 if hasattr(obj, "__dict__"): 
423 return { 424 key: traverse(value, depth + 1) 425 for key, value in obj.__dict__.items() 426 } 427 428 return obj 429 430 return cast(T, traverse(obj, 0))
Replace media reference strings in an object with base64 data URIs.
This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.
If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.
Arguments:
- obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
- langfuse_client: Langfuse client instance used to fetch media content.
- resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
- max_depth: Optional. Default is 10. The maximum depth to traverse the object.
Returns:
A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
Example:
obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }
result = LangfuseMedia.resolve_media_references(obj, langfuse_client)
Result:
{
"image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
"nested": {
"pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
}
}
@dataclass(frozen=True)
class LangfuseMediaReference:
    """Immutable handle to a media object stored in Langfuse.

    Carries the media id, its content type and the signed download URL,
    together with optional expiry / size / reference-string metadata. The
    ``fetch_*`` helpers download the content behind ``url``.
    """

    media_id: str
    content_type: str
    url: str
    url_expiry: Optional[str] = None
    content_length: Optional[int] = None
    reference_string: Optional[str] = None

    def is_url_expired(self) -> bool:
        """Tell whether the signed URL's expiry timestamp has passed.

        Returns:
            ``True`` only when ``url_expiry`` parses as an ISO-8601 timestamp
            that is not in the future. A missing or unparseable expiry is
            reported as not expired.
        """
        raw_expiry = self.url_expiry
        if raw_expiry is None:
            return False

        try:
            # A "Z" suffix is rewritten to an explicit UTC offset before
            # handing the string to ``fromisoformat``.
            expires_at = datetime.fromisoformat(raw_expiry.replace("Z", "+00:00"))
        except ValueError:
            return False

        # Timestamps without an offset are interpreted as UTC.
        if expires_at.tzinfo is None:
            expires_at = expires_at.replace(tzinfo=timezone.utc)

        return datetime.now(timezone.utc) >= expires_at

    def fetch_bytes(
        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
    ) -> bytes:
        """Download the media content from the signed URL.

        Args:
            timeout: Request timeout in seconds.
            client: Optional httpx client to issue the request with. Supply it
                to honor custom transport settings (proxy, CA bundle, mTLS),
                in particular when several Langfuse clients are configured,
                because the SDK cannot otherwise tell which client produced
                this reference. When omitted, the single configured client is
                used, falling back to a default httpx request.

        Returns:
            The raw response body.

        Raises:
            Whatever ``response.raise_for_status()`` raises for an error
            response, plus any transport error from the request itself.
        """
        from langfuse._client.resource_manager import LangfuseResourceManager

        session = client
        if not session:
            session = LangfuseResourceManager.get_singleton_httpx_client()

        if session is None:
            response = httpx.get(self.url, timeout=timeout)
        else:
            response = session.get(self.url, timeout=timeout)

        response.raise_for_status()
        return response.content

    def fetch_base64(
        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
    ) -> str:
        """Download the media and return it base64-encoded, without a data URI prefix.

        See :meth:`fetch_bytes` for the ``client`` argument.
        """
        payload = self.fetch_bytes(timeout=timeout, client=client)
        return base64.b64encode(payload).decode()

    def fetch_data_uri(
        self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
    ) -> str:
        """Download the media and return it as a ``data:`` URI.

        See :meth:`fetch_bytes` for the ``client`` argument.
        """
        encoded = self.fetch_base64(timeout=timeout, client=client)
        return f"data:{self.content_type};base64,{encoded}"
Resolved reference to media stored in Langfuse.
def is_url_expired(self) -> bool:
    """Tell whether the signed URL's expiry timestamp has passed.

    Returns:
        ``True`` only when ``url_expiry`` parses as an ISO-8601 timestamp
        that is not in the future. A missing or unparseable expiry is
        reported as not expired.
    """
    raw_expiry = self.url_expiry
    if raw_expiry is None:
        return False

    try:
        # A "Z" suffix is rewritten to an explicit UTC offset before
        # handing the string to ``fromisoformat``.
        expires_at = datetime.fromisoformat(raw_expiry.replace("Z", "+00:00"))
    except ValueError:
        return False

    # Timestamps without an offset are interpreted as UTC.
    if expires_at.tzinfo is None:
        expires_at = expires_at.replace(tzinfo=timezone.utc)

    return datetime.now(timezone.utc) >= expires_at
Return whether the signed URL is already expired.
def fetch_bytes(
    self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
) -> bytes:
    """Download the media content from the signed URL.

    Args:
        timeout: Request timeout in seconds.
        client: Optional httpx client to issue the request with. Supply it
            to honor custom transport settings (proxy, CA bundle, mTLS),
            in particular when several Langfuse clients are configured,
            because the SDK cannot otherwise tell which client produced
            this reference. When omitted, the single configured client is
            used, falling back to a default httpx request.

    Returns:
        The raw response body.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error
        response, plus any transport error from the request itself.
    """
    from langfuse._client.resource_manager import LangfuseResourceManager

    session = client
    if not session:
        session = LangfuseResourceManager.get_singleton_httpx_client()

    if session is None:
        response = httpx.get(self.url, timeout=timeout)
    else:
        response = session.get(self.url, timeout=timeout)

    response.raise_for_status()
    return response.content
Fetch the media content from the signed URL.
Arguments:
- timeout: Request timeout in seconds.
- client: Optional httpx client to use for the request. Pass this to honor custom transport settings (proxy, CA bundle, mTLS) — in particular when multiple Langfuse clients are configured, since the SDK cannot otherwise tell which client produced this reference. When omitted, the single configured client is used, falling back to a default httpx client.
def fetch_base64(
    self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
) -> str:
    """Download the media and return it base64-encoded, without a data URI prefix.

    See :meth:`fetch_bytes` for the ``client`` argument.
    """
    payload = self.fetch_bytes(timeout=timeout, client=client)
    return base64.b64encode(payload).decode()
Fetch media and return raw base64 without a data URI prefix.
See fetch_bytes() for the client argument.
def fetch_data_uri(
    self, *, timeout: float = 30.0, client: Optional[httpx.Client] = None
) -> str:
    """Download the media and return it as a ``data:`` URI.

    See :meth:`fetch_bytes` for the ``client`` argument.
    """
    encoded = self.fetch_base64(timeout=timeout, client=client)
    return f"data:{self.content_type};base64,{encoded}"
Fetch media and return it as a data URI.
See fetch_bytes() for the client argument.
def get_client(*, public_key: Optional[str] = None) -> Langfuse:
    """Return the Langfuse client that matches the current setup.

    Resolution order (all performed while holding the resource-manager lock):

    1. Use ``public_key`` if given, otherwise the key bound to the current
       execution context (``_current_public_key``).
    2. With a key: return a client built from the registered instance for
       that key, or a tracing-disabled client if no such instance exists.
    3. Without a key: create a default client if none is registered, reuse
       the only registered instance if there is exactly one, and otherwise
       return a tracing-disabled client so data cannot leak across projects.

    Multi-project support is experimental - see Langfuse docs.

    Args:
        public_key (Optional[str]): Project identifier. Required to select a
            specific client when several are registered.

    Returns:
        Langfuse: The client for the resolved key, the default/single client,
        or a disabled client (``tracing_enabled=False``) in the ambiguous or
        unknown-key cases.

    Example:
        ```python
        # Single project
        client = get_client()  # Default client

        # In multi-project usage:
        client_a = get_client(public_key="project_a_key")  # Returns project A's client
        client_b = get_client(public_key="project_b_key")  # Returns project B's client

        # Without specific key in multi-project setup:
        client = get_client()  # Returns disabled client for safety
        ```
    """
    with LangfuseResourceManager._lock:
        registry = LangfuseResourceManager._instances

        # An explicit key wins; otherwise fall back to the execution context.
        resolved_key = public_key or _current_public_key.get(None)

        if resolved_key:
            known_instance: Optional[LangfuseResourceManager] = registry.get(
                resolved_key, None
            )

            if known_instance is not None:
                return _create_client_from_instance(known_instance, resolved_key)

            # Nothing registered under this key: hand back a no-op client.
            langfuse_logger.warning(
                f"No Langfuse client with public key {resolved_key} has been initialized. Skipping tracing for decorated function."
            )
            return Langfuse(
                tracing_enabled=False, public_key="fake", secret_key="fake"
            )

        instance_count = len(registry)

        if instance_count == 0:
            # First use in this process: create the default instance.
            return Langfuse()

        if instance_count == 1:
            # Rebuild from the instance so credentials passed via constructor
            # arguments (rather than the environment) are carried over.
            sole_instance = next(iter(registry.values()))
            return _create_client_from_instance(sole_instance)

        # Several clients but no key: refuse to guess which project is meant.
        langfuse_logger.warning(
            "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
        )
        return Langfuse(tracing_enabled=False, public_key="fake", secret_key="fake")
Get or create a Langfuse client instance.
Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
Behavior:
- Single project: Returns existing client or creates new one
- Multi-project: Requires public_key to return specific client
- No public_key in multi-project: Returns disabled client to prevent data leakage
The function uses a singleton pattern per public_key to conserve resources and maintain state.
Arguments:
- public_key (Optional[str]): Project identifier
- With key: Returns client for that project
- Without key: Returns single client or disabled client if multiple exist
Returns:
Langfuse: Client instance in one of three states: 1. Client for specified public_key 2. Default client for single-project setup 3. Disabled client when multiple projects exist without key
Security:
Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.
Example:
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
def observe(
    self,
    func: Optional[F] = None,
    *,
    name: Optional[str] = None,
    as_type: Optional[ObservationTypeLiteralNoEvent] = None,
    capture_input: Optional[bool] = None,
    capture_output: Optional[bool] = None,
    transform_to_string: Optional[Callable[[Iterable], str]] = None,
) -> Union[F, Callable[[F], F]]:
    """Decorate a sync or async function so each call is traced in Langfuse.

    Usable both bare (``@observe``) and with arguments (``@observe(...)``).
    Coroutine functions are routed to the async wrapper, everything else to
    the sync wrapper. Trace context is propagated via OpenTelemetry, so nested
    decorated calls show up as parent/child observations.

    Args:
        func (Optional[Callable]): The function to decorate. ``None`` when the
            decorator is used with parentheses.
        name (Optional[str]): Custom name for the created trace or span. If
            not provided, the function name is used.
        as_type (Optional[Literal]): Observation type. Supported values:
            "generation", "span", "agent", "tool", "chain", "retriever",
            "embedding", "evaluator", "guardrail". An unsupported value is
            logged as a warning and replaced with "span". The types
            "generation" and "embedding" create a span on which additional
            attributes such as model metrics can be set.
        capture_input (Optional[bool]): Whether to record the call's input.
            When ``None``, the value derived from the
            ``LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED`` environment
            setting is used (enabled unless it is "false" or "0",
            case-insensitive).
        capture_output (Optional[bool]): Same as ``capture_input``, for the
            call's output.
        transform_to_string (Optional[Callable[[Iterable], str]]): Forwarded
            unchanged to the underlying wrapper; presumably used to render
            iterable/streamed results as a string (confirm in the wrapper
            implementation).

    Returns:
        Callable: The wrapped function when ``func`` is given, otherwise a
        decorator that produces it.

    Raises:
        Exception: Propagates any exceptions from the wrapped function after
        logging them in the trace.

    Example:
        ```python
        @observe()
        def process_user_request(user_id, query):
            # Traced under the name "process_user_request"
            return get_response(query)

        @observe(name="answer-generation", as_type="generation")
        async def generate_answer(query):
            response = await openai.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": query}]
            )
            return response.choices[0].message.content
        ```

    Notes:
        - The decorator preserves the original function's signature,
          docstring, and return type.
        - Special keyword arguments can be passed at call time to control
          tracing: ``langfuse_trace_id``, ``langfuse_parent_observation_id``
          and ``langfuse_public_key`` (to pick a project when multiple
          clients exist).
    """
    allowed_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
    if as_type is not None and as_type not in allowed_types:
        logger.warning(
            f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(allowed_types))}. Defaulting to 'span'."
        )
        as_type = "span"

    # Process-wide default for I/O capture; explicit arguments override it.
    env_setting = os.environ.get(LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True")
    io_capture_default = env_setting.lower() not in ("false", "0")

    effective_capture_input = (
        io_capture_default if capture_input is None else capture_input
    )
    effective_capture_output = (
        io_capture_default if capture_output is None else capture_output
    )

    def decorator(func: F) -> F:
        if asyncio.iscoroutinefunction(func):
            return self._async_observe(
                func,
                name=name,
                as_type=as_type,
                capture_input=effective_capture_input,
                capture_output=effective_capture_output,
                transform_to_string=transform_to_string,
            )

        return self._sync_observe(
            func,
            name=name,
            as_type=as_type,
            capture_input=effective_capture_input,
            capture_output=effective_capture_output,
            transform_to_string=transform_to_string,
        )

    # Support both decorator spellings:
    #   @observe    -> Python passes the function directly, so wrap it now.
    #   @observe()  -> func is None; return the decorator for Python to apply.
    return decorator if func is None else decorator(func)
Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
Arguments:
- func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
- name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
- as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:
Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
Example:
For general function tracing with automatic naming:
@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)

For language model generation tracking:
@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content

For trace context propagation between functions:
@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
Raises:
- Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
- The decorator preserves the original function's signature, docstring, and return type.
- Proper parent-child relationships between spans are automatically maintained.
- Special keyword arguments can be passed to control tracing:
- langfuse_trace_id: Explicitly set the trace ID for this function call
- langfuse_parent_observation_id: Explicitly set the parent span ID
- langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
- For async functions, the decorator returns an async function wrapper.
- For sync functions, the decorator returns a synchronous wrapper.
def propagate_attributes(
    *,
    user_id: Optional[str] = None,
    session_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    version: Optional[str] = None,
    tags: Optional[List[str]] = None,
    trace_name: Optional[str] = None,
    environment: Optional[str] = None,
    as_baggage: bool = False,
) -> _AgnosticContextManager[Any]:
    """Propagate trace-level attributes to every span created in this context.

    The returned context manager applies the attributes to the currently
    active span and to all child spans created while it is active. It is the
    recommended way to set trace-wide values such as ``user_id``,
    ``session_id``, ``environment`` and metadata dimensions.

    **Enter this context as early as possible.** Spans that already exist
    (other than the active one) are NOT updated retroactively, and Langfuse
    aggregation queries (e.g. total cost by ``user_id``, filtering by
    ``session_id``) only include observations that carry the attribute.

    Args:
        user_id: User identifier for all spans in this context. Must be a
            US-ASCII string, ≤200 characters.
        session_id: Session identifier for all spans in this context (e.g. a
            conversation thread). Must be a US-ASCII string, ≤200 characters.
        metadata: Additional key-value metadata for all spans.
            - Keys must be US-ASCII strings
            - Values are coerced to strings
            - Coerced values must be ≤200 characters
            - Use for dimensions like internal correlating identifiers
            - AVOID: large payloads or sensitive data
        version: Version identifier for parts of your application that are
            independently versioned, e.g. agents.
        tags: List of tags to categorize the group of observations.
        trace_name: Name to assign to the trace. Must be a US-ASCII string,
            ≤200 characters.
        environment: Langfuse environment for spans created in this context.
            Must be a lowercase alphanumeric string with optional hyphens or
            underscores, ≤40 characters, and must not start with "langfuse".
            Maps to the first-class `langfuse.environment` attribute, not to
            trace metadata. A propagated environment takes precedence over
            the client default configured via `Langfuse(environment=...)` or
            `LANGFUSE_TRACING_ENVIRONMENT` while this context is active.
        as_baggage: If True, also propagate the attributes as OpenTelemetry
            baggage for cross-process/service propagation. **Security
            warning**: attribute values are then added to HTTP headers on ALL
            outbound requests, including `environment` as the
            `langfuse_environment` baggage entry. Only enable when the values
            are safe to transmit and cross-service tracing is needed.
            Default: False.

    Returns:
        Context manager that propagates attributes to all child spans.

    Example:
        ```python
        from langfuse import Langfuse, propagate_attributes

        langfuse = Langfuse()

        with langfuse.start_as_current_observation(name="user_workflow"):
            # Set attributes early so every later span inherits them
            with propagate_attributes(
                user_id="user_123",
                session_id="session_abc",
                environment="production",
                metadata={"experiment": "variant_a"},
            ):
                with langfuse.start_observation(name="llm_call"):
                    ...
        ```

    Note:
        - **Validation**: Invalid values are dropped with a warning logged;
          metadata values are coerced to strings before the 200 character
          limit is applied.
        - **OpenTelemetry**: Uses OpenTelemetry context propagation under the
          hood, so it is compatible with other OTel-instrumented libraries.

    Raises:
        No exceptions are raised. Invalid values are logged as warnings and dropped.
    """
    propagation_context = _propagate_attributes(
        user_id=user_id,
        session_id=session_id,
        metadata=metadata,
        version=version,
        tags=tags,
        trace_name=trace_name,
        environment=environment,
        as_baggage=as_baggage,
    )
    return propagation_context
Propagate trace-level attributes to all spans created within this context.
This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, environment, and metadata dimensions that should be consistently applied across all observations in a trace.
IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.
Why this matters: Langfuse aggregation queries (e.g., total cost by user_id,
filtering by session_id) only include observations that have the attribute set.
If you call propagate_attributes late in your workflow, earlier spans won't be
included in aggregations for that attribute.
Arguments:
- user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
- session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
- metadata: Additional key-value metadata to propagate to all spans.
- Keys must be US-ASCII strings
- Values are coerced to strings
- Coerced values must be ≤200 characters
- Use for dimensions like internal correlating identifiers
- AVOID: large payloads or sensitive data
- version: Version identifier for parts of your application that are independently versioned, e.g. agents
- tags: List of tags to categorize the group of observations
- trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
- environment: Langfuse environment to assign to spans created in this context. Must be a lowercase alphanumeric string with optional hyphens or underscores, must be ≤40 characters, and must not start with "langfuse". This maps to the first-class `langfuse.environment` attribute, not to trace metadata. Use it for request-scoped environments, for example when one shared proxy handles calls from dev, staging, qa, and prod. A propagated environment takes precedence over the local client default configured via `Langfuse(environment=...)` or `LANGFUSE_TRACING_ENVIRONMENT` for spans created while this propagation context is active.
- as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. This includes `environment` as the `langfuse_environment` baggage entry. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:
Context manager that propagates attributes to all child spans.
Example:
Basic usage with user and session tracking:
```python
from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_observation(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        environment="production",
        metadata={"experiment": "variant_a"}
    ):
        # All spans created here will have user_id, session_id, environment, and metadata
        with langfuse.start_observation(name="llm_call") as llm_span:
            # This span inherits user_id, session_id, environment, and experiment metadata
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...
```
Late propagation (anti-pattern):
```python
with langfuse.start_as_current_observation(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_observation(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_observation(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span
```
Cross-service propagation with baggage (advanced):
```python
# Service A - originating service
with langfuse.start_as_current_observation(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        environment="staging",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id, session_id, and environment are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate attributes to spans in Service B. If Service B has a local
# Langfuse environment configured, the propagated environment wins for
# spans created within this context.
```
Note:
- Validation: Attribute values (user_id, session_id, version, tags, trace_name) must be strings ≤200 characters. Environment must also match Langfuse's environment format: lowercase alphanumeric with optional hyphens or underscores, must be ≤40 characters, and it must not start with "langfuse". Metadata values are coerced to strings before the 200 character limit is applied. Invalid values will be dropped with a warning logged.
- OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
- No exceptions are raised. Invalid values are logged as warnings and dropped.
1267class LangfuseSpan(LangfuseObservationWrapper): 1268 """Standard span implementation for general operations in Langfuse. 1269 1270 This class represents a general-purpose span that can be used to trace 1271 any operation in your application. It extends the base LangfuseObservationWrapper 1272 with specific methods for creating child spans, generations, and updating 1273 span-specific attributes. If possible, use a more specific type for 1274 better observability and insights. 1275 """ 1276 1277 def __init__( 1278 self, 1279 *, 1280 otel_span: otel_trace_api.Span, 1281 langfuse_client: "Langfuse", 1282 input: Optional[Any] = None, 1283 output: Optional[Any] = None, 1284 metadata: Optional[Any] = None, 1285 environment: Optional[str] = None, 1286 release: Optional[str] = None, 1287 version: Optional[str] = None, 1288 level: Optional[SpanLevel] = None, 1289 status_message: Optional[str] = None, 1290 ): 1291 """Initialize a new LangfuseSpan. 1292 1293 Args: 1294 otel_span: The OpenTelemetry span to wrap 1295 langfuse_client: Reference to the parent Langfuse client 1296 input: Input data for the span (any JSON-serializable object) 1297 output: Output data from the span (any JSON-serializable object) 1298 metadata: Additional metadata to associate with the span 1299 environment: The tracing environment 1300 release: Release identifier for the application 1301 version: Version identifier for the code or component 1302 level: Importance level of the span (info, warning, error) 1303 status_message: Optional status message for the span 1304 """ 1305 super().__init__( 1306 otel_span=otel_span, 1307 as_type="span", 1308 langfuse_client=langfuse_client, 1309 input=input, 1310 output=output, 1311 metadata=metadata, 1312 environment=environment, 1313 release=release, 1314 version=version, 1315 level=level, 1316 status_message=status_message, 1317 )
Standard span implementation for general operations in Langfuse.
This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.
1277 def __init__( 1278 self, 1279 *, 1280 otel_span: otel_trace_api.Span, 1281 langfuse_client: "Langfuse", 1282 input: Optional[Any] = None, 1283 output: Optional[Any] = None, 1284 metadata: Optional[Any] = None, 1285 environment: Optional[str] = None, 1286 release: Optional[str] = None, 1287 version: Optional[str] = None, 1288 level: Optional[SpanLevel] = None, 1289 status_message: Optional[str] = None, 1290 ): 1291 """Initialize a new LangfuseSpan. 1292 1293 Args: 1294 otel_span: The OpenTelemetry span to wrap 1295 langfuse_client: Reference to the parent Langfuse client 1296 input: Input data for the span (any JSON-serializable object) 1297 output: Output data from the span (any JSON-serializable object) 1298 metadata: Additional metadata to associate with the span 1299 environment: The tracing environment 1300 release: Release identifier for the application 1301 version: Version identifier for the code or component 1302 level: Importance level of the span (info, warning, error) 1303 status_message: Optional status message for the span 1304 """ 1305 super().__init__( 1306 otel_span=otel_span, 1307 as_type="span", 1308 langfuse_client=langfuse_client, 1309 input=input, 1310 output=output, 1311 metadata=metadata, 1312 environment=environment, 1313 release=release, 1314 version=version, 1315 level=level, 1316 status_message=status_message, 1317 )
Initialize a new LangfuseSpan.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the span (any JSON-serializable object)
- output: Output data from the span (any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
1320class LangfuseGeneration(LangfuseObservationWrapper): 1321 """Specialized span implementation for AI model generations in Langfuse. 1322 1323 This class represents a generation span specifically designed for tracking 1324 AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized 1325 attributes for model details, token usage, and costs. 1326 """ 1327 1328 def __init__( 1329 self, 1330 *, 1331 otel_span: otel_trace_api.Span, 1332 langfuse_client: "Langfuse", 1333 input: Optional[Any] = None, 1334 output: Optional[Any] = None, 1335 metadata: Optional[Any] = None, 1336 environment: Optional[str] = None, 1337 release: Optional[str] = None, 1338 version: Optional[str] = None, 1339 level: Optional[SpanLevel] = None, 1340 status_message: Optional[str] = None, 1341 completion_start_time: Optional[datetime] = None, 1342 model: Optional[str] = None, 1343 model_parameters: Optional[Dict[str, MapValue]] = None, 1344 usage_details: Optional[Dict[str, int]] = None, 1345 cost_details: Optional[Dict[str, float]] = None, 1346 prompt: Optional[PromptClient] = None, 1347 ): 1348 """Initialize a new LangfuseGeneration span. 
1349 1350 Args: 1351 otel_span: The OpenTelemetry span to wrap 1352 langfuse_client: Reference to the parent Langfuse client 1353 input: Input data for the generation (e.g., prompts) 1354 output: Output from the generation (e.g., completions) 1355 metadata: Additional metadata to associate with the generation 1356 environment: The tracing environment 1357 release: Release identifier for the application 1358 version: Version identifier for the model or component 1359 level: Importance level of the generation (info, warning, error) 1360 status_message: Optional status message for the generation 1361 completion_start_time: When the model started generating the response 1362 model: Name/identifier of the AI model used (e.g., "gpt-4") 1363 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1364 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1365 cost_details: Cost information for the model call 1366 prompt: Associated prompt template from Langfuse prompt management 1367 """ 1368 super().__init__( 1369 as_type="generation", 1370 otel_span=otel_span, 1371 langfuse_client=langfuse_client, 1372 input=input, 1373 output=output, 1374 metadata=metadata, 1375 environment=environment, 1376 release=release, 1377 version=version, 1378 level=level, 1379 status_message=status_message, 1380 completion_start_time=completion_start_time, 1381 model=model, 1382 model_parameters=model_parameters, 1383 usage_details=usage_details, 1384 cost_details=cost_details, 1385 prompt=prompt, 1386 )
Specialized span implementation for AI model generations in Langfuse.
This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.
1328 def __init__( 1329 self, 1330 *, 1331 otel_span: otel_trace_api.Span, 1332 langfuse_client: "Langfuse", 1333 input: Optional[Any] = None, 1334 output: Optional[Any] = None, 1335 metadata: Optional[Any] = None, 1336 environment: Optional[str] = None, 1337 release: Optional[str] = None, 1338 version: Optional[str] = None, 1339 level: Optional[SpanLevel] = None, 1340 status_message: Optional[str] = None, 1341 completion_start_time: Optional[datetime] = None, 1342 model: Optional[str] = None, 1343 model_parameters: Optional[Dict[str, MapValue]] = None, 1344 usage_details: Optional[Dict[str, int]] = None, 1345 cost_details: Optional[Dict[str, float]] = None, 1346 prompt: Optional[PromptClient] = None, 1347 ): 1348 """Initialize a new LangfuseGeneration span. 1349 1350 Args: 1351 otel_span: The OpenTelemetry span to wrap 1352 langfuse_client: Reference to the parent Langfuse client 1353 input: Input data for the generation (e.g., prompts) 1354 output: Output from the generation (e.g., completions) 1355 metadata: Additional metadata to associate with the generation 1356 environment: The tracing environment 1357 release: Release identifier for the application 1358 version: Version identifier for the model or component 1359 level: Importance level of the generation (info, warning, error) 1360 status_message: Optional status message for the generation 1361 completion_start_time: When the model started generating the response 1362 model: Name/identifier of the AI model used (e.g., "gpt-4") 1363 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1364 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1365 cost_details: Cost information for the model call 1366 prompt: Associated prompt template from Langfuse prompt management 1367 """ 1368 super().__init__( 1369 as_type="generation", 1370 otel_span=otel_span, 1371 langfuse_client=langfuse_client, 1372 input=input, 1373 output=output, 1374 metadata=metadata, 1375 
environment=environment, 1376 release=release, 1377 version=version, 1378 level=level, 1379 status_message=status_message, 1380 completion_start_time=completion_start_time, 1381 model=model, 1382 model_parameters=model_parameters, 1383 usage_details=usage_details, 1384 cost_details=cost_details, 1385 prompt=prompt, 1386 )
Initialize a new LangfuseGeneration span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the generation (e.g., prompts)
- output: Output from the generation (e.g., completions)
- metadata: Additional metadata to associate with the generation
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
1389class LangfuseEvent(LangfuseObservationWrapper): 1390 """Specialized span implementation for Langfuse Events.""" 1391 1392 def __init__( 1393 self, 1394 *, 1395 otel_span: otel_trace_api.Span, 1396 langfuse_client: "Langfuse", 1397 input: Optional[Any] = None, 1398 output: Optional[Any] = None, 1399 metadata: Optional[Any] = None, 1400 environment: Optional[str] = None, 1401 release: Optional[str] = None, 1402 version: Optional[str] = None, 1403 level: Optional[SpanLevel] = None, 1404 status_message: Optional[str] = None, 1405 ): 1406 """Initialize a new LangfuseEvent span. 1407 1408 Args: 1409 otel_span: The OpenTelemetry span to wrap 1410 langfuse_client: Reference to the parent Langfuse client 1411 input: Input data for the event 1412 output: Output from the event 1413 metadata: Additional metadata to associate with the generation 1414 environment: The tracing environment 1415 release: Release identifier for the application 1416 version: Version identifier for the model or component 1417 level: Importance level of the generation (info, warning, error) 1418 status_message: Optional status message for the generation 1419 """ 1420 super().__init__( 1421 otel_span=otel_span, 1422 as_type="event", 1423 langfuse_client=langfuse_client, 1424 input=input, 1425 output=output, 1426 metadata=metadata, 1427 environment=environment, 1428 release=release, 1429 version=version, 1430 level=level, 1431 status_message=status_message, 1432 ) 1433 1434 def update( 1435 self, 1436 *, 1437 name: Optional[str] = None, 1438 input: Optional[Any] = None, 1439 output: Optional[Any] = None, 1440 metadata: Optional[Any] = None, 1441 version: Optional[str] = None, 1442 level: Optional[SpanLevel] = None, 1443 status_message: Optional[str] = None, 1444 completion_start_time: Optional[datetime] = None, 1445 model: Optional[str] = None, 1446 model_parameters: Optional[Dict[str, MapValue]] = None, 1447 usage_details: Optional[Dict[str, int]] = None, 1448 cost_details: Optional[Dict[str, 
float]] = None, 1449 prompt: Optional[PromptClient] = None, 1450 **kwargs: Any, 1451 ) -> "LangfuseEvent": 1452 """Update is not allowed for LangfuseEvent because events cannot be updated. 1453 1454 This method logs a warning and returns self without making changes. 1455 1456 Returns: 1457 self: Returns the unchanged LangfuseEvent instance 1458 """ 1459 langfuse_logger.warning( 1460 "Attempted to update LangfuseEvent observation. Events cannot be updated after creation." 1461 ) 1462 return self
Specialized span implementation for Langfuse Events.
1392 def __init__( 1393 self, 1394 *, 1395 otel_span: otel_trace_api.Span, 1396 langfuse_client: "Langfuse", 1397 input: Optional[Any] = None, 1398 output: Optional[Any] = None, 1399 metadata: Optional[Any] = None, 1400 environment: Optional[str] = None, 1401 release: Optional[str] = None, 1402 version: Optional[str] = None, 1403 level: Optional[SpanLevel] = None, 1404 status_message: Optional[str] = None, 1405 ): 1406 """Initialize a new LangfuseEvent span. 1407 1408 Args: 1409 otel_span: The OpenTelemetry span to wrap 1410 langfuse_client: Reference to the parent Langfuse client 1411 input: Input data for the event 1412 output: Output from the event 1413 metadata: Additional metadata to associate with the generation 1414 environment: The tracing environment 1415 release: Release identifier for the application 1416 version: Version identifier for the model or component 1417 level: Importance level of the generation (info, warning, error) 1418 status_message: Optional status message for the generation 1419 """ 1420 super().__init__( 1421 otel_span=otel_span, 1422 as_type="event", 1423 langfuse_client=langfuse_client, 1424 input=input, 1425 output=output, 1426 metadata=metadata, 1427 environment=environment, 1428 release=release, 1429 version=version, 1430 level=level, 1431 status_message=status_message, 1432 )
Initialize a new LangfuseEvent span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the event
- output: Output from the event
- metadata: Additional metadata to associate with the event
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the model or component
- level: Importance level of the event (info, warning, error)
- status_message: Optional status message for the event
1434 def update( 1435 self, 1436 *, 1437 name: Optional[str] = None, 1438 input: Optional[Any] = None, 1439 output: Optional[Any] = None, 1440 metadata: Optional[Any] = None, 1441 version: Optional[str] = None, 1442 level: Optional[SpanLevel] = None, 1443 status_message: Optional[str] = None, 1444 completion_start_time: Optional[datetime] = None, 1445 model: Optional[str] = None, 1446 model_parameters: Optional[Dict[str, MapValue]] = None, 1447 usage_details: Optional[Dict[str, int]] = None, 1448 cost_details: Optional[Dict[str, float]] = None, 1449 prompt: Optional[PromptClient] = None, 1450 **kwargs: Any, 1451 ) -> "LangfuseEvent": 1452 """Update is not allowed for LangfuseEvent because events cannot be updated. 1453 1454 This method logs a warning and returns self without making changes. 1455 1456 Returns: 1457 self: Returns the unchanged LangfuseEvent instance 1458 """ 1459 langfuse_logger.warning( 1460 "Attempted to update LangfuseEvent observation. Events cannot be updated after creation." 1461 ) 1462 return self
Update is not allowed for LangfuseEvent because events cannot be updated.
This method logs a warning and returns self without making changes.
Returns:
self: Returns the unchanged LangfuseEvent instance
28class LangfuseOtelSpanAttributes: 29 # Langfuse-Trace attributes 30 TRACE_NAME = "langfuse.trace.name" 31 TRACE_USER_ID = "user.id" 32 TRACE_SESSION_ID = "session.id" 33 TRACE_TAGS = "langfuse.trace.tags" 34 TRACE_PUBLIC = "langfuse.trace.public" 35 TRACE_METADATA = "langfuse.trace.metadata" 36 TRACE_INPUT = "langfuse.trace.input" 37 TRACE_OUTPUT = "langfuse.trace.output" 38 39 # Langfuse-observation attributes 40 OBSERVATION_TYPE = "langfuse.observation.type" 41 OBSERVATION_METADATA = "langfuse.observation.metadata" 42 OBSERVATION_LEVEL = "langfuse.observation.level" 43 OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message" 44 OBSERVATION_INPUT = "langfuse.observation.input" 45 OBSERVATION_OUTPUT = "langfuse.observation.output" 46 47 # Langfuse-observation of type Generation attributes 48 OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time" 49 OBSERVATION_MODEL = "langfuse.observation.model.name" 50 OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters" 51 OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details" 52 OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details" 53 OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name" 54 OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version" 55 56 # General 57 ENVIRONMENT = "langfuse.environment" 58 RELEASE = "langfuse.release" 59 VERSION = "langfuse.version" 60 61 # Internal 62 AS_ROOT = "langfuse.internal.as_root" 63 IS_APP_ROOT = "langfuse.internal.is_app_root" 64 65 # Experiments 66 EXPERIMENT_ID = "langfuse.experiment.id" 67 EXPERIMENT_NAME = "langfuse.experiment.name" 68 EXPERIMENT_DESCRIPTION = "langfuse.experiment.description" 69 EXPERIMENT_METADATA = "langfuse.experiment.metadata" 70 EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id" 71 EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id" 72 EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output" 73 EXPERIMENT_ITEM_METADATA = 
"langfuse.experiment.item.metadata" 74 EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
1465class LangfuseAgent(LangfuseObservationWrapper): 1466 """Agent observation for reasoning blocks that act on tools using LLM guidance.""" 1467 1468 def __init__(self, **kwargs: Any) -> None: 1469 """Initialize a new LangfuseAgent span.""" 1470 kwargs["as_type"] = "agent" 1471 super().__init__(**kwargs)
Agent observation for reasoning blocks that act on tools using LLM guidance.
1474class LangfuseTool(LangfuseObservationWrapper): 1475 """Tool observation representing external tool calls, e.g., calling a weather API.""" 1476 1477 def __init__(self, **kwargs: Any) -> None: 1478 """Initialize a new LangfuseTool span.""" 1479 kwargs["as_type"] = "tool" 1480 super().__init__(**kwargs)
Tool observation representing external tool calls, e.g., calling a weather API.
1483class LangfuseChain(LangfuseObservationWrapper): 1484 """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.""" 1485 1486 def __init__(self, **kwargs: Any) -> None: 1487 """Initialize a new LangfuseChain span.""" 1488 kwargs["as_type"] = "chain" 1489 super().__init__(**kwargs)
Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.
1501class LangfuseEmbedding(LangfuseObservationWrapper): 1502 """Embedding observation for LLM embedding calls, typically used before retrieval.""" 1503 1504 def __init__(self, **kwargs: Any) -> None: 1505 """Initialize a new LangfuseEmbedding span.""" 1506 kwargs["as_type"] = "embedding" 1507 super().__init__(**kwargs)
Embedding observation for LLM embedding calls, typically used before retrieval.
1510class LangfuseEvaluator(LangfuseObservationWrapper): 1511 """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.""" 1512 1513 def __init__(self, **kwargs: Any) -> None: 1514 """Initialize a new LangfuseEvaluator span.""" 1515 kwargs["as_type"] = "evaluator" 1516 super().__init__(**kwargs)
Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.
1492class LangfuseRetriever(LangfuseObservationWrapper): 1493 """Retriever observation for data retrieval steps, e.g. vector store or database queries.""" 1494 1495 def __init__(self, **kwargs: Any) -> None: 1496 """Initialize a new LangfuseRetriever span.""" 1497 kwargs["as_type"] = "retriever" 1498 super().__init__(**kwargs)
Retriever observation for data retrieval steps, e.g. vector store or database queries.
1519class LangfuseGuardrail(LangfuseObservationWrapper): 1520 """Guardrail observation for protection e.g. against jailbreaks or offensive content.""" 1521 1522 def __init__(self, **kwargs: Any) -> None: 1523 """Initialize a new LangfuseGuardrail span.""" 1524 kwargs["as_type"] = "guardrail" 1525 super().__init__(**kwargs)
Guardrail observation for protection e.g. against jailbreaks or offensive content.
101class Evaluation: 102 """Represents an evaluation result for an experiment item or an entire experiment run. 103 104 This class provides a strongly-typed way to create evaluation results in evaluator functions. 105 Users must use keyword arguments when instantiating this class. 106 107 Attributes: 108 name: Unique identifier for the evaluation metric. Should be descriptive 109 and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). 110 Used for aggregation and comparison across experiment runs. 111 value: The evaluation score or result. Can be: 112 - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42) 113 - String: For categorical results like "positive", "negative", "neutral" 114 - Boolean: For binary assessments like "passes_safety_check" 115 comment: Optional human-readable explanation of the evaluation result. 116 Useful for providing context, explaining scoring rationale, or noting 117 special conditions. Displayed in Langfuse UI for interpretability. 118 metadata: Optional structured metadata about the evaluation process. 119 Can include confidence scores, intermediate calculations, model versions, 120 or any other relevant technical details. 121 data_type: Optional score data type. Required if value is not NUMERIC. 122 One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC. 123 config_id: Optional Langfuse score config ID. 
124 125 Examples: 126 Basic accuracy evaluation: 127 ```python 128 from langfuse import Evaluation 129 130 def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): 131 if not expected_output: 132 return Evaluation(name="accuracy", value=0, comment="No expected output") 133 134 is_correct = output.strip().lower() == expected_output.strip().lower() 135 return Evaluation( 136 name="accuracy", 137 value=1.0 if is_correct else 0.0, 138 comment="Correct answer" if is_correct else "Incorrect answer" 139 ) 140 ``` 141 142 Multi-metric evaluator: 143 ```python 144 def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs): 145 return [ 146 Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"), 147 Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"), 148 Evaluation( 149 name="quality", 150 value=0.85, 151 comment="High quality response", 152 metadata={"confidence": 0.92, "model": "gpt-4"} 153 ) 154 ] 155 ``` 156 157 Categorical evaluation: 158 ```python 159 def sentiment_evaluator(*, input, output, **kwargs): 160 sentiment = analyze_sentiment(output) # Returns "positive", "negative", or "neutral" 161 return Evaluation( 162 name="sentiment", 163 value=sentiment, 164 comment=f"Response expresses {sentiment} sentiment", 165 data_type="CATEGORICAL" 166 ) 167 ``` 168 169 Failed evaluation with error handling: 170 ```python 171 def external_api_evaluator(*, input, output, **kwargs): 172 try: 173 score = external_api.evaluate(output) 174 return Evaluation(name="external_score", value=score) 175 except Exception as e: 176 return Evaluation( 177 name="external_score", 178 value=0, 179 comment=f"API unavailable: {e}", 180 metadata={"error": str(e), "retry_count": 3} 181 ) 182 ``` 183 184 Note: 185 All arguments must be passed as keywords. Positional arguments are not allowed 186 to ensure code clarity and prevent errors from argument reordering. 
187 """ 188 189 def __init__( 190 self, 191 *, 192 name: str, 193 value: Union[int, float, str, bool], 194 comment: Optional[str] = None, 195 metadata: Optional[Dict[str, Any]] = None, 196 data_type: Optional[ExperimentScoreType] = None, 197 config_id: Optional[str] = None, 198 ): 199 """Initialize an Evaluation with the provided data. 200 201 Args: 202 name: Unique identifier for the evaluation metric. 203 value: The evaluation score or result. 204 comment: Optional human-readable explanation of the result. 205 metadata: Optional structured metadata about the evaluation process. 206 data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN). 207 config_id: Optional Langfuse score config ID. 208 209 Note: 210 All arguments must be provided as keywords. Positional arguments will raise a TypeError. 211 """ 212 self.name = name 213 self.value = value 214 self.comment = comment 215 self.metadata = metadata 216 self.data_type = data_type 217 self.config_id = config_id
Represents an evaluation result for an experiment item or an entire experiment run.
This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.
Attributes:
- name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
- value: The evaluation score or result. Can be:
  - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
  - String: For categorical results like "positive", "negative", "neutral"
  - Boolean: For binary assessments like "passes_safety_check"
- comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
- metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
- data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
- config_id: Optional Langfuse score config ID.
Examples:
Basic accuracy evaluation:
```python
from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=0, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )
```
Multi-metric evaluator:
```python
def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]
```
Categorical evaluation:
```python
def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )
```
Failed evaluation with error handling:
```python
def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=0,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
```
Note:
All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.
def __init__(
    self,
    *,
    name: str,
    value: Union[int, float, str, bool],
    comment: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    data_type: Optional[ExperimentScoreType] = None,
    config_id: Optional[str] = None,
):
    """Store the evaluation fields on the new instance.

    Every parameter after the bare ``*`` is keyword-only, so passing any of
    them positionally raises ``TypeError``. The values are stored exactly as
    given; this constructor performs no validation or conversion.

    Args:
        name: Unique identifier for the evaluation metric.
        value: The evaluation score or result (int, float, str or bool).
        comment: Optional human-readable explanation of the result.
        metadata: Optional structured metadata about the evaluation process.
        data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
        config_id: Optional Langfuse score config ID.
    """
    # Metric identity and its result.
    self.name, self.value = name, value
    # Optional explanatory context for humans and for tooling.
    self.comment, self.metadata = comment, metadata
    # Optional score typing and score-config linkage.
    self.data_type, self.config_id = data_type, config_id
Initialize an Evaluation with the provided data.
Arguments:
- name: Unique identifier for the evaluation metric.
- value: The evaluation score or result.
- comment: Optional human-readable explanation of the result.
- metadata: Optional structured metadata about the evaluation process.
- data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
- config_id: Optional Langfuse score config ID.
Note:
All arguments must be provided as keywords. Positional arguments will raise a TypeError.
class EvaluatorInputs:
    """Input data structure for evaluators, returned by mapper functions.

    This class provides a uniform container for transforming API response
    objects (traces, observations) into the standardized format expected
    by evaluator functions. It ensures consistent access to input, output,
    expected output, and metadata regardless of the source entity type.

    Attributes:
        input: The input data that was provided to generate the output being evaluated.
            For traces, this might be the initial prompt or request. For observations,
            this could be the span's input. The exact meaning depends on your use case.
        output: The actual output that was produced and needs to be evaluated.
            For traces, this is typically the final response. For observations,
            this might be the generation output or span result.
        expected_output: Optional ground truth or expected result for comparison.
            Used by evaluators to assess correctness. May be None if no ground truth
            is available for the entity being evaluated.
        metadata: Optional structured metadata providing additional context for evaluation.
            Can include information about the entity, execution context, user attributes,
            or any other relevant data that evaluators might use.

    Examples:
        Simple mapper for traces:
        ```python
        from langfuse import EvaluatorInputs

        def trace_mapper(trace):
            return EvaluatorInputs(
                input=trace.input,
                output=trace.output,
                expected_output=None,  # No ground truth available
                metadata={"user_id": trace.user_id, "tags": trace.tags}
            )
        ```

        Mapper for observations extracting specific fields:
        ```python
        def observation_mapper(observation):
            # Extract input/output from observation's data
            input_data = observation.input if hasattr(observation, 'input') else None
            output_data = observation.output if hasattr(observation, 'output') else None

            return EvaluatorInputs(
                input=input_data,
                output=output_data,
                expected_output=None,
                metadata={
                    "observation_type": observation.type,
                    "model": observation.model,
                    "latency_ms": observation.end_time - observation.start_time
                }
            )
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        input: Any,
        output: Any,
        expected_output: Any = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """Initialize EvaluatorInputs with the provided data.

        The values are stored as given; no validation or copying is done.

        Args:
            input: The input data for evaluation.
            output: The output data to be evaluated.
            expected_output: Optional ground truth for comparison.
            metadata: Optional additional context for evaluation.

        Note:
            All arguments must be provided as keywords; a positional call
            raises ``TypeError`` because of the bare ``*`` in the signature.
        """
        self.input = input
        self.output = output
        self.expected_output = expected_output
        self.metadata = metadata
Input data structure for evaluators, returned by mapper functions.
This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.
Attributes:
- input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
- output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
- expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
- metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:
Simple mapper for traces:
```python
from langfuse import EvaluatorInputs

def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,  # No ground truth available
        metadata={"user_id": trace.user_id, "tags": trace.tags}
    )
```
Mapper for observations extracting specific fields:
```python
def observation_mapper(observation):
    # Extract input/output from observation's data
    input_data = observation.input if hasattr(observation, 'input') else None
    output_data = observation.output if hasattr(observation, 'output') else None

    return EvaluatorInputs(
        input=input_data,
        output=output_data,
        expected_output=None,
        metadata={
            "observation_type": observation.type,
            "model": observation.model,
            "latency_ms": observation.end_time - observation.start_time
        }
    )
```
Note:
All arguments must be passed as keywords when instantiating this class.
98 def __init__( 99 self, 100 *, 101 input: Any, 102 output: Any, 103 expected_output: Any = None, 104 metadata: Optional[Dict[str, Any]] = None, 105 ): 106 """Initialize EvaluatorInputs with the provided data. 107 108 Args: 109 input: The input data for evaluation. 110 output: The output data to be evaluated. 111 expected_output: Optional ground truth for comparison. 112 metadata: Optional additional context for evaluation. 113 114 Note: 115 All arguments must be provided as keywords. 116 """ 117 self.input = input 118 self.output = output 119 self.expected_output = expected_output 120 self.metadata = metadata
Initialize EvaluatorInputs with the provided data.
Arguments:
- input: The input data for evaluation.
- output: The output data to be evaluated.
- expected_output: Optional ground truth for comparison.
- metadata: Optional additional context for evaluation.
Note:
All arguments must be provided as keywords.
class MapperFunction(Protocol):
    """Protocol defining the interface for mapper functions in batch evaluation.

    Mapper functions transform API response objects (traces or observations)
    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
    allows you to define how to extract and structure evaluation data from different
    entity types.

    Mapper functions:
    - Must accept the entity (a trace or an observation) through a parameter named ``item``
    - Must tolerate additional keyword arguments (``**kwargs``)
    - Must return an EvaluatorInputs instance with input, output, expected_output, metadata
    - Can be either synchronous or asynchronous
    - Should handle missing or malformed data gracefully
    """

    def __call__(
        self,
        *,
        item: Union["TraceWithFullDetails", "ObservationsView"],
        # PEP 484: the annotation on ``**kwargs`` is the type of each VALUE,
        # so ``Any`` (not ``Dict[str, Any]``) is the correct spelling.
        **kwargs: Any,
    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
        """Transform an API response object into evaluator inputs.

        This method defines how to extract evaluation-relevant data from the raw
        API response object. The implementation should map entity-specific fields
        to the standardized input/output/expected_output/metadata structure.

        Args:
            item: The API response object to transform. The type depends on the scope:
                - TraceWithFullDetails: When evaluating traces
                - ObservationsView: When evaluating observations
            **kwargs: Additional keyword arguments. The protocol only requires
                that implementations accept them; which ones a caller supplies
                is not defined here.

        Returns:
            EvaluatorInputs: A structured container with:
                - input: The input data that generated the output
                - output: The output to be evaluated
                - expected_output: Optional ground truth for comparison
                - metadata: Optional additional context

            Can return either a direct EvaluatorInputs instance or an awaitable
            (for async mappers that need to fetch additional data).

        Examples:
            Basic trace mapper:
            ```python
            def map_trace(item, **kwargs):
                return EvaluatorInputs(
                    input=item.input,
                    output=item.output,
                    expected_output=None,
                    metadata={"trace_id": item.id, "user": item.user_id}
                )
            ```

            Observation mapper with conditional logic:
            ```python
            def map_observation(item, **kwargs):
                # Extract fields based on observation type
                if item.type == "GENERATION":
                    input_data = item.input
                    output_data = item.output
                else:
                    # For other types, use different fields
                    input_data = item.metadata.get("input")
                    output_data = item.metadata.get("output")

                return EvaluatorInputs(
                    input=input_data,
                    output=output_data,
                    expected_output=None,
                    metadata={"obs_id": item.id, "type": item.type}
                )
            ```

            Async mapper (if additional processing needed):
            ```python
            async def map_trace_async(item, **kwargs):
                # Could do async processing here if needed
                processed_output = await some_async_transformation(item.output)

                return EvaluatorInputs(
                    input=item.input,
                    output=processed_output,
                    expected_output=None,
                    metadata={"trace_id": item.id}
                )
            ```
        """
        ...
Protocol defining the interface for mapper functions in batch evaluation.
Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.
Mapper functions must:
- Accept a single item parameter (trace, observation)
- Return an EvaluatorInputs instance with input, output, expected_output, metadata
- Can be either synchronous or asynchronous
- Should handle missing or malformed data gracefully
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` that rejects protocols or installs the real ``__init__``.

    Behaviour, in order:

    1. If ``type(self)._is_protocol`` is true, raise ``TypeError``.
    2. If the class's ``__init__`` is no longer this function, return
       without doing anything.
    3. Otherwise walk the MRO for the first ``__init__`` that is not this
       function, store it on the class, and call it with the original
       arguments.

    Args:
        self: The instance being initialised.
        *args: Positional arguments forwarded to the resolved ``__init__``.
        **kwargs: Keyword arguments forwarded to the resolved ``__init__``.

    Raises:
        TypeError: If the class is flagged as a protocol.
    """
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        # Look only at what each base defines itself; a base without its own
        # `__init__` yields the sentinel and is skipped.
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    # Delegate to the `__init__` that was just installed on the class.
    cls.__init__(self, *args, **kwargs)
class CompositeEvaluatorFunction(Protocol):
    """Protocol defining the interface for composite evaluator functions.

    Composite evaluators create aggregate scores from multiple item-level evaluations.
    This is commonly used to compute weighted averages, combined metrics, or other
    composite assessments based on individual evaluation results.

    Composite evaluators:
    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
      plus the list of evaluations, all as keyword arguments
    - Must tolerate additional keyword arguments (``**kwargs``)
    - Return either a single Evaluation, a list of Evaluations, or a dict
    - Can be either synchronous or asynchronous
    - Have access to both raw item data and evaluation results
    """

    def __call__(
        self,
        *,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        expected_output: Optional[Any] = None,
        metadata: Optional[Dict[str, Any]] = None,
        evaluations: List[Evaluation],
        # PEP 484: the annotation on ``**kwargs`` is the type of each VALUE,
        # so ``Any`` (not ``Dict[str, Any]``) is the correct spelling.
        **kwargs: Any,
    ) -> Union[
        Evaluation,
        List[Evaluation],
        Dict[str, Any],
        Awaitable[Evaluation],
        Awaitable[List[Evaluation]],
        Awaitable[Dict[str, Any]],
    ]:
        r"""Create a composite evaluation from item-level evaluation results.

        This method combines multiple evaluation scores into a single composite metric.
        Common use cases include weighted averages, pass/fail decisions based on multiple
        criteria, or custom scoring logic that considers multiple dimensions.

        Args:
            input: The input data that was provided to the system being evaluated.
            output: The output generated by the system being evaluated.
            expected_output: The expected/reference output for comparison (if available).
            metadata: Additional metadata about the evaluation context.
            evaluations: List of evaluation results from item-level evaluators.
                Each evaluation contains name, value, comment, and metadata.
            **kwargs: Additional keyword arguments. The protocol only requires
                that implementations accept them; which ones a caller supplies
                is not defined here.

        Returns:
            Can return any of:
            - Evaluation: A single composite evaluation result
            - List[Evaluation]: Multiple composite evaluations
            - Dict: A dict that will be converted to an Evaluation
                - name: Identifier for the composite metric (e.g., "composite_score")
                - value: The computed composite value
                - comment: Optional explanation of how the score was computed
                - metadata: Optional details about the composition logic

            Can return either a direct result or an awaitable
            (for async composite evaluators).

        Examples:
            Simple weighted average:
            ```python
            def weighted_composite(*, input, output, expected_output, metadata, evaluations, **kwargs):
                weights = {
                    "accuracy": 0.5,
                    "relevance": 0.3,
                    "safety": 0.2
                }

                total_score = 0.0
                total_weight = 0.0

                for evaluation in evaluations:
                    if evaluation.name in weights and isinstance(evaluation.value, (int, float)):
                        total_score += evaluation.value * weights[evaluation.name]
                        total_weight += weights[evaluation.name]

                final_score = total_score / total_weight if total_weight > 0 else 0.0

                return Evaluation(
                    name="composite_score",
                    value=final_score,
                    comment=f"Weighted average of {len(evaluations)} metrics"
                )
            ```

            Pass/fail composite based on thresholds:
            ```python
            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations, **kwargs):
                # Must pass all criteria
                thresholds = {
                    "accuracy": 0.7,
                    "safety": 0.9,
                    "relevance": 0.6
                }

                passes = True
                failing_metrics = []

                for metric, threshold in thresholds.items():
                    eval_result = next((e for e in evaluations if e.name == metric), None)
                    if eval_result and isinstance(eval_result.value, (int, float)):
                        if eval_result.value < threshold:
                            passes = False
                            failing_metrics.append(metric)

                return Evaluation(
                    name="passes_all_checks",
                    value=passes,
                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
                    data_type="BOOLEAN"
                )
            ```

            Async composite with external scoring:
            ```python
            async def llm_composite(*, input, output, expected_output, metadata, evaluations, **kwargs):
                # Use LLM to synthesize multiple evaluation results
                eval_summary = "\n".join(
                    f"- {e.name}: {e.value}" for e in evaluations
                )

                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
                prompt += f"For the output: {output}\n"
                prompt += "Provide an overall quality score from 0-1."

                response = await openai.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}]
                )

                score = float(response.choices[0].message.content.strip())

                return Evaluation(
                    name="llm_composite_score",
                    value=score,
                    comment="LLM-synthesized composite score"
                )
            ```

            Context-aware composite:
            ```python
            def context_composite(*, input, output, expected_output, metadata, evaluations, **kwargs):
                # Adjust weighting based on metadata
                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}

                # If metadata indicates high importance, prioritize accuracy
                if metadata and metadata.get('importance') == 'high':
                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
                else:
                    weights = base_weights

                total = sum(
                    e.value * weights.get(e.name, 0)
                    for e in evaluations
                    if isinstance(e.value, (int, float))
                )

                return Evaluation(
                    name="weighted_composite",
                    value=total,
                    comment="Context-aware weighted composite"
                )
            ```
        """
        ...
Protocol defining the interface for composite evaluator functions.
Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.
Composite evaluators:
- Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
- Return either a single Evaluation, a list of Evaluations, or a dict
- Can be either synchronous or asynchronous
- Have access to both raw item data and evaluation results
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` that rejects protocols or installs the real ``__init__``.

    Behaviour, in order:

    1. If ``type(self)._is_protocol`` is true, raise ``TypeError``.
    2. If the class's ``__init__`` is no longer this function, return
       without doing anything.
    3. Otherwise walk the MRO for the first ``__init__`` that is not this
       function, store it on the class, and call it with the original
       arguments.

    Args:
        self: The instance being initialised.
        *args: Positional arguments forwarded to the resolved ``__init__``.
        **kwargs: Keyword arguments forwarded to the resolved ``__init__``.

    Raises:
        TypeError: If the class is flagged as a protocol.
    """
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        # Look only at what each base defines itself; a base without its own
        # `__init__` yields the sentinel and is skipped.
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    # Delegate to the `__init__` that was just installed on the class.
    cls.__init__(self, *args, **kwargs)
class EvaluatorStats:
    """Statistics for a single evaluator's performance during batch evaluation.

    This class tracks detailed metrics about how a specific evaluator performed
    across all items in a batch evaluation run. It helps identify evaluator issues,
    understand reliability, and optimize evaluation pipelines.

    Attributes:
        name: The name of the evaluator function (extracted from __name__).
        total_runs: Total number of times the evaluator was invoked.
        successful_runs: Number of times the evaluator completed successfully.
        failed_runs: Number of times the evaluator raised an exception or failed.
        total_scores_created: Total number of evaluation scores created by this evaluator.
            Can be higher than successful_runs if the evaluator returns multiple scores.

    Examples:
        Accessing evaluator stats from batch evaluation result:
        ```python
        result = client.run_batched_evaluation(...)

        for stats in result.evaluator_stats:
            print(f"Evaluator: {stats.name}")
            # total_runs defaults to 0, so guard the division
            if stats.total_runs:
                print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
            print(f"  Scores created: {stats.total_scores_created}")

            if stats.failed_runs > 0:
                print(f"  ⚠️ Failed {stats.failed_runs} times")
        ```

        Identifying problematic evaluators:
        ```python
        result = client.run_batched_evaluation(...)

        # Find evaluators with high failure rates
        for stats in result.evaluator_stats:
            # total_runs defaults to 0, so guard the division
            if stats.total_runs:
                failure_rate = stats.failed_runs / stats.total_runs
                if failure_rate > 0.1:  # More than 10% failures
                    print(f"⚠️ {stats.name} has {failure_rate:.1%} failure rate")
                    print(f"   Consider debugging or removing this evaluator")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        name: str,
        total_runs: int = 0,
        successful_runs: int = 0,
        failed_runs: int = 0,
        total_scores_created: int = 0,
    ):
        """Initialize EvaluatorStats with the provided metrics.

        All counters default to 0; the values are stored as given with no
        consistency check between them.

        Args:
            name: The evaluator function name.
            total_runs: Total number of evaluator invocations.
            successful_runs: Number of successful completions.
            failed_runs: Number of failures.
            total_scores_created: Total scores created by this evaluator.

        Note:
            All arguments must be provided as keywords.
        """
        self.name = name
        self.total_runs = total_runs
        self.successful_runs = successful_runs
        self.failed_runs = failed_runs
        self.total_scores_created = total_scores_created
Statistics for a single evaluator's performance during batch evaluation.
This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.
Attributes:
- name: The name of the evaluator function (extracted from __name__).
- total_runs: Total number of times the evaluator was invoked.
- successful_runs: Number of times the evaluator completed successfully.
- failed_runs: Number of times the evaluator raised an exception or failed.
- total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:
Accessing evaluator stats from batch evaluation result:
```python
result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    print(f"Evaluator: {stats.name}")
    print(f" Success rate: {stats.successful_runs / stats.total_runs:.1%}")
    print(f" Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f" ⚠️ Failed {stats.failed_runs} times")
```
Identifying problematic evaluators:
```python
result = client.run_batched_evaluation(...)

# Find evaluators with high failure rates
for stats in result.evaluator_stats:
    failure_rate = stats.failed_runs / stats.total_runs
    if failure_rate > 0.1:  # More than 10% failures
        print(f"⚠️ {stats.name} has {failure_rate:.1%} failure rate")
        print(f" Consider debugging or removing this evaluator")
```
Note:
All arguments must be passed as keywords when instantiating this class.
427 def __init__( 428 self, 429 *, 430 name: str, 431 total_runs: int = 0, 432 successful_runs: int = 0, 433 failed_runs: int = 0, 434 total_scores_created: int = 0, 435 ): 436 """Initialize EvaluatorStats with the provided metrics. 437 438 Args: 439 name: The evaluator function name. 440 total_runs: Total number of evaluator invocations. 441 successful_runs: Number of successful completions. 442 failed_runs: Number of failures. 443 total_scores_created: Total scores created by this evaluator. 444 445 Note: 446 All arguments must be provided as keywords. 447 """ 448 self.name = name 449 self.total_runs = total_runs 450 self.successful_runs = successful_runs 451 self.failed_runs = failed_runs 452 self.total_scores_created = total_scores_created
Initialize EvaluatorStats with the provided metrics.
Arguments:
- name: The evaluator function name.
- total_runs: Total number of evaluator invocations.
- successful_runs: Number of successful completions.
- failed_runs: Number of failures.
- total_scores_created: Total scores created by this evaluator.
Note:
All arguments must be provided as keywords.
class BatchEvaluationResumeToken:
    """Token for resuming a failed batch evaluation run.

    This class encapsulates all the information needed to resume a batch evaluation
    that was interrupted or failed partway through. It uses timestamp-based filtering
    to avoid re-processing items that were already evaluated, even if the underlying
    dataset changed between runs.

    Attributes:
        scope: The type of items being evaluated ("traces", "observations").
        filter: The original JSON filter string used to query items.
        last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item.
            Used to construct a filter that only fetches items after this timestamp.
        last_processed_id: The ID of the last successfully processed item, for reference.
        items_processed: Count of items successfully processed before interruption.

    Examples:
        Resuming a failed batch evaluation:
        ```python
        # Initial run that fails partway through
        result = None
        try:
            result = client.run_batched_evaluation(
                scope="traces",
                mapper=my_mapper,
                evaluators=[evaluator1, evaluator2],
                filter='{"tags": ["production"]}',
                max_items=10000
            )
        except Exception as e:
            print(f"Evaluation failed: {e}")

        # Save the resume token (only available if the call returned a result)
        if result is not None and result.resume_token:
            # Store resume token for later (e.g., in a file or database)
            import json
            with open("resume_token.json", "w") as f:
                json.dump({
                    "scope": result.resume_token.scope,
                    "filter": result.resume_token.filter,
                    "last_timestamp": result.resume_token.last_processed_timestamp,
                    "last_id": result.resume_token.last_processed_id,
                    "items_done": result.resume_token.items_processed
                }, f)

        # Later, resume from where it left off
        with open("resume_token.json") as f:
            token_data = json.load(f)

        resume_token = BatchEvaluationResumeToken(
            scope=token_data["scope"],
            filter=token_data["filter"],
            last_processed_timestamp=token_data["last_timestamp"],
            last_processed_id=token_data["last_id"],
            items_processed=token_data["items_done"]
        )

        # Resume the evaluation
        result = client.run_batched_evaluation(
            scope="traces",
            mapper=my_mapper,
            evaluators=[evaluator1, evaluator2],
            resume_from=resume_token
        )

        print(f"Processed {result.total_items_processed} additional items")
        ```

        Handling partial completion:
        ```python
        result = client.run_batched_evaluation(...)

        # Check the token before reading its fields
        if not result.completed and result.resume_token:
            token = result.resume_token
            print(f"Evaluation incomplete. Processed {token.items_processed} items")
            print(f"Last item: {token.last_processed_id}")
            print(f"Resume from: {token.last_processed_timestamp}")

            # Optionally retry automatically
            print("Retrying...")
            result = client.run_batched_evaluation(
                scope=token.scope,
                mapper=my_mapper,
                evaluators=my_evaluators,
                resume_from=token
            )
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
        The timestamp-based approach means that items created after the initial run
        but before the timestamp will be skipped. This is intentional to avoid
        duplicates and ensure consistent evaluation.
    """

    def __init__(
        self,
        *,
        scope: str,
        filter: Optional[str],
        last_processed_timestamp: str,
        last_processed_id: str,
        items_processed: int,
    ):
        """Initialize BatchEvaluationResumeToken with the provided state.

        All five arguments are required (``filter`` may be ``None`` but must
        be passed explicitly). The values are stored as given.

        Args:
            scope: The scope type ("traces", "observations").
            filter: The original JSON filter string.
            last_processed_timestamp: ISO 8601 timestamp of last processed item.
            last_processed_id: ID of last processed item.
            items_processed: Count of items processed before interruption.

        Note:
            All arguments must be provided as keywords.
        """
        self.scope = scope
        self.filter = filter
        self.last_processed_timestamp = last_processed_timestamp
        self.last_processed_id = last_processed_id
        self.items_processed = items_processed
Token for resuming a failed batch evaluation run.
This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.
Attributes:
- scope: The type of items being evaluated ("traces", "observations").
- filter: The original JSON filter string used to query items.
- last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
- last_processed_id: The ID of the last successfully processed item, for reference.
- items_processed: Count of items successfully processed before interruption.
Examples:
Resuming a failed batch evaluation:
```python
# Initial run that fails partway through
try:
    result = client.run_batched_evaluation(
        scope="traces",
        mapper=my_mapper,
        evaluators=[evaluator1, evaluator2],
        filter='{"tags": ["production"]}',
        max_items=10000
    )
except Exception as e:
    print(f"Evaluation failed: {e}")

# Save the resume token
if result.resume_token:
    # Store resume token for later (e.g., in a file or database)
    import json
    with open("resume_token.json", "w") as f:
        json.dump({
            "scope": result.resume_token.scope,
            "filter": result.resume_token.filter,
            "last_timestamp": result.resume_token.last_processed_timestamp,
            "last_id": result.resume_token.last_processed_id,
            "items_done": result.resume_token.items_processed
        }, f)

# Later, resume from where it left off
with open("resume_token.json") as f:
    token_data = json.load(f)

resume_token = BatchEvaluationResumeToken(
    scope=token_data["scope"],
    filter=token_data["filter"],
    last_processed_timestamp=token_data["last_timestamp"],
    last_processed_id=token_data["last_id"],
    items_processed=token_data["items_done"]
)

# Resume the evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    resume_from=resume_token
)

print(f"Processed {result.total_items_processed} additional items")
```
Handling partial completion:
```python
result = client.run_batched_evaluation(...)

if not result.completed:
    print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
    print(f"Last item: {result.resume_token.last_processed_id}")
    print(f"Resume from: {result.resume_token.last_processed_timestamp}")

    # Optionally retry automatically
    if result.resume_token:
        print("Retrying...")
        result = client.run_batched_evaluation(
            scope=result.resume_token.scope,
            mapper=my_mapper,
            evaluators=my_evaluators,
            resume_from=result.resume_token
        )
```
Note:
All arguments must be passed as keywords when instantiating this class. The timestamp-based approach means that items created after the initial run but before the timestamp will be skipped. This is intentional to avoid duplicates and ensure consistent evaluation.
def __init__(
    self,
    *,
    scope: str,
    filter: Optional[str],
    last_processed_timestamp: str,
    last_processed_id: str,
    items_processed: int,
):
    """Capture the state needed to resume an interrupted batch evaluation.

    Every argument is keyword-only and is stored unchanged on the
    instance under an attribute of the same name.

    Args:
        scope: The scope type ("traces", "observations").
        filter: The original JSON filter string.
        last_processed_timestamp: ISO 8601 timestamp of last processed item.
        last_processed_id: ID of last processed item.
        items_processed: Count of items processed before interruption.
    """
    # What was being evaluated, and with which filter.
    self.scope = scope
    self.filter = filter

    # Position of the last item that was handled.
    self.last_processed_timestamp = last_processed_timestamp
    self.last_processed_id = last_processed_id

    # Progress counter carried over from the interrupted run.
    self.items_processed = items_processed
Initialize BatchEvaluationResumeToken with the provided state.
Arguments:
- scope: The scope type ("traces", "observations").
- filter: The original JSON filter string.
- last_processed_timestamp: ISO 8601 timestamp of last processed item.
- last_processed_id: ID of last processed item.
- items_processed: Count of items processed before interruption.
Note:
All arguments must be provided as keywords.
class BatchEvaluationResult:
    r"""Complete result structure for batch evaluation execution.

    This class encapsulates comprehensive statistics and metadata about a batch
    evaluation run, including counts, evaluator-specific metrics, timing information,
    error details, and resume capability.

    Attributes:
        total_items_fetched: Total number of items fetched from the API.
        total_items_processed: Number of items successfully evaluated.
        total_items_failed: Number of items that failed during evaluation.
        total_scores_created: Total scores created by all item-level evaluators.
        total_composite_scores_created: Scores created by the composite evaluator.
        total_evaluations_failed: Number of individual evaluator failures across all items.
        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
        resume_token: Token for resuming if evaluation was interrupted (None if completed).
        completed: True if all items were processed, False if stopped early or failed.
        duration_seconds: Total time taken to execute the batch evaluation.
        failed_item_ids: List of IDs for items that failed evaluation.
        error_summary: Dictionary mapping error types to occurrence counts.
        has_more_items: True if max_items limit was reached but more items exist.
        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).

    Examples:
        Basic result inspection:
        ```python
        result = client.run_batched_evaluation(...)

        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
        print(f"Scores created: {result.total_scores_created}")
        print(f"Duration: {result.duration_seconds:.2f}s")
        print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
        ```

        Detailed analysis with evaluator stats:
        ```python
        result = client.run_batched_evaluation(...)

        print(f"\n📊 Batch Evaluation Results")
        print(f"{'='*50}")
        print(f"Items processed: {result.total_items_processed}")
        print(f"Items failed: {result.total_items_failed}")
        print(f"Scores created: {result.total_scores_created}")

        if result.total_composite_scores_created > 0:
            print(f"Composite scores: {result.total_composite_scores_created}")

        print(f"\n📈 Evaluator Performance:")
        for stats in result.evaluator_stats:
            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
            print(f"\n {stats.name}:")
            print(f" Success rate: {success_rate:.1%}")
            print(f" Scores created: {stats.total_scores_created}")
            if stats.failed_runs > 0:
                print(f" ⚠️ Failures: {stats.failed_runs}")

        if result.error_summary:
            print(f"\n⚠️ Errors encountered:")
            for error_type, count in result.error_summary.items():
                print(f" {error_type}: {count}")
        ```

        Handling incomplete runs:
        ```python
        result = client.run_batched_evaluation(...)

        if not result.completed:
            print("⚠️ Evaluation incomplete!")

            if result.resume_token:
                print(f"Processed {result.resume_token.items_processed} items before failure")
                print(f"Use resume_from parameter to continue from:")
                print(f" Timestamp: {result.resume_token.last_processed_timestamp}")
                print(f" Last ID: {result.resume_token.last_processed_id}")

        if result.has_more_items:
            print(f"ℹ️ More items available beyond max_items limit")
        ```

        Performance monitoring:
        ```python
        result = client.run_batched_evaluation(...)

        items_per_second = result.total_items_processed / result.duration_seconds
        avg_scores_per_item = result.total_scores_created / result.total_items_processed

        print(f"Performance metrics:")
        print(f" Throughput: {items_per_second:.2f} items/second")
        print(f" Avg scores/item: {avg_scores_per_item:.2f}")
        print(f" Total duration: {result.duration_seconds:.2f}s")

        if result.total_evaluations_failed > 0:
            failure_rate = result.total_evaluations_failed / (
                result.total_items_processed * len(result.evaluator_stats)
            )
            print(f" Evaluation failure rate: {failure_rate:.1%}")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        total_items_fetched: int,
        total_items_processed: int,
        total_items_failed: int,
        total_scores_created: int,
        total_composite_scores_created: int,
        total_evaluations_failed: int,
        evaluator_stats: List[EvaluatorStats],
        resume_token: Optional[BatchEvaluationResumeToken],
        completed: bool,
        duration_seconds: float,
        failed_item_ids: List[str],
        error_summary: Dict[str, int],
        has_more_items: bool,
        item_evaluations: Dict[str, List["Evaluation"]],
    ):
        """Initialize BatchEvaluationResult with comprehensive statistics.

        Args:
            total_items_fetched: Total items fetched from API.
            total_items_processed: Items successfully evaluated.
            total_items_failed: Items that failed evaluation.
            total_scores_created: Scores from item-level evaluators.
            total_composite_scores_created: Scores from composite evaluator.
            total_evaluations_failed: Individual evaluator failures.
            evaluator_stats: Per-evaluator statistics.
            resume_token: Token for resuming (None if completed).
            completed: Whether all items were processed.
            duration_seconds: Total execution time.
            failed_item_ids: IDs of failed items.
            error_summary: Error types and counts.
            has_more_items: Whether more items exist beyond max_items.
            item_evaluations: Dictionary mapping item IDs to their evaluation results.

        Note:
            All arguments must be provided as keywords.
        """
        self.total_items_fetched = total_items_fetched
        self.total_items_processed = total_items_processed
        self.total_items_failed = total_items_failed
        self.total_scores_created = total_scores_created
        self.total_composite_scores_created = total_composite_scores_created
        self.total_evaluations_failed = total_evaluations_failed
        self.evaluator_stats = evaluator_stats
        self.resume_token = resume_token
        self.completed = completed
        self.duration_seconds = duration_seconds
        self.failed_item_ids = failed_item_ids
        self.error_summary = error_summary
        self.has_more_items = has_more_items
        self.item_evaluations = item_evaluations

    def __str__(self) -> str:
        """Return a formatted string representation of the batch evaluation results.

        Sections that have nothing to report (failed items, composite scores,
        per-evaluator runs, throughput, errors, resume information) are left
        out, and every ratio is only computed when its denominator is
        positive.

        Returns:
            A multi-line string with a summary of the evaluation results.
        """
        lines = []
        lines.append("=" * 60)
        lines.append("Batch Evaluation Results")
        lines.append("=" * 60)

        # Summary statistics
        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
        lines.append(f"Duration: {self.duration_seconds:.2f}s")
        lines.append(f"\nItems fetched: {self.total_items_fetched}")
        lines.append(f"Items processed: {self.total_items_processed}")

        if self.total_items_failed > 0:
            lines.append(f"Items failed: {self.total_items_failed}")

        # Success rate
        if self.total_items_fetched > 0:
            success_rate = self.total_items_processed / self.total_items_fetched * 100
            lines.append(f"Success rate: {success_rate:.1f}%")

        # Scores created
        lines.append(f"\nScores created: {self.total_scores_created}")
        if self.total_composite_scores_created > 0:
            lines.append(f"Composite scores: {self.total_composite_scores_created}")

        total_scores = self.total_scores_created + self.total_composite_scores_created
        lines.append(f"Total scores: {total_scores}")

        # Evaluator statistics
        if self.evaluator_stats:
            lines.append("\nEvaluator Performance:")
            for stats in self.evaluator_stats:
                lines.append(f" {stats.name}:")
                if stats.total_runs > 0:
                    # total_runs is positive inside this branch, so the
                    # division needs no further zero check.
                    success_rate = stats.successful_runs / stats.total_runs * 100
                    lines.append(
                        f" Runs: {stats.successful_runs}/{stats.total_runs} "
                        f"({success_rate:.1f}% success)"
                    )
                    lines.append(f" Scores created: {stats.total_scores_created}")
                    if stats.failed_runs > 0:
                        lines.append(f" Failed runs: {stats.failed_runs}")

        # Performance metrics
        if self.total_items_processed > 0 and self.duration_seconds > 0:
            items_per_sec = self.total_items_processed / self.duration_seconds
            lines.append("\nPerformance:")
            lines.append(f" Throughput: {items_per_sec:.2f} items/second")
            if self.total_scores_created > 0:
                avg_scores = self.total_scores_created / self.total_items_processed
                lines.append(f" Avg scores per item: {avg_scores:.2f}")

        # Errors and warnings
        if self.error_summary:
            lines.append("\nErrors encountered:")
            for error_type, count in self.error_summary.items():
                lines.append(f" {error_type}: {count}")

        # Incomplete run information
        if not self.completed:
            lines.append("\nWarning: Evaluation incomplete")
            if self.resume_token:
                lines.append(
                    f" Last processed: {self.resume_token.last_processed_timestamp}"
                )
                lines.append(f" Items processed: {self.resume_token.items_processed}")
                lines.append(" Use resume_from parameter to continue")

        if self.has_more_items:
            lines.append("\nNote: More items available beyond max_items limit")

        lines.append("=" * 60)
        return "\n".join(lines)
Complete result structure for batch evaluation execution.
This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.
Attributes:
- total_items_fetched: Total number of items fetched from the API.
- total_items_processed: Number of items successfully evaluated.
- total_items_failed: Number of items that failed during evaluation.
- total_scores_created: Total scores created by all item-level evaluators.
- total_composite_scores_created: Scores created by the composite evaluator.
- total_evaluations_failed: Number of individual evaluator failures across all items.
- evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
- resume_token: Token for resuming if evaluation was interrupted (None if completed).
- completed: True if all items were processed, False if stopped early or failed.
- duration_seconds: Total time taken to execute the batch evaluation.
- failed_item_ids: List of IDs for items that failed evaluation.
- error_summary: Dictionary mapping error types to occurrence counts.
- has_more_items: True if max_items limit was reached but more items exist.
- item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:
Basic result inspection:
```python
result = client.run_batched_evaluation(...)

print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
print(f"Scores created: {result.total_scores_created}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
```

Detailed analysis with evaluator stats:

```python
result = client.run_batched_evaluation(...)

print(f"\n📊 Batch Evaluation Results")
print(f"{'='*50}")
print(f"Items processed: {result.total_items_processed}")
print(f"Items failed: {result.total_items_failed}")
print(f"Scores created: {result.total_scores_created}")

if result.total_composite_scores_created > 0:
    print(f"Composite scores: {result.total_composite_scores_created}")

print(f"\n📈 Evaluator Performance:")
for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
    print(f"\n {stats.name}:")
    print(f" Success rate: {success_rate:.1%}")
    print(f" Scores created: {stats.total_scores_created}")
    if stats.failed_runs > 0:
        print(f" ⚠️ Failures: {stats.failed_runs}")

if result.error_summary:
    print(f"\n⚠️ Errors encountered:")
    for error_type, count in result.error_summary.items():
        print(f" {error_type}: {count}")
```

Handling incomplete runs:

```python
result = client.run_batched_evaluation(...)

if not result.completed:
    print("⚠️ Evaluation incomplete!")

    if result.resume_token:
        print(f"Processed {result.resume_token.items_processed} items before failure")
        print(f"Use resume_from parameter to continue from:")
        print(f" Timestamp: {result.resume_token.last_processed_timestamp}")
        print(f" Last ID: {result.resume_token.last_processed_id}")

if result.has_more_items:
    print(f"ℹ️ More items available beyond max_items limit")
```

Performance monitoring:

```python
result = client.run_batched_evaluation(...)

items_per_second = result.total_items_processed / result.duration_seconds
avg_scores_per_item = result.total_scores_created / result.total_items_processed

print(f"Performance metrics:")
print(f" Throughput: {items_per_second:.2f} items/second")
print(f" Avg scores/item: {avg_scores_per_item:.2f}")
print(f" Total duration: {result.duration_seconds:.2f}s")

if result.total_evaluations_failed > 0:
    failure_rate = result.total_evaluations_failed / (
        result.total_items_processed * len(result.evaluator_stats)
    )
    print(f" Evaluation failure rate: {failure_rate:.1%}")
```
Note:
All arguments must be passed as keywords when instantiating this class.
def __init__(
    self,
    *,
    total_items_fetched: int,
    total_items_processed: int,
    total_items_failed: int,
    total_scores_created: int,
    total_composite_scores_created: int,
    total_evaluations_failed: int,
    evaluator_stats: List[EvaluatorStats],
    resume_token: Optional[BatchEvaluationResumeToken],
    completed: bool,
    duration_seconds: float,
    failed_item_ids: List[str],
    error_summary: Dict[str, int],
    has_more_items: bool,
    item_evaluations: Dict[str, List["Evaluation"]],
):
    """Store the statistics of one batch evaluation run on the instance.

    Every argument is keyword-only and is kept, unmodified, as an
    attribute of the same name.

    Args:
        total_items_fetched: Total items fetched from API.
        total_items_processed: Items successfully evaluated.
        total_items_failed: Items that failed evaluation.
        total_scores_created: Scores from item-level evaluators.
        total_composite_scores_created: Scores from composite evaluator.
        total_evaluations_failed: Individual evaluator failures.
        evaluator_stats: Per-evaluator statistics.
        resume_token: Token for resuming (None if completed).
        completed: Whether all items were processed.
        duration_seconds: Total execution time.
        failed_item_ids: IDs of failed items.
        error_summary: Error types and counts.
        has_more_items: Whether more items exist beyond max_items.
        item_evaluations: Dictionary mapping item IDs to their evaluation results.
    """
    # Item-level counters.
    self.total_items_fetched = total_items_fetched
    self.total_items_processed = total_items_processed
    self.total_items_failed = total_items_failed

    # Score and evaluator counters.
    self.total_scores_created = total_scores_created
    self.total_composite_scores_created = total_composite_scores_created
    self.total_evaluations_failed = total_evaluations_failed
    self.evaluator_stats = evaluator_stats

    # Run status and timing.
    self.resume_token = resume_token
    self.completed = completed
    self.duration_seconds = duration_seconds

    # Failure details and per-item results.
    self.failed_item_ids = failed_item_ids
    self.error_summary = error_summary
    self.has_more_items = has_more_items
    self.item_evaluations = item_evaluations
Initialize BatchEvaluationResult with comprehensive statistics.
Arguments:
- total_items_fetched: Total items fetched from API.
- total_items_processed: Items successfully evaluated.
- total_items_failed: Items that failed evaluation.
- total_scores_created: Scores from item-level evaluators.
- total_composite_scores_created: Scores from composite evaluator.
- total_evaluations_failed: Individual evaluator failures.
- evaluator_stats: Per-evaluator statistics.
- resume_token: Token for resuming (None if completed).
- completed: Whether all items were processed.
- duration_seconds: Total execution time.
- failed_item_ids: IDs of failed items.
- error_summary: Error types and counts.
- has_more_items: Whether more items exist beyond max_items.
- item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:
All arguments must be provided as keywords.
class RunnerContext:
    """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action builds a
    ``RunnerContext`` before invoking the user's ``experiment(context)``
    function. Defaults set here (dataset, metadata tags) are applied when
    the user omits them on the :meth:`run_experiment` call; users can
    override any default by passing the corresponding argument explicitly.
    """

    def __init__(
        self,
        *,
        client: "Langfuse",
        data: Optional[ExperimentData] = None,
        dataset_version: Optional[datetime] = None,
        metadata: Optional[Dict[str, str]] = None,
    ):
        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.

        Typically called by the ``langfuse/experiment-action`` GitHub Action,
        not by end users directly. Every field except ``client`` is optional:
        fields left as ``None`` simply mean the corresponding argument must be
        supplied on the :meth:`run_experiment` call.

        Args:
            client: Initialized Langfuse SDK client used to execute the
                experiment. The action creates this from the
                ``langfuse_public_key`` / ``langfuse_secret_key`` /
                ``langfuse_base_url`` inputs.
            data: Default dataset items to run the experiment on. Accepts
                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
                Injected by the action when ``dataset_name`` is configured.
                If ``None``, the user must pass ``data=`` to
                :meth:`run_experiment`.
            dataset_version: Optional pinned dataset version. Injected by the
                action when ``dataset_version`` is configured.
            metadata: Default metadata attached to every experiment trace and
                the dataset run. The action injects GitHub-sourced tags (SHA,
                PR link, workflow run link, branch, GH user, etc.). Merged
                with any ``metadata`` passed to :meth:`run_experiment`, with
                user-supplied keys winning on collision.
        """
        self.client = client
        self.data = data
        self.dataset_version = dataset_version
        self.metadata = metadata

    def run_experiment(
        self,
        *,
        name: str,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
        data: Optional[ExperimentData] = None,
        task: TaskFunction,
        evaluators: Optional[List[EvaluatorFunction]] = None,
        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
        run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
        max_concurrency: int = 50,
        metadata: Optional[Dict[str, str]] = None,
        _dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Run an experiment through the wrapped client, applying context defaults.

        ``data`` and ``_dataset_version`` fall back to the values stored on
        this context when they are not passed (``None``). ``metadata`` is
        merged with the context metadata; on a key collision the value
        passed to this call wins. All other arguments are forwarded to
        ``self.client.run_experiment`` unchanged.

        Args:
            name: Forwarded to the client as ``name``.
            run_name: Forwarded to the client as ``run_name``.
            description: Forwarded to the client as ``description``.
            data: Items to run on; defaults to the context's ``data``.
            task: Forwarded to the client as ``task``.
            evaluators: Forwarded to the client; a fresh empty list is used
                when omitted.
            composite_evaluator: Forwarded to the client as
                ``composite_evaluator``.
            run_evaluators: Forwarded to the client; a fresh empty list is
                used when omitted.
            max_concurrency: Forwarded to the client as ``max_concurrency``.
            metadata: Per-call metadata merged over the context metadata.
                If both are ``None``, ``None`` is forwarded.
            _dataset_version: Overrides the context's ``dataset_version``
                when not ``None``.

        Returns:
            Whatever ``self.client.run_experiment`` returns.

        Raises:
            ValueError: If ``data`` is provided neither here nor on the
                context.
        """
        resolved_data = data if data is not None else self.data
        if resolved_data is None:
            raise ValueError(
                "`data` must be provided either on the RunnerContext or the run_experiment call"
            )

        resolved_dataset_version = (
            _dataset_version if _dataset_version is not None else self.dataset_version
        )

        merged_metadata: Optional[Dict[str, str]]
        if self.metadata is None and metadata is None:
            merged_metadata = None
        else:
            # Later unpacking wins, so per-call keys override context keys.
            merged_metadata = {**(self.metadata or {}), **(metadata or {})}

        # Build the empty lists per call so no list object is shared
        # between separate invocations.
        return self.client.run_experiment(
            name=name,
            run_name=run_name,
            description=description,
            data=resolved_data,
            task=task,
            evaluators=evaluators if evaluators is not None else [],
            composite_evaluator=composite_evaluator,
            run_evaluators=run_evaluators if run_evaluators is not None else [],
            max_concurrency=max_concurrency,
            metadata=merged_metadata,
            _dataset_version=resolved_dataset_version,
        )
Wraps Langfuse.run_experiment() with CI-injected defaults.
Intended for use with the langfuse/experiment-action GitHub Action
(https://github.com/langfuse/experiment-action). The action builds a
RunnerContext before invoking the user's experiment(context)
function. Defaults set here (dataset, metadata tags) are applied when
the user omits them on the run_experiment() call; users can
override any default by passing the corresponding argument explicitly.
def __init__(
    self,
    *,
    client: "Langfuse",
    data: Optional[ExperimentData] = None,
    dataset_version: Optional[datetime] = None,
    metadata: Optional[Dict[str, str]] = None,
):
    """Create a ``RunnerContext`` holding defaults for ``run_experiment``.

    Normally constructed by the ``langfuse/experiment-action`` GitHub
    Action rather than by end users. Only ``client`` is required; any
    other field left as ``None`` means the matching argument has to be
    given on the :meth:`run_experiment` call instead.

    Args:
        client: Initialized Langfuse SDK client used to execute the
            experiment. The action creates this from the
            ``langfuse_public_key`` / ``langfuse_secret_key`` /
            ``langfuse_base_url`` inputs.
        data: Default dataset items to run the experiment on. Accepts
            either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
            Injected by the action when ``dataset_name`` is configured.
            If ``None``, the user must pass ``data=`` to
            :meth:`run_experiment`.
        dataset_version: Optional pinned dataset version. Injected by the
            action when ``dataset_version`` is configured.
        metadata: Default metadata attached to every experiment trace and
            the dataset run. The action injects GitHub-sourced tags (SHA,
            PR link, workflow run link, branch, GH user, etc.). Merged
            with any ``metadata`` passed to :meth:`run_experiment`, with
            user-supplied keys winning on collision.
    """
    # The client that actually executes experiments.
    self.client = client

    # Optional defaults consulted by run_experiment.
    self.data = data
    self.dataset_version = dataset_version
    self.metadata = metadata
Build a RunnerContext populated with defaults for run_experiment.
Typically called by the langfuse/experiment-action GitHub Action,
not by end users directly. Every field except client is optional:
fields left as None simply mean the corresponding argument must be
supplied on the run_experiment() call.
Arguments:
- client: Initialized Langfuse SDK client used to execute the experiment. The action creates this from the `langfuse_public_key` / `langfuse_secret_key` / `langfuse_base_url` inputs.
- data: Default dataset items to run the experiment on. Accepts either `List[LocalExperimentItem]` or `List[DatasetItem]`. Injected by the action when `dataset_name` is configured. If `None`, the user must pass `data=` to `run_experiment()`.
- dataset_version: Optional pinned dataset version. Injected by the action when `dataset_version` is configured.
- metadata: Default metadata attached to every experiment trace and the dataset run. The action injects GitHub-sourced tags (SHA, PR link, workflow run link, branch, GH user, etc.). Merged with any `metadata` passed to `run_experiment()`, with user-supplied keys winning on collision.
def run_experiment(
    self,
    *,
    name: str,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    data: Optional[ExperimentData] = None,
    task: TaskFunction,
    evaluators: Optional[List[EvaluatorFunction]] = None,
    composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
    run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
    max_concurrency: int = 50,
    metadata: Optional[Dict[str, str]] = None,
    _dataset_version: Optional[datetime] = None,
) -> ExperimentResult:
    """Run an experiment through the wrapped client, applying context defaults.

    ``data`` and ``_dataset_version`` fall back to ``self.data`` and
    ``self.dataset_version`` when they are not passed (``None``).
    ``metadata`` is merged with ``self.metadata``; on a key collision the
    value passed to this call wins. All other arguments are forwarded to
    ``self.client.run_experiment`` unchanged.

    Args:
        name: Forwarded to the client as ``name``.
        run_name: Forwarded to the client as ``run_name``.
        description: Forwarded to the client as ``description``.
        data: Items to run on; defaults to ``self.data``.
        task: Forwarded to the client as ``task``.
        evaluators: Forwarded to the client; a fresh empty list is used
            when omitted.
        composite_evaluator: Forwarded to the client as
            ``composite_evaluator``.
        run_evaluators: Forwarded to the client; a fresh empty list is
            used when omitted.
        max_concurrency: Forwarded to the client as ``max_concurrency``.
        metadata: Per-call metadata merged over ``self.metadata``. If both
            are ``None``, ``None`` is forwarded.
        _dataset_version: Overrides ``self.dataset_version`` when not
            ``None``.

    Returns:
        Whatever ``self.client.run_experiment`` returns.

    Raises:
        ValueError: If ``data`` is provided neither here nor on the
            context.
    """
    resolved_data = data if data is not None else self.data
    if resolved_data is None:
        raise ValueError(
            "`data` must be provided either on the RunnerContext or the run_experiment call"
        )

    resolved_dataset_version = (
        _dataset_version if _dataset_version is not None else self.dataset_version
    )

    merged_metadata: Optional[Dict[str, str]]
    if self.metadata is None and metadata is None:
        merged_metadata = None
    else:
        # Later unpacking wins, so per-call keys override context keys.
        merged_metadata = {**(self.metadata or {}), **(metadata or {})}

    # Build the empty lists per call so no list object is shared between
    # separate invocations.
    return self.client.run_experiment(
        name=name,
        run_name=run_name,
        description=description,
        data=resolved_data,
        task=task,
        evaluators=evaluators if evaluators is not None else [],
        composite_evaluator=composite_evaluator,
        run_evaluators=run_evaluators if run_evaluators is not None else [],
        max_concurrency=max_concurrency,
        metadata=merged_metadata,
        _dataset_version=resolved_dataset_version,
    )
class RegressionError(Exception):
    """Raised by a user's ``experiment`` function to signal a CI gate failure.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action catches this
    exception and, when ``should_fail_on_error`` is enabled, fails the
    workflow run and renders a callout in the PR comment using
    ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.

    Callers choose one of three forms:

    - ``RegressionError(result=r)`` — minimal, generic message.
    - ``RegressionError(result=r, message="...")`` — free-form message.
    - ``RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)`` —
      structured; ``metric`` and ``value`` must be provided together so the
      action can render a targeted callout without ``None`` placeholders.
    """

    @overload
    def __init__(self, *, result: ExperimentResult) -> None: ...

    @overload
    def __init__(self, *, result: ExperimentResult, message: str) -> None: ...

    @overload
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: str,
        value: float,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ) -> None: ...

    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: Optional[str] = None,
        value: Optional[float] = None,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ):
        """Store the regression details and build the exception message.

        The message is chosen in this order: an explicit ``message``; a
        structured message when both ``metric`` and ``value`` are given;
        otherwise a generic fallback. The threshold is mentioned in the
        structured message only when one was supplied.

        Args:
            result: The experiment result this regression refers to.
            metric: Name of the metric that regressed.
            value: Observed value of ``metric``.
            threshold: Optional threshold associated with ``metric``.
            message: Optional free-form message; takes precedence over
                the structured and generic messages.
        """
        self.result = result
        self.metric = metric
        self.value = value
        self.threshold = threshold
        if message is not None:
            formatted = message
        elif metric is not None and value is not None:
            formatted = f"Regression on `{metric}`: {value}"
            # threshold is optional in the structured form; leave the
            # clause out rather than printing "(threshold None)".
            if threshold is not None:
                formatted += f" (threshold {threshold})"
        else:
            formatted = "Experiment regression detected"
        super().__init__(formatted)
Raised by a user's experiment function to signal a CI gate failure.
Intended for use with the langfuse/experiment-action GitHub Action
(https://github.com/langfuse/experiment-action). The action catches this
exception and, when should_fail_on_error is enabled, fails the
workflow run and renders a callout in the PR comment using
metric/value/threshold if supplied, otherwise str(exc).
Callers choose one of three forms:
- `RegressionError(result=r)` — minimal, generic message.
- `RegressionError(result=r, message="...")` — free-form message.
- `RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)` — structured; `metric` and `value` must be provided together so the action can render a targeted callout without `None` placeholders.
def __init__(
    self,
    *,
    result: ExperimentResult,
    metric: Optional[str] = None,
    value: Optional[float] = None,
    threshold: Optional[float] = None,
    message: Optional[str] = None,
):
    """Store the regression details and build the exception message.

    The message is chosen in this order: an explicit ``message``; a
    structured message when both ``metric`` and ``value`` are given;
    otherwise a generic fallback. The threshold is mentioned in the
    structured message only when one was supplied.

    Args:
        result: The experiment result this regression refers to.
        metric: Name of the metric that regressed.
        value: Observed value of ``metric``.
        threshold: Optional threshold associated with ``metric``.
        message: Optional free-form message; takes precedence over the
            structured and generic messages.
    """
    self.result = result
    self.metric = metric
    self.value = value
    self.threshold = threshold
    if message is not None:
        formatted = message
    elif metric is not None and value is not None:
        formatted = f"Regression on `{metric}`: {value}"
        # threshold is optional in the structured form; leave the clause
        # out rather than printing "(threshold None)".
        if threshold is not None:
            formatted += f" (threshold {threshold})"
    else:
        formatted = "Experiment regression detected"
    super().__init__(formatted)
def is_default_export_span(span: ReadableSpan) -> bool:
    """Return whether a span should be exported by default.

    A span qualifies as soon as one of the three span predicates accepts
    it; they are tried in order and evaluation stops at the first match.
    """
    predicates = (is_langfuse_span, is_genai_span, is_known_llm_instrumentor)
    return any(predicate(span) for predicate in predicates)
Return whether a span should be exported by default.
def is_langfuse_span(span: ReadableSpan) -> bool:
    """Return whether the span was created by the Langfuse SDK tracer.

    A span without an instrumentation scope is never a Langfuse span;
    otherwise the scope name is compared with ``LANGFUSE_TRACER_NAME``.
    """
    scope = span.instrumentation_scope
    if scope is None:
        return False
    return scope.name == LANGFUSE_TRACER_NAME
Return whether the span was created by the Langfuse SDK tracer.
def is_genai_span(span: ReadableSpan) -> bool:
    """Return whether the span has any ``gen_ai.*`` semantic convention attribute.

    Only string attribute keys are considered, and a key matches when it
    starts with ``"gen_ai"``. A span whose ``attributes`` is ``None`` never
    matches.
    """
    attrs = span.attributes
    if attrs is None:
        return False

    for attr_name in attrs.keys():
        if isinstance(attr_name, str) and attr_name.startswith("gen_ai"):
            return True
    return False
Return whether the span has any gen_ai.* semantic convention attribute.
def is_known_llm_instrumentor(span: ReadableSpan) -> bool:
    """Return whether the span comes from a known LLM instrumentation scope.

    The span's instrumentation scope name is checked against every entry
    of ``KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES`` with
    ``_matches_scope_prefix``. A span without an instrumentation scope
    never matches.
    """
    scope = span.instrumentation_scope
    if scope is None:
        return False

    scope_name = scope.name
    for known_prefix in KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES:
        if _matches_scope_prefix(scope_name, known_prefix):
            return True
    return False
Return whether the span comes from a known LLM instrumentation scope.
class MaskOtelSpansFunction(Protocol):
    """Callable protocol for masking OpenTelemetry spans at export time.

    `mask_otel_spans` is invoked after Langfuse has decided which spans this
    client should export, and after export-stage media handling has turned
    supported media payloads into Langfuse media references. Only the spans
    exported by this Langfuse client are affected: if the same OpenTelemetry
    spans also go to another exporter, that exporter receives its own
    unmodified copy.

    The function is synchronous. It usually runs on the OpenTelemetry batch
    span processor worker thread, but during `flush()` and shutdown it may
    run on the caller thread. Keep it deterministic and fast, and do not
    rely on request locals, the current active span, or async I/O.

    Return `None` to leave the whole batch unchanged, or return a
    `MaskOtelSpansResult` holding sparse patches for just the spans that
    should change.

    Example:
        ```python
        from typing import Optional

        from langfuse import Langfuse
        from langfuse.types import (
            MaskOtelSpansParams,
            MaskOtelSpansResult,
            OtelSpanPatch,
        )

        def mask_otel_spans(
            *, params: MaskOtelSpansParams
        ) -> Optional[MaskOtelSpansResult]:
            patches = {}

            for identifier, span in params.spans.items():
                if span.instrumentation_scope_name == "openai":
                    patches[identifier] = OtelSpanPatch(
                        delete_attributes=(
                            "gen_ai.prompt.0.content",
                            "gen_ai.completion.0.content",
                        ),
                        set_attributes={"masking.applied": True},
                    )

            return MaskOtelSpansResult(span_patches=patches)

        langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
        ```
    """

    def __call__(
        self, *, params: MaskOtelSpansParams
    ) -> Optional[MaskOtelSpansResult]: ...
Function protocol for export-stage OpenTelemetry span masking.
mask_otel_spans runs after Langfuse decides which spans this client should
export and after export-stage media handling has converted supported media
payloads into Langfuse media references. It affects only the spans exported
by this Langfuse client. If the same OpenTelemetry spans are sent to another
exporter, that exporter receives its own unmodified copy.
The function is synchronous. It usually runs on the OpenTelemetry batch span
processor worker thread; during flush() and shutdown it may run on the
caller thread. Keep it deterministic and fast, and avoid relying on request
locals, the current active span, or async I/O.
Return None to leave the whole batch unchanged, or return
MaskOtelSpansResult with sparse patches for the spans that should change.
Example:
```python
from typing import Optional

from langfuse import Langfuse
from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)

def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}

    for identifier, span in params.spans.items():
        if span.instrumentation_scope_name == "openai":
            patches[identifier] = OtelSpanPatch(
                delete_attributes=(
                    "gen_ai.prompt.0.content",
                    "gen_ai.completion.0.content",
                ),
                set_attributes={"masking.applied": True},
            )

    return MaskOtelSpansResult(span_patches=patches)

langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
```
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder `__init__` that resolves and installs the real one.

    On the first instantiation of a class that still carries this placeholder,
    the MRO is scanned for the first explicitly defined `__init__` that is not
    the placeholder. That initializer is stored on the class, so later
    instantiations bypass this function, and is then invoked with the
    original arguments.

    Raises:
        TypeError: If the class of `self` is itself a protocol
            (`_is_protocol` is true).
    """
    owner = type(self)

    if owner._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # The class already has its own `__init__`, so there is nothing to
    # resolve. Continuing here can lead to RecursionError. See bpo-45121.
    if owner.__init__ is not _no_init_or_replace_init:
        return

    placeholder = _no_init_or_replace_init
    declared_inits = (
        base.__dict__.get('__init__', placeholder) for base in owner.__mro__
    )
    # The first real `__init__` in MRO order wins; the `object.__init__`
    # default is only a safety net and should not be reached in practice.
    owner.__init__ = next(
        (found for found in declared_inits if found is not placeholder),
        object.__init__,
    )

    owner.__init__(self, *args, **kwargs)
@dataclass(frozen=True)
class MaskOtelSpansParams:
    """Argument object handed to an export-stage span masking function.

    Each invocation covers exactly one OpenTelemetry export batch. A batch is
    not guaranteed to be a complete trace, request, or Langfuse observation
    tree; what ends up in it depends on OpenTelemetry span processor settings
    such as `flush_at` and `flush_interval`, on explicit `flush()` calls, and
    on shutdown.

    Example:
        ```python
        from typing import Optional

        from langfuse.types import (
            MaskOtelSpansParams,
            MaskOtelSpansResult,
            OtelSpanPatch,
        )

        def mask_otel_spans(
            *, params: MaskOtelSpansParams
        ) -> Optional[MaskOtelSpansResult]:
            patches = {}

            for identifier, span in params.spans.items():
                if "http.request.header.authorization" in span.attributes:
                    patches[identifier] = OtelSpanPatch(
                        delete_attributes=("http.request.header.authorization",),
                        set_attributes={"security.redacted": True},
                    )

            return MaskOtelSpansResult(span_patches=patches)
        ```

    Attributes:
        spans: Read-only mapping of stable span identifiers to span
            snapshots. Key the returned patches with identifiers taken from
            this mapping.
    """

    spans: Mapping[OtelSpanIdentifier, OtelSpanData]
Input passed to an export-stage OpenTelemetry span masking function.
A single call receives one OpenTelemetry export batch, not necessarily a
complete trace, request, or Langfuse observation tree. Batch contents depend
on OpenTelemetry span processor settings such as flush_at,
flush_interval, explicit flush(), and shutdown.
Example:
```python
from typing import Optional

from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)

def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}

    for identifier, span in params.spans.items():
        if "http.request.header.authorization" in span.attributes:
            patches[identifier] = OtelSpanPatch(
                delete_attributes=("http.request.header.authorization",),
                set_attributes={"security.redacted": True},
            )

    return MaskOtelSpansResult(span_patches=patches)
```
Attributes:
- spans: Read-only mapping from stable span identifiers to span snapshots. Return patches using keys from this mapping.
@dataclass(frozen=True)
class MaskOtelSpansResult:
    """Return value of a `mask_otel_spans` function: per-span patches.

    Only spans that need changes have to appear here. A span that is missing
    from the mapping, or whose mapping value is `None`, is exported
    unchanged. Deliberately returning an invalid patch in order to drop a
    span is not a supported API; use `should_export_span` for span-level
    export filtering.

    Failure handling:
        - If `mask_otel_spans` raises, or returns something that is not a
          `MaskOtelSpansResult`, Langfuse drops the whole export batch.
        - If a single `OtelSpanPatch` is invalid, Langfuse drops only that
          span from the export batch.

    Attributes:
        span_patches: Mapping from identifiers found in
            `MaskOtelSpansParams.spans` to sparse attribute patches. Defaults
            to an empty read-only mapping.
    """

    span_patches: Mapping[OtelSpanIdentifier, Optional[OtelSpanPatch]] = field(
        default_factory=lambda: MappingProxyType({}),
    )
Patches returned by a mask_otel_spans function.
Omit spans that do not need changes. A mapping value of None also leaves
that span unchanged. Returning an invalid patch to drop a span is not a
supported API; use should_export_span when you need span-level export
filtering.
If mask_otel_spans raises or returns an object that is not a
MaskOtelSpansResult, Langfuse drops the whole export batch. If one
individual OtelSpanPatch is invalid, Langfuse drops only that span from
the export batch.
Attributes:
- span_patches: Mapping from identifiers in `MaskOtelSpansParams.spans` to sparse attribute patches.
@dataclass(frozen=True)
class OtelSpanData:
    """Immutable snapshot of one OpenTelemetry span given to `mask_otel_spans`.

    The snapshot reflects what Langfuse is about to export, i.e. the span
    data after the SDK has applied `should_export_span` filtering and
    export-stage media processing. Its mappings are immutable views and
    mutating them is not supported; to alter what gets exported, return an
    `OtelSpanPatch`.

    Only span attributes can be changed by `mask_otel_spans`. The span name,
    IDs, parent relationship, resource attributes, events, links, and
    instrumentation scope cannot be changed.

    Attributes:
        trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
        span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
        parent_span_id: Lowercase hexadecimal ID of the parent span. `None`
            for a root span or when the parent is not available.
        name: OpenTelemetry span name.
        instrumentation_scope_name: Name of the instrumentation scope that
            emitted the span, for example `openai` or `langfuse`.
        instrumentation_scope_version: Version of the instrumentation scope,
            if the instrumentation library provided one.
        attributes: Read-only attributes that will be exported unless
            patched. Values use OpenTelemetry `AttributeValue` types:
            strings, booleans, numbers, or homogeneous sequences of those
            scalar values.
        resource_attributes: Read-only resource attributes from the span's
            OpenTelemetry resource. They are available for decisions only
            and cannot be patched through `mask_otel_spans`.
    """

    # Identity and position of the span within its trace.
    trace_id: str
    span_id: str
    parent_span_id: Optional[str]
    name: str

    # Instrumentation scope that emitted the span.
    instrumentation_scope_name: Optional[str]
    instrumentation_scope_version: Optional[str]

    # Read-only attribute views; only `attributes` can be patched.
    attributes: Mapping[str, AttributeValue]
    resource_attributes: Mapping[str, AttributeValue]
Read-only OpenTelemetry span snapshot passed to mask_otel_spans.
The snapshot contains the span data that Langfuse is about to export after
the SDK has applied should_export_span filtering and export-stage media
processing. The mappings are immutable views and mutating them is not
supported; return an OtelSpanPatch to change exported attributes.
mask_otel_spans can only change span attributes. It cannot change the
span name, IDs, parent relationship, resource attributes, events, links, or
instrumentation scope.
Attributes:
- trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
- span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
- parent_span_id: Lowercase hexadecimal parent span ID, or `None` for a root span or when the parent is not available.
- name: OpenTelemetry span name.
- instrumentation_scope_name: Name of the instrumentation scope that emitted the span, for example `openai` or `langfuse`.
- instrumentation_scope_version: Version of the instrumentation scope, if the instrumentation library provided one.
- attributes: Read-only attributes that will be exported unless patched. Values use OpenTelemetry `AttributeValue` types: strings, booleans, numbers, or homogeneous sequences of those scalar values.
- resource_attributes: Read-only resource attributes from the span's OpenTelemetry resource. These are available for decisions only and cannot be patched through `mask_otel_spans`.
@dataclass(frozen=True)
class OtelSpanIdentifier:
    """Hashable key that identifies one OpenTelemetry span in a masking batch.

    Patches returned from a masking function are keyed by this object.
    Because it is a frozen, hashable dataclass, the safest approach is to
    reuse the very identifier object obtained from
    `MaskOtelSpansParams.spans` rather than constructing a new one.

    Attributes:
        trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
        span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
    """

    trace_id: str
    span_id: str
Stable key for one OpenTelemetry span in a masking batch.
Use this object as the key when returning a patch for a span. It is a
frozen, hashable dataclass, so the safest pattern is to reuse the exact
identifier object from MaskOtelSpansParams.spans instead of rebuilding it.
Attributes:
- trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
- span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
@dataclass(frozen=True)
class OtelSpanPatch:
    """Sparse set of attribute edits for a single span, applied before export.

    List only the attributes that should change. Langfuse processes
    `delete_attributes` first and `set_attributes` second, so a key that
    appears in both fields is exported with the value from `set_attributes`.

    Attribute values must be valid OpenTelemetry attributes: strings,
    booleans, integers, floats, or homogeneous sequences of those scalar
    types. If a value is not valid for OpenTelemetry, Langfuse removes that
    attribute from the export rather than sending an invalid span.

    Example:
        ```python
        OtelSpanPatch(
            delete_attributes=("gen_ai.prompt.0.content",),
            set_attributes={
                "gen_ai.prompt.redacted": True,
                "app.masking.rule": "drop_prompt_text",
            },
        )
        ```

    Attributes:
        set_attributes: Attribute values to add or replace on the exported
            span. Defaults to an empty read-only mapping.
        delete_attributes: Attribute keys to remove from the exported span.
            Defaults to an empty tuple.
    """

    set_attributes: Mapping[str, AttributeValue] = field(
        default_factory=lambda: MappingProxyType({}),
    )
    delete_attributes: Sequence[str] = field(default_factory=tuple)
Attribute changes to apply to one OpenTelemetry span before export.
Patches are sparse: include only the attributes that should change. Langfuse
deletes delete_attributes first and then applies set_attributes, so a key
present in both fields is exported with the value from set_attributes.
Attribute values must be valid OpenTelemetry attributes: strings, booleans, integers, floats, or homogeneous sequences of those scalar types. If one value is not valid for OpenTelemetry, Langfuse removes that attribute from the export rather than sending an invalid span.
Example:
```python
OtelSpanPatch(
    delete_attributes=("gen_ai.prompt.0.content",),
    set_attributes={
        "gen_ai.prompt.redacted": True,
        "app.masking.rule": "drop_prompt_text",
    },
)
```
Attributes:
- set_attributes: Attribute values to add or replace on the exported span.
- delete_attributes: Attribute keys to remove from the exported span.