langfuse
""".. include:: ../README.md"""

from langfuse.batch_evaluation import (
    BatchEvaluationResult,
    BatchEvaluationResumeToken,
    CompositeEvaluatorFunction,
    EvaluatorInputs,
    EvaluatorStats,
    MapperFunction,
)
from langfuse.experiment import Evaluation

from ._client import client as _client_module
from ._client.attributes import LangfuseOtelSpanAttributes
from ._client.constants import ObservationTypeLiteral
from ._client.get_client import get_client
from ._client.observe import observe
from ._client.propagation import propagate_attributes
from ._client.span import (
    LangfuseAgent,
    LangfuseChain,
    LangfuseEmbedding,
    LangfuseEvaluator,
    LangfuseEvent,
    LangfuseGeneration,
    LangfuseGuardrail,
    LangfuseRetriever,
    LangfuseSpan,
    LangfuseTool,
)
from ._version import __version__
from .span_filter import (
    KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES,
    is_default_export_span,
    is_genai_span,
    is_known_llm_instrumentor,
    is_langfuse_span,
)

Langfuse = _client_module.Langfuse

__all__ = [
    "Langfuse",
    "get_client",
    "observe",
    "propagate_attributes",
    "ObservationTypeLiteral",
    "LangfuseSpan",
    "LangfuseGeneration",
    "LangfuseEvent",
    "LangfuseOtelSpanAttributes",
    "LangfuseAgent",
    "LangfuseTool",
    "LangfuseChain",
    "LangfuseEmbedding",
    "LangfuseEvaluator",
    "LangfuseRetriever",
    "LangfuseGuardrail",
    "Evaluation",
    "EvaluatorInputs",
    "MapperFunction",
    "CompositeEvaluatorFunction",
    "EvaluatorStats",
    "BatchEvaluationResumeToken",
    "BatchEvaluationResult",
    "__version__",
    "is_default_export_span",
    "is_langfuse_span",
    "is_genai_span",
    "is_known_llm_instrumentor",
    "KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES",
    "experiment",
    "api",
]
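A minimal usage sketch of the exported API (hedged: assumes credentials are provided via environment variables, and `answer_query` is a hypothetical function used only for illustration):

```python
# Credentials and endpoint are read from the environment, e.g.:
#   LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_BASE_URL
from langfuse import get_client, observe


@observe()  # traces calls to this function automatically
def answer_query(query: str) -> str:
    return f"answer for: {query}"


answer_query("What is Langfuse?")

# get_client() returns the thread-safe singleton client;
# flush() sends any batched spans before shutdown.
langfuse = get_client()
langfuse.flush()
```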
class Langfuse:
    """Main client for Langfuse tracing and platform features.

    This class provides an interface for creating and managing traces, spans,
    and generations in Langfuse as well as interacting with the Langfuse API.

    The client features a thread-safe singleton pattern for each unique public API key,
    ensuring consistent trace context propagation across your application. It implements
    efficient batching of spans with configurable flush settings and includes background
    thread management for media uploads and score ingestion.

    Configuration is flexible through either direct parameters or environment variables,
    with graceful fallbacks and runtime configuration updates.

    Attributes:
        api: Synchronous API client for Langfuse backend communication
        async_api: Asynchronous API client for Langfuse backend communication
        _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components

    Parameters:
        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
        mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
        blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead.
            Equivalent behavior:

            ```python
            from langfuse.span_filter import is_default_export_span

            blocked = {"sqlite", "requests"}

            should_export_span = lambda span: (
                is_default_export_span(span)
                and (
                    span.instrumentation_scope is None
                    or span.instrumentation_scope.name not in blocked
                )
            )
            ```
        should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If `span_exporter` is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
        tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
        span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire `base_url`, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include `x-langfuse-ingestion-version=4` on the exporter to enable real-time processing of exported spans.

    Example:
        ```python
        from langfuse import Langfuse

        # Initialize the client (reads from env vars if not provided)
        langfuse = Langfuse(
            public_key="your-public-key",
            secret_key="your-secret-key",
            base_url="https://cloud.langfuse.com",  # Optional, default shown
        )

        # Create a trace span
        with langfuse.start_as_current_observation(name="process-query") as span:
            # Your application code here

            # Create a nested generation span for an LLM call
            with span.start_as_current_generation(
                name="generate-response",
                model="gpt-4",
                input={"query": "Tell me about AI"},
                model_parameters={"temperature": 0.7, "max_tokens": 500}
            ) as generation:
                # Generate response here
                response = "AI is a field of computer science..."
                generation.update(
                    output=response,
                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
                    cost_details={"total_cost": 0.0023}
                )

                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
        ```
    """

    _resources: Optional[LangfuseResourceManager] = None
    _mask: Optional[MaskFunction] = None
    _otel_tracer: otel_trace_api.Tracer

    def __init__(
        self,
        *,
        public_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        base_url: Optional[str] = None,
        host: Optional[str] = None,
        timeout: Optional[int] = None,
        httpx_client: Optional[httpx.Client] = None,
        debug: bool = False,
        tracing_enabled: Optional[bool] = True,
        flush_at: Optional[int] = None,
        flush_interval: Optional[float] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        media_upload_thread_count: Optional[int] = None,
        sample_rate: Optional[float] = None,
        mask: Optional[MaskFunction] = None,
        blocked_instrumentation_scopes: Optional[List[str]] = None,
        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
        additional_headers: Optional[Dict[str, str]] = None,
        tracer_provider: Optional[TracerProvider] = None,
        span_exporter: Optional[SpanExporter] = None,
    ):
        self._base_url = (
            base_url
            or os.environ.get(LANGFUSE_BASE_URL)
            or host
            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
        )
        self._environment = environment or cast(
            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
        )
        self._release = (
            release
            or os.environ.get(LANGFUSE_RELEASE, None)
            or get_common_release_envs()
        )
        self._project_id: Optional[str] = None
        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
        if not 0.0 <= sample_rate <= 1.0:
            raise ValueError(
                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
            )

        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))

        self._tracing_enabled = (
            tracing_enabled
            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
        )
        if not self._tracing_enabled:
            langfuse_logger.info(
                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
            )

        debug = (
            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
        )
        if debug:
            logging.basicConfig(
                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            )
            langfuse_logger.setLevel(logging.DEBUG)

        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
        if public_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
            )
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
        if secret_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
            )
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
            langfuse_logger.warning(
                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
            )

        if blocked_instrumentation_scopes is not None:
            warnings.warn(
                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
                "Use `should_export_span` instead. Example: "
                "from langfuse.span_filter import is_default_export_span; "
                'blocked={"scope"}; should_export_span=lambda span: '
                "is_default_export_span(span) and (span.instrumentation_scope is None or "
                "span.instrumentation_scope.name not in blocked).",
                DeprecationWarning,
                stacklevel=2,
            )

        # Initialize api and tracer if requirements are met
        self._resources = LangfuseResourceManager(
            public_key=public_key,
            secret_key=secret_key,
            base_url=self._base_url,
            timeout=timeout,
            environment=self._environment,
            release=release,
            flush_at=flush_at,
            flush_interval=flush_interval,
            httpx_client=httpx_client,
            media_upload_thread_count=media_upload_thread_count,
            sample_rate=sample_rate,
            mask=mask,
            tracing_enabled=self._tracing_enabled,
            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
            should_export_span=should_export_span,
            additional_headers=additional_headers,
            tracer_provider=tracer_provider,
            span_exporter=span_exporter,
        )
        self._mask = self._resources.mask

        self._otel_tracer = (
            self._resources.tracer
            if self._tracing_enabled and self._resources.tracer is not None
            else otel_trace_api.NoOpTracer()
        )
        self.api = self._resources.api
        self.async_api = self._resources.async_api
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseGeneration: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseSpan: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseAgent: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["tool"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseTool: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseChain: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseRetriever: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseEvaluator: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseEmbedding: ...

    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseGuardrail: ...
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: ObservationTypeLiteralNoEvent = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> Union[
        LangfuseSpan,
        LangfuseGeneration,
        LangfuseAgent,
        LangfuseTool,
        LangfuseChain,
        LangfuseRetriever,
        LangfuseEvaluator,
        LangfuseEmbedding,
        LangfuseGuardrail,
    ]:
        """Create a new observation of the specified type.

        This method creates a new observation but does not set it as the current span in the
        context. To create and use an observation within a context, use start_as_current_observation().

        Args:
            trace_context: Optional context for connecting to an existing trace
            name: Name of the observation
            as_type: Type of observation to create (defaults to "span")
            input: Input data for the operation
            output: Output data from the operation
            metadata: Additional metadata to associate with the observation
            version: Version identifier for the code or component
            level: Importance level of the observation
            status_message: Optional status message for the observation
            completion_start_time: When the model started generating (for generation types)
            model: Name/identifier of the AI model used (for generation types)
            model_parameters: Parameters used for the model (for generation types)
            usage_details: Token usage information (for generation types)
            cost_details: Cost information (for generation types)
            prompt: Associated prompt template (for generation types)

        Returns:
            An observation object of the appropriate type that must be ended with .end()
        """
        if trace_context:
            trace_id = trace_context.get("trace_id", None)
            parent_span_id = trace_context.get("parent_span_id", None)

            if trace_id:
                remote_parent_span = self._create_remote_parent_span(
                    trace_id=trace_id, parent_span_id=parent_span_id
                )

                with otel_trace_api.use_span(
                    cast(otel_trace_api.Span, remote_parent_span)
                ):
                    otel_span = self._otel_tracer.start_span(name=name)
                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                    return self._create_observation_from_otel_span(
                        otel_span=otel_span,
                        as_type=as_type,
                        input=input,
                        output=output,
                        metadata=metadata,
                        version=version,
                        level=level,
                        status_message=status_message,
                        completion_start_time=completion_start_time,
                        model=model,
                        model_parameters=model_parameters,
                        usage_details=usage_details,
                        cost_details=cost_details,
                        prompt=prompt,
                    )

        otel_span = self._otel_tracer.start_span(name=name)

        return self._create_observation_from_otel_span(
            otel_span=otel_span,
            as_type=as_type,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
            completion_start_time=completion_start_time,
            model=model,
            model_parameters=model_parameters,
            usage_details=usage_details,
            cost_details=cost_details,
            prompt=prompt,
        )
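    # Usage sketch (hypothetical names such as `fetch_documents`): unlike the
    # context-manager variant, start_observation() does not end the observation
    # for you, so pair the returned object with try/finally:
    #
    #   retriever = langfuse.start_observation(name="fetch-docs", as_type="retriever")
    #   try:
    #       retriever.update(output=fetch_documents(query))
    #   finally:
    #       retriever.end()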
    def _create_observation_from_otel_span(
        self,
        *,
        otel_span: otel_trace_api.Span,
        as_type: ObservationTypeLiteralNoEvent,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> Union[
        LangfuseSpan,
        LangfuseGeneration,
        LangfuseAgent,
        LangfuseTool,
        LangfuseChain,
        LangfuseRetriever,
        LangfuseEvaluator,
        LangfuseEmbedding,
        LangfuseGuardrail,
    ]:
        """Create the appropriate observation type from an OTEL span."""
        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
            observation_class = self._get_span_class(as_type)
            # Type ignore avoids adding overloads to the internal _get_span_class
            # function; LangfuseEvent could be returned and the classes take different args.
            return observation_class(  # type: ignore[return-value,call-arg]
                otel_span=otel_span,
                langfuse_client=self,
                environment=self._environment,
                release=self._release,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
                completion_start_time=completion_start_time,
                model=model,
                model_parameters=model_parameters,
                usage_details=usage_details,
                cost_details=cost_details,
                prompt=prompt,
            )
        else:
            # For other types (e.g. span, guardrail), create the appropriate class
            # without generation properties.
            observation_class = self._get_span_class(as_type)
            # Type ignore avoids adding overloads to the internal _get_span_class
            # function; LangfuseEvent could be returned and the classes take different args.
            return observation_class(  # type: ignore[return-value,call-arg]
                otel_span=otel_span,
                langfuse_client=self,
                environment=self._environment,
                release=self._release,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
            )

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGeneration]: ...
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseSpan]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseAgent]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["tool"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseTool]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseChain]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseRetriever]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEvaluator]: ...
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEmbedding]: ...

    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGuardrail]: ...

    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: ObservationTypeLiteralNoEvent = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> Union[
        _AgnosticContextManager[LangfuseGeneration],
        _AgnosticContextManager[LangfuseSpan],
        _AgnosticContextManager[LangfuseAgent],
        _AgnosticContextManager[LangfuseTool],
        _AgnosticContextManager[LangfuseChain],
        _AgnosticContextManager[LangfuseRetriever],
        _AgnosticContextManager[LangfuseEvaluator],
        _AgnosticContextManager[LangfuseEmbedding],
        _AgnosticContextManager[LangfuseGuardrail],
    ]:
        """Create a new observation and set it as the current span in a context manager.

        This method creates a new observation of the specified type and sets it as the
        current span within a context manager. Use this method with a 'with' statement to
        automatically handle the observation lifecycle within a code block.

        The created observation will be the child of the current span in the context.
        Args:
            trace_context: Optional context for connecting to an existing trace
            name: Name of the observation (e.g., function or operation name)
            as_type: Type of observation to create (defaults to "span")
            input: Input data for the operation (can be any JSON-serializable object)
            output: Output data from the operation (can be any JSON-serializable object)
            metadata: Additional metadata to associate with the observation
            version: Version identifier for the code or component
            level: Importance level of the observation (info, warning, error)
            status_message: Optional status message for the observation
            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.

        The following parameters are available when as_type is "generation" or "embedding":
            completion_start_time: When the model started generating the response
            model: Name/identifier of the AI model used (e.g., "gpt-4")
            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
            cost_details: Cost information for the model call
            prompt: Associated prompt template from Langfuse prompt management

        Returns:
            A context manager that yields the appropriate observation type based on as_type

        Example:
            ```python
            # Create a span
            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
                # Do work
                result = process_data()
                span.update(output=result)

                # Create a child span automatically
                with span.start_as_current_observation(name="sub-operation") as child_span:
                    # Do sub-operation work
                    child_span.update(output="sub-result")

            # Create a tool observation
            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
                # Do tool work
                results = search_web(query)
                tool.update(output=results)

            # Create a generation observation
            with langfuse.start_as_current_observation(
                name="answer-generation",
                as_type="generation",
                model="gpt-4"
            ) as generation:
                # Generate answer
                response = llm.generate(...)
                generation.update(output=response)
            ```
        """
        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
            if trace_context:
                trace_id = trace_context.get("trace_id", None)
                parent_span_id = trace_context.get("parent_span_id", None)

                if trace_id:
                    remote_parent_span = self._create_remote_parent_span(
                        trace_id=trace_id, parent_span_id=parent_span_id
                    )

                    return cast(
                        Union[
                            _AgnosticContextManager[LangfuseGeneration],
                            _AgnosticContextManager[LangfuseEmbedding],
                        ],
                        self._create_span_with_parent_context(
                            as_type=as_type,
                            name=name,
                            remote_parent_span=remote_parent_span,
                            parent=None,
                            end_on_exit=end_on_exit,
                            input=input,
                            output=output,
                            metadata=metadata,
                            version=version,
                            level=level,
                            status_message=status_message,
                            completion_start_time=completion_start_time,
                            model=model,
                            model_parameters=model_parameters,
                            usage_details=usage_details,
                            cost_details=cost_details,
                            prompt=prompt,
                        ),
                    )

            return cast(
                Union[
                    _AgnosticContextManager[LangfuseGeneration],
                    _AgnosticContextManager[LangfuseEmbedding],
                ],
                self._start_as_current_otel_span_with_processed_media(
                    as_type=as_type,
                    name=name,
                    end_on_exit=end_on_exit,
                    input=input,
                    output=output,
                    metadata=metadata,
                    version=version,
                    level=level,
                    status_message=status_message,
                    completion_start_time=completion_start_time,
                    model=model,
                    model_parameters=model_parameters,
                    usage_details=usage_details,
                    cost_details=cost_details,
                    prompt=prompt,
                ),
            )

        if as_type in get_observation_types_list(ObservationTypeSpanLike):
            if trace_context:
                trace_id = trace_context.get("trace_id", None)
                parent_span_id = trace_context.get("parent_span_id", None)

                if trace_id:
                    remote_parent_span = self._create_remote_parent_span(
                        trace_id=trace_id, parent_span_id=parent_span_id
                    )

                    return cast(
                        Union[
                            _AgnosticContextManager[LangfuseSpan],
                            _AgnosticContextManager[LangfuseAgent],
                            _AgnosticContextManager[LangfuseTool],
                            _AgnosticContextManager[LangfuseChain],
                            _AgnosticContextManager[LangfuseRetriever],
                            _AgnosticContextManager[LangfuseEvaluator],
                            _AgnosticContextManager[LangfuseGuardrail],
                        ],
                        self._create_span_with_parent_context(
                            as_type=as_type,
                            name=name,
                            remote_parent_span=remote_parent_span,
                            parent=None,
                            end_on_exit=end_on_exit,
                            input=input,
                            output=output,
                            metadata=metadata,
                            version=version,
                            level=level,
                            status_message=status_message,
                        ),
                    )

            return cast(
                Union[
                    _AgnosticContextManager[LangfuseSpan],
                    _AgnosticContextManager[LangfuseAgent],
                    _AgnosticContextManager[LangfuseTool],
                    _AgnosticContextManager[LangfuseChain],
                    _AgnosticContextManager[LangfuseRetriever],
                    _AgnosticContextManager[LangfuseEvaluator],
                    _AgnosticContextManager[LangfuseGuardrail],
                ],
                self._start_as_current_otel_span_with_processed_media(
                    as_type=as_type,
                    name=name,
                    end_on_exit=end_on_exit,
                    input=input,
                    output=output,
                    metadata=metadata,
                    version=version,
                    level=level,
                    status_message=status_message,
                ),
            )

        # This should never be reached since all valid types are handled above
        langfuse_logger.warning(
            f"Unknown observation type: {as_type}, falling back to span"
        )
        return self._start_as_current_otel_span_with_processed_media(
            as_type="span",
            name=name,
            end_on_exit=end_on_exit,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
        )
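    # Usage sketch (assumes `existing_trace_id` is a 32-char lowercase hex ID,
    # e.g. obtained from Langfuse.create_trace_id()): passing trace_context
    # attaches the observation to an existing trace instead of starting a new one.
    #
    #   with langfuse.start_as_current_observation(
    #       name="follow-up-step",
    #       trace_context={"trace_id": existing_trace_id},
    #   ) as span:
    #       span.update(output="joined an existing trace")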
    def _get_span_class(
        self,
        as_type: ObservationTypeLiteral,
    ) -> Union[
        Type[LangfuseAgent],
        Type[LangfuseTool],
        Type[LangfuseChain],
        Type[LangfuseRetriever],
        Type[LangfuseEvaluator],
        Type[LangfuseEmbedding],
        Type[LangfuseGuardrail],
        Type[LangfuseGeneration],
        Type[LangfuseEvent],
        Type[LangfuseSpan],
    ]:
        """Get the appropriate span class based on as_type."""
        normalized_type = as_type.lower()

        if normalized_type == "agent":
            return LangfuseAgent
        elif normalized_type == "tool":
            return LangfuseTool
        elif normalized_type == "chain":
            return LangfuseChain
        elif normalized_type == "retriever":
            return LangfuseRetriever
        elif normalized_type == "evaluator":
            return LangfuseEvaluator
        elif normalized_type == "embedding":
            return LangfuseEmbedding
        elif normalized_type == "guardrail":
            return LangfuseGuardrail
        elif normalized_type == "generation":
            return LangfuseGeneration
        elif normalized_type == "event":
            return LangfuseEvent
        elif normalized_type == "span":
            return LangfuseSpan
        else:
            return LangfuseSpan

    @_agnosticcontextmanager
    def _create_span_with_parent_context(
        self,
        *,
        name: str,
        parent: Optional[otel_trace_api.Span] = None,
        remote_parent_span: Optional[otel_trace_api.Span] = None,
        as_type: ObservationTypeLiteralNoEvent,
        end_on_exit: Optional[bool] = None,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> Any:
        parent_span = parent or cast(otel_trace_api.Span, remote_parent_span)

        with otel_trace_api.use_span(parent_span):
            with self._start_as_current_otel_span_with_processed_media(
                name=name,
                as_type=as_type,
                end_on_exit=end_on_exit,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
                completion_start_time=completion_start_time,
                model=model,
                model_parameters=model_parameters,
                usage_details=usage_details,
                cost_details=cost_details,
                prompt=prompt,
            ) as langfuse_span:
                if remote_parent_span is not None:
                    langfuse_span._otel_span.set_attribute(
                        LangfuseOtelSpanAttributes.AS_ROOT, True
                    )

                yield langfuse_span

    @_agnosticcontextmanager
    def _start_as_current_otel_span_with_processed_media(
        self,
        *,
        name: str,
        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
        end_on_exit: Optional[bool] = None,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> Any:
        with self._otel_tracer.start_as_current_span(
            name=name,
            end_on_exit=end_on_exit if end_on_exit is not None else True,
        ) as otel_span:
            span_class = self._get_span_class(
                as_type or "generation"
            )  # default was "generation"
            common_args = {
                "otel_span": otel_span,
                "langfuse_client": self,
                "environment": self._environment,
                "release": self._release,
                "input": input,
                "output": output,
                "metadata": metadata,
                "version": version,
                "level": level,
                "status_message": status_message,
            }

            if span_class in [
                LangfuseGeneration,
                LangfuseEmbedding,
            ]:
                common_args.update(
                    {
                        "completion_start_time": completion_start_time,
                        "model": model,
                        "model_parameters": model_parameters,
                        "usage_details": usage_details,
                        "cost_details": cost_details,
                        "prompt": prompt,
                    }
                )
            # For span-like types (span, agent, tool, chain, retriever, evaluator,
            # guardrail), no generation properties are needed.

            yield span_class(**common_args)  # type: ignore[arg-type]

    def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]:
        current_span = otel_trace_api.get_current_span()

        if current_span is otel_trace_api.INVALID_SPAN:
            langfuse_logger.warning(
                "Context error: No active span in current context. Operations that depend on an active span will be skipped. "
                "Ensure spans are created with start_as_current_observation() or that you're operating within an active span context."
            )
            return None

        return current_span
    def update_current_generation(
        self,
        *,
        name: Optional[str] = None,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> None:
        """Update the current active generation span with new information.

        This method updates the current generation span in the active context with
        additional information. It's useful for adding output, usage stats, or other
        details that become available during or after model generation.

        Args:
            name: The generation name
            input: Updated input data for the model
            output: Output from the model (e.g., completions)
            metadata: Additional metadata to associate with the generation
            version: Version identifier for the model or component
            level: Importance level of the generation (info, warning, error)
            status_message: Optional status message for the generation
            completion_start_time: When the model started generating the response
            model: Name/identifier of the AI model used (e.g., "gpt-4")
            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
            cost_details: Cost information for the model call
            prompt: Associated prompt template from Langfuse prompt management

        Example:
            ```python
            with langfuse.start_as_current_generation(name="answer-query") as generation:
                # Initial setup and API call
                response = llm.generate(...)

                # Update with results that weren't available at creation time
                langfuse.update_current_generation(
                    output=response.text,
                    usage_details={
                        "prompt_tokens": response.usage.prompt_tokens,
                        "completion_tokens": response.usage.completion_tokens
                    }
                )
            ```
        """
        if not self._tracing_enabled:
            langfuse_logger.debug(
                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
            )
            return

        current_otel_span = self._get_current_otel_span()

        if current_otel_span is not None:
            generation = LangfuseGeneration(
                otel_span=current_otel_span, langfuse_client=self
            )

            if name:
                current_otel_span.update_name(name)

            generation.update(
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
                completion_start_time=completion_start_time,
                model=model,
                model_parameters=model_parameters,
                usage_details=usage_details,
                cost_details=cost_details,
                prompt=prompt,
            )
    def update_current_span(
        self,
        *,
        name: Optional[str] = None,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> None:
        """Update the current active span with new information.

        This method updates the current span in the active context with
        additional information. It's useful for adding outputs or metadata
        that become available during execution.

        Args:
            name: The span name
            input: Updated input data for the operation
            output: Output data from the operation
            metadata: Additional metadata to associate with the span
            version: Version identifier for the code or component
            level: Importance level of the span (info, warning, error)
            status_message: Optional status message for the span

        Example:
            ```python
            with langfuse.start_as_current_observation(name="process-data") as span:
                # Initial processing
                result = process_first_part()

                # Update with intermediate results
                langfuse.update_current_span(metadata={"intermediate_result": result})

                # Continue processing
                final_result = process_second_part(result)

                # Final update
                langfuse.update_current_span(output=final_result)
            ```
        """
        if not self._tracing_enabled:
            langfuse_logger.debug(
                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
            )
            return

        current_otel_span = self._get_current_otel_span()

        if current_otel_span is not None:
            span = LangfuseSpan(
                otel_span=current_otel_span,
                langfuse_client=self,
                environment=self._environment,
                release=self._release,
            )

            if name:
                current_otel_span.update_name(name)

            span.update(
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
            )

    @deprecated(
        "Trace-level input/output is deprecated. "
        "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
        "This method will be removed in a future major version."
    )
    def set_current_trace_io(
        self,
        *,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
    ) -> None:
        """Set trace-level input and output for the current span's trace.

        .. deprecated::
            This is a legacy method for backward compatibility with Langfuse platform
            features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
            evaluators). It will be removed in a future major version.

            For setting other trace attributes (user_id, session_id, metadata, tags, version),
            use :meth:`propagate_attributes` instead.

        Args:
            input: Input data to associate with the trace.
            output: Output data to associate with the trace.
        """
        if not self._tracing_enabled:
            langfuse_logger.debug(
                "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
            )
            return

        current_otel_span = self._get_current_otel_span()

        if current_otel_span is not None and current_otel_span.is_recording():
            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
            )
            # We need to preserve the class to keep the correct observation type
            span_class = self._get_span_class(existing_observation_type)
            span = span_class(
                otel_span=current_otel_span,
                langfuse_client=self,
                environment=self._environment,
                release=self._release,
            )

            span.set_trace_io(
                input=input,
                output=output,
            )
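    # Usage sketch (hedged: shown only for platform features that still read
    # trace-level I/O, e.g. legacy LLM-as-a-judge evaluators; `run_pipeline`
    # is a hypothetical helper):
    #
    #   with langfuse.start_as_current_observation(name="handle-request"):
    #       answer = run_pipeline(question)
    #       langfuse.set_current_trace_io(input=question, output=answer)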
    def set_current_trace_as_public(self) -> None:
        """Make the current trace publicly accessible via its URL.

        When a trace is published, anyone with the trace link can view the full trace
        without needing to be logged in to Langfuse. This action cannot be undone
        programmatically - once published, the entire trace becomes public.

        This is a convenience method that publishes the trace from the currently
        active span context. Use this when you want to make a trace public from
        within a traced function without needing direct access to the span object.
        """
        if not self._tracing_enabled:
            langfuse_logger.debug(
                "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
            )
            return

        current_otel_span = self._get_current_otel_span()

        if current_otel_span is not None and current_otel_span.is_recording():
            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
            )
            # We need to preserve the class to keep the correct observation type
            span_class = self._get_span_class(existing_observation_type)
            span = span_class(
                otel_span=current_otel_span,
                langfuse_client=self,
                environment=self._environment,
            )

            span.set_trace_as_public()
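    # Usage sketch: publishing requires an active span context, since the
    # trace is resolved from the current span. Note this cannot be undone
    # programmatically.
    #
    #   with langfuse.start_as_current_observation(name="demo-run"):
    #       langfuse.set_current_trace_as_public()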
    def create_event(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseEvent:
        """Create a new Langfuse observation of type 'EVENT'.

        The created Langfuse Event observation will be the child of the current span in the context.

        Args:
            trace_context: Optional context for connecting to an existing trace
            name: Name of the event (e.g., function or operation name)
            input: Input data for the operation (can be any JSON-serializable object)
            output: Output data from the operation (can be any JSON-serializable object)
            metadata: Additional metadata to associate with the event
            version: Version identifier for the code or component
            level: Importance level of the event (info, warning, error)
            status_message: Optional status message for the event

        Returns:
            The Langfuse Event object

        Example:
            ```python
            event = langfuse.create_event(name="process-event")
            ```
        """
        timestamp = time_ns()

        if trace_context:
            trace_id = trace_context.get("trace_id", None)
            parent_span_id = trace_context.get("parent_span_id", None)

            if trace_id:
                remote_parent_span = self._create_remote_parent_span(
                    trace_id=trace_id, parent_span_id=parent_span_id
                )

                with otel_trace_api.use_span(
                    cast(otel_trace_api.Span, remote_parent_span)
                ):
                    otel_span = self._otel_tracer.start_span(
                        name=name, start_time=timestamp
                    )
                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                    return cast(
                        LangfuseEvent,
                        LangfuseEvent(
                            otel_span=otel_span,
                            langfuse_client=self,
                            environment=self._environment,
                            release=self._release,
                            input=input,
                            output=output,
                            metadata=metadata,
                            version=version,
                            level=level,
                            status_message=status_message,
                        ).end(end_time=timestamp),
                    )

        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)

        return cast(
            LangfuseEvent,
            LangfuseEvent(
                otel_span=otel_span,
                langfuse_client=self,
                environment=self._environment,
                release=self._release,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
            ).end(end_time=timestamp),
        )
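    # Usage sketch (hypothetical payload): events are point-in-time
    # observations; they are ended at their start timestamp, so no .end()
    # call is needed.
    #
    #   langfuse.create_event(
    #       name="cache-miss",
    #       input={"key": "user:42"},
    #       level="WARNING",
    #   )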
    def _create_remote_parent_span(
        self, *, trace_id: str, parent_span_id: Optional[str]
    ) -> Any:
        if not self._is_valid_trace_id(trace_id):
            langfuse_logger.warning(
                f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID."
            )

        if parent_span_id and not self._is_valid_span_id(parent_span_id):
            langfuse_logger.warning(
                f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID."
            )

        int_trace_id = int(trace_id, 16)
        int_parent_span_id = (
            int(parent_span_id, 16)
            if parent_span_id
            else RandomIdGenerator().generate_span_id()
        )

        span_context = otel_trace_api.SpanContext(
            trace_id=int_trace_id,
            span_id=int_parent_span_id,
            trace_flags=otel_trace_api.TraceFlags(0x01),  # mark span as sampled
            is_remote=False,
        )

        return otel_trace_api.NonRecordingSpan(span_context)

    def _is_valid_trace_id(self, trace_id: str) -> bool:
        pattern = r"^[0-9a-f]{32}$"

        return bool(re.match(pattern, trace_id))

    def _is_valid_span_id(self, span_id: str) -> bool:
        pattern = r"^[0-9a-f]{16}$"

        return bool(re.match(pattern, span_id))

    def _create_observation_id(self, *, seed: Optional[str] = None) -> str:
        """Create a unique observation ID for use with Langfuse.

        This method generates a unique observation ID (span ID in OpenTelemetry terms)
        for use with various Langfuse APIs. It can either generate a random ID or
        create a deterministic ID based on a seed string.

        Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes.
        This method ensures the generated ID meets this requirement. If you need to
        correlate an external ID with a Langfuse observation ID, use the external ID as
        the seed to get a valid, deterministic observation ID.

        Args:
            seed: Optional string to use as a seed for deterministic ID generation.
                If provided, the same seed will always produce the same ID.
                If not provided, a random ID will be generated.

        Returns:
            A 16-character lowercase hexadecimal string representing the observation ID.

        Example:
            ```python
            # Generate a random observation ID
            obs_id = langfuse.create_observation_id()

            # Generate a deterministic ID based on a seed
            user_obs_id = langfuse.create_observation_id(seed="user-123-feedback")

            # Correlate an external item ID with a Langfuse observation ID
            item_id = "item-789012"
            correlated_obs_id = langfuse.create_observation_id(seed=item_id)

            # Use the ID with Langfuse APIs
            langfuse.create_score(
                name="relevance",
                value=0.95,
                trace_id=trace_id,
                observation_id=obs_id
            )
            ```
        """
        if not seed:
            span_id_int = RandomIdGenerator().generate_span_id()

            return self._format_otel_span_id(span_id_int)

        return sha256(seed.encode("utf-8")).digest()[:8].hex()
    @staticmethod
    def create_trace_id(*, seed: Optional[str] = None) -> str:
        """Create a unique trace ID for use with Langfuse.

        This method generates a unique trace ID for use with various Langfuse APIs.
        It can either generate a random ID or create a deterministic ID based on
        a seed string.

        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
        This method ensures the generated ID meets this requirement. If you need to
        correlate an external ID with a Langfuse trace ID, use the external ID as the
        seed to get a valid, deterministic Langfuse trace ID.

        Args:
            seed: Optional string to use as a seed for deterministic ID generation.
                If provided, the same seed will always produce the same ID.
                If not provided, a random ID will be generated.

        Returns:
            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

        Example:
            ```python
            # Generate a random trace ID
            trace_id = langfuse.create_trace_id()

            # Generate a deterministic ID based on a seed
            session_trace_id = langfuse.create_trace_id(seed="session-456")

            # Correlate an external ID with a Langfuse trace ID
            external_id = "external-system-123456"
            correlated_trace_id = langfuse.create_trace_id(seed=external_id)

            # Use the ID with trace context
            with langfuse.start_as_current_observation(
                name="process-request",
                trace_context={"trace_id": trace_id}
            ) as span:
                # Operation will be part of the specific trace
                pass
            ```
        """
        if not seed:
            trace_id_int = RandomIdGenerator().generate_trace_id()

            return Langfuse._format_otel_trace_id(trace_id_int)

        return sha256(seed.encode("utf-8")).digest()[:16].hex()

    def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str:
        span_context = otel_span.get_span_context()

        return self._format_otel_trace_id(span_context.trace_id)

    def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str:
        span_context = otel_span.get_span_context()

        return self._format_otel_span_id(span_context.span_id)

    @staticmethod
    def _format_otel_span_id(span_id_int: int) -> str:
        """Format an integer span ID to a 16-character lowercase hex string.

        Internal method to convert an OpenTelemetry integer span ID to the standard
        W3C Trace Context format (16-character lowercase hex string).

        Args:
            span_id_int: 64-bit integer representing a span ID

        Returns:
            A 16-character lowercase hexadecimal string
        """
        return format(span_id_int, "016x")

    @staticmethod
    def _format_otel_trace_id(trace_id_int: int) -> str:
        """Format an integer trace ID to a 32-character lowercase hex string.

        Internal method to convert an OpenTelemetry integer trace ID to the standard
        W3C Trace Context format (32-character lowercase hex string).

        Args:
            trace_id_int: 128-bit integer representing a trace ID

        Returns:
            A 32-character lowercase hexadecimal string
        """
        return format(trace_id_int, "032x")
1765 1766 def create_score( 1767 self, 1768 *, 1769 name: str, 1770 value: Union[float, str], 1771 session_id: Optional[str] = None, 1772 dataset_run_id: Optional[str] = None, 1773 trace_id: Optional[str] = None, 1774 observation_id: Optional[str] = None, 1775 score_id: Optional[str] = None, 1776 data_type: Optional[ScoreDataType] = None, 1777 comment: Optional[str] = None, 1778 config_id: Optional[str] = None, 1779 metadata: Optional[Any] = None, 1780 timestamp: Optional[datetime] = None, 1781 ) -> None: 1782 """Create a score for a specific trace or observation. 1783 1784 This method creates a score for evaluating a Langfuse trace or observation. Scores can be 1785 used to track quality metrics, user feedback, or automated evaluations. 1786 1787 Args: 1788 name: Name of the score (e.g., "relevance", "accuracy") 1789 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT) 1790 session_id: ID of the Langfuse session to associate the score with 1791 dataset_run_id: ID of the Langfuse dataset run to associate the score with 1792 trace_id: ID of the Langfuse trace to associate the score with 1793 observation_id: Optional ID of the specific observation to score. Trace ID must be provided too. 1794 score_id: Optional custom ID for the score (auto-generated if not provided) 1795 data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT) 1796 comment: Optional comment or explanation for the score 1797 config_id: Optional ID of a score config defined in Langfuse 1798 metadata: Optional metadata to be attached to the score 1799 timestamp: Optional timestamp for the score (defaults to current UTC time) 1800 1801 Example: 1802 ```python 1803 # Create a numeric score for accuracy 1804 langfuse.create_score( 1805 name="accuracy", 1806 value=0.92, 1807 trace_id="abcdef1234567890abcdef1234567890", 1808 data_type="NUMERIC", 1809 comment="High accuracy with minor irrelevant details" 1810 ) 1811 1812 # Create a categorical score for sentiment 1813 langfuse.create_score( 1814 name="sentiment", 1815 value="positive", 1816 trace_id="abcdef1234567890abcdef1234567890", 1817 observation_id="abcdef1234567890", 1818 data_type="CATEGORICAL" 1819 ) 1820 ``` 1821 """ 1822 if not self._tracing_enabled: 1823 return 1824 1825 score_id = score_id or self._create_observation_id() 1826 1827 try: 1828 new_body = ScoreBody( 1829 id=score_id, 1830 sessionId=session_id, 1831 datasetRunId=dataset_run_id, 1832 traceId=trace_id, 1833 observationId=observation_id, 1834 name=name, 1835 value=value, 1836 dataType=data_type, # type: ignore 1837 comment=comment, 1838 configId=config_id, 1839 environment=self._environment, 1840 metadata=metadata, 1841 ) 1842 1843 event = { 1844 "id": self.create_trace_id(), 1845 "type": "score-create", 1846 "timestamp": timestamp or _get_timestamp(), 1847 "body": new_body, 1848 } 1849 1850 if self._resources is not None: 1851 # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar 1852 force_sample = ( 1853 not self._is_valid_trace_id(trace_id) if trace_id else True 1854 ) 1855 1856 self._resources.add_score_task( 1857 event, 1858 force_sample=force_sample, 1859 ) 1860 1861 except Exception as e: 1862 langfuse_logger.exception( 1863 f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. 
Error: {e}" 1864 ) 1865 1866 def _create_trace_tags_via_ingestion( 1867 self, 1868 *, 1869 trace_id: str, 1870 tags: List[str], 1871 ) -> None: 1872 """Private helper to enqueue trace tag updates via ingestion API events.""" 1873 if not self._tracing_enabled: 1874 return 1875 1876 if len(tags) == 0: 1877 return 1878 1879 try: 1880 new_body = TraceBody( 1881 id=trace_id, 1882 tags=tags, 1883 ) 1884 1885 event = { 1886 "id": self.create_trace_id(), 1887 "type": "trace-create", 1888 "timestamp": _get_timestamp(), 1889 "body": new_body, 1890 } 1891 1892 if self._resources is not None: 1893 self._resources.add_trace_task(event) 1894 except Exception as e: 1895 langfuse_logger.exception( 1896 f"Error updating trace tags: Failed to process trace update event for trace_id={trace_id}. Error: {e}" 1897 ) 1898 1899 @overload 1900 def score_current_span( 1901 self, 1902 *, 1903 name: str, 1904 value: float, 1905 score_id: Optional[str] = None, 1906 data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None, 1907 comment: Optional[str] = None, 1908 config_id: Optional[str] = None, 1909 metadata: Optional[Any] = None, 1910 ) -> None: ... 1911 1912 @overload 1913 def score_current_span( 1914 self, 1915 *, 1916 name: str, 1917 value: str, 1918 score_id: Optional[str] = None, 1919 data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL", 1920 comment: Optional[str] = None, 1921 config_id: Optional[str] = None, 1922 metadata: Optional[Any] = None, 1923 ) -> None: ... 1924 1925 def score_current_span( 1926 self, 1927 *, 1928 name: str, 1929 value: Union[float, str], 1930 score_id: Optional[str] = None, 1931 data_type: Optional[ScoreDataType] = None, 1932 comment: Optional[str] = None, 1933 config_id: Optional[str] = None, 1934 metadata: Optional[Any] = None, 1935 ) -> None: 1936 """Create a score for the current active span. 1937 1938 This method scores the currently active span in the context. It's a convenient 1939 way to score the current operation without needing to know its trace and span IDs. 1940 1941 Args: 1942 name: Name of the score (e.g., "relevance", "accuracy") 1943 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT) 1944 score_id: Optional custom ID for the score (auto-generated if not provided) 1945 data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT) 1946 comment: Optional comment or explanation for the score 1947 config_id: Optional ID of a score config defined in Langfuse 1948 metadata: Optional metadata to be attached to the score 1949 1950 Example: 1951 ```python 1952 with langfuse.start_as_current_generation(name="answer-query") as generation: 1953 # Generate answer 1954 response = generate_answer(...) 
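# (generate_answer is a placeholder for your own model call)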
1955 generation.update(output=response) 1956 1957 # Score the generation 1958 langfuse.score_current_span( 1959 name="relevance", 1960 value=0.85, 1961 data_type="NUMERIC", 1962 comment="Mostly relevant but contains some tangential information", 1963 metadata={"model": "gpt-4", "prompt_version": "v2"} 1964 ) 1965 ``` 1966 """ 1967 current_span = self._get_current_otel_span() 1968 1969 if current_span is not None: 1970 trace_id = self._get_otel_trace_id(current_span) 1971 observation_id = self._get_otel_span_id(current_span) 1972 1973 langfuse_logger.info( 1974 f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}" 1975 ) 1976 1977 self.create_score( 1978 trace_id=trace_id, 1979 observation_id=observation_id, 1980 name=name, 1981 value=cast(str, value), 1982 score_id=score_id, 1983 data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type), 1984 comment=comment, 1985 config_id=config_id, 1986 metadata=metadata, 1987 ) 1988 1989 @overload 1990 def score_current_trace( 1991 self, 1992 *, 1993 name: str, 1994 value: float, 1995 score_id: Optional[str] = None, 1996 data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None, 1997 comment: Optional[str] = None, 1998 config_id: Optional[str] = None, 1999 metadata: Optional[Any] = None, 2000 ) -> None: ... 2001 2002 @overload 2003 def score_current_trace( 2004 self, 2005 *, 2006 name: str, 2007 value: str, 2008 score_id: Optional[str] = None, 2009 data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL", 2010 comment: Optional[str] = None, 2011 config_id: Optional[str] = None, 2012 metadata: Optional[Any] = None, 2013 ) -> None: ... 2014 2015 def score_current_trace( 2016 self, 2017 *, 2018 name: str, 2019 value: Union[float, str], 2020 score_id: Optional[str] = None, 2021 data_type: Optional[ScoreDataType] = None, 2022 comment: Optional[str] = None, 2023 config_id: Optional[str] = None, 2024 metadata: Optional[Any] = None, 2025 ) -> None: 2026 """Create a score for the current trace. 2027 2028 This method scores the trace of the currently active span. Unlike score_current_span, 2029 this method associates the score with the entire trace rather than a specific span. 2030 It's useful for scoring overall performance or quality of the entire operation. 
2031 2032 Args: 2033 name: Name of the score (e.g., "user_satisfaction", "overall_quality") 2034 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT) 2035 score_id: Optional custom ID for the score (auto-generated if not provided) 2036 data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT) 2037 comment: Optional comment or explanation for the score 2038 config_id: Optional ID of a score config defined in Langfuse 2039 metadata: Optional metadata to be attached to the score 2040 2041 Example: 2042 ```python 2043 with langfuse.start_as_current_observation(name="process-user-request") as span: 2044 # Process request 2045 result = process_complete_request() 2046 span.update(output=result) 2047 2048 # Score the overall trace 2049 langfuse.score_current_trace( 2050 name="overall_quality", 2051 value=0.95, 2052 data_type="NUMERIC", 2053 comment="High quality end-to-end response", 2054 metadata={"evaluator": "gpt-4", "criteria": "comprehensive"} 2055 ) 2056 ``` 2057 """ 2058 current_span = self._get_current_otel_span() 2059 2060 if current_span is not None: 2061 trace_id = self._get_otel_trace_id(current_span) 2062 2063 langfuse_logger.info( 2064 f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}" 2065 ) 2066 2067 self.create_score( 2068 trace_id=trace_id, 2069 name=name, 2070 value=cast(str, value), 2071 score_id=score_id, 2072 data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type), 2073 comment=comment, 2074 config_id=config_id, 2075 metadata=metadata, 2076 ) 2077 2078 def flush(self) -> None: 2079 """Force flush all pending spans and events to the Langfuse API. 2080 2081 This method manually flushes any pending spans, scores, and other events to the 2082 Langfuse API. It's useful in scenarios where you want to ensure all data is sent 2083 before proceeding, without waiting for the automatic flush interval. 2084 2085 Example: 2086 ```python 2087 # Record some spans and scores 2088 with langfuse.start_as_current_observation(name="operation") as span: 2089 # Do work... 2090 pass 2091 2092 # Ensure all data is sent to Langfuse before proceeding 2093 langfuse.flush() 2094 2095 # Continue with other work 2096 ``` 2097 """ 2098 if self._resources is not None: 2099 self._resources.flush() 2100 2101 def shutdown(self) -> None: 2102 """Shut down the Langfuse client and flush all pending data. 2103 2104 This method cleanly shuts down the Langfuse client, ensuring all pending data 2105 is flushed to the API and all background threads are properly terminated. 2106 2107 It's important to call this method when your application is shutting down to 2108 prevent data loss and resource leaks. For most applications, using the client 2109 as a context manager or relying on the automatic shutdown via atexit is sufficient. 2110 2111 Example: 2112 ```python 2113 # Initialize Langfuse 2114 langfuse = Langfuse(public_key="...", secret_key="...") 2115 2116 # Use Langfuse throughout your application 2117 # ... 2118 2119 # When application is shutting down 2120 langfuse.shutdown() 2121 ``` 2122 """ 2123 if self._resources is not None: 2124 self._resources.shutdown() 2125 2126 def get_current_trace_id(self) -> Optional[str]: 2127 """Get the trace ID of the current active span. 2128 2129 This method retrieves the trace ID from the currently active span in the context. 2130 It can be used to get the trace ID for referencing in logs, external systems, 2131 or for creating related operations. 
2132 2133 Returns: 2134 The current trace ID as a 32-character lowercase hexadecimal string, 2135 or None if there is no active span. 2136 2137 Example: 2138 ```python 2139 with langfuse.start_as_current_observation(name="process-request") as span: 2140 # Get the current trace ID for reference 2141 trace_id = langfuse.get_current_trace_id() 2142 2143 # Use it for external correlation 2144 log.info(f"Processing request with trace_id: {trace_id}") 2145 2146 # Or pass to another system 2147 external_system.process(data, trace_id=trace_id) 2148 ``` 2149 """ 2150 if not self._tracing_enabled: 2151 langfuse_logger.debug( 2152 "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode." 2153 ) 2154 return None 2155 2156 current_otel_span = self._get_current_otel_span() 2157 2158 return self._get_otel_trace_id(current_otel_span) if current_otel_span else None 2159 2160 def get_current_observation_id(self) -> Optional[str]: 2161 """Get the observation ID (span ID) of the current active span. 2162 2163 This method retrieves the observation ID from the currently active span in the context. 2164 It can be used to get the observation ID for referencing in logs, external systems, 2165 or for creating scores or other related operations. 2166 2167 Returns: 2168 The current observation ID as a 16-character lowercase hexadecimal string, 2169 or None if there is no active span. 2170 2171 Example: 2172 ```python 2173 with langfuse.start_as_current_observation(name="process-user-query") as span: 2174 # Get the current observation ID 2175 observation_id = langfuse.get_current_observation_id() 2176 2177 # Store it for later reference 2178 cache.set(f"query_{query_id}_observation", observation_id) 2179 2180 # Process the query... 2181 ``` 2182 """ 2183 if not self._tracing_enabled: 2184 langfuse_logger.debug( 2185 "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode." 2186 ) 2187 return None 2188 2189 current_otel_span = self._get_current_otel_span() 2190 2191 return self._get_otel_span_id(current_otel_span) if current_otel_span else None 2192 2193 def _get_project_id(self) -> Optional[str]: 2194 """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for api keys.""" 2195 if not self._project_id: 2196 proj = self.api.projects.get() 2197 if not proj.data or not proj.data[0].id: 2198 return None 2199 2200 self._project_id = proj.data[0].id 2201 2202 return self._project_id 2203 2204 def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]: 2205 """Get the URL to view a trace in the Langfuse UI. 2206 2207 This method generates a URL that links directly to a trace in the Langfuse UI. 2208 It's useful for providing links in logs, notifications, or debugging tools. 2209 2210 Args: 2211 trace_id: Optional trace ID to generate a URL for. If not provided, 2212 the trace ID of the current active span will be used. 2213 2214 Returns: 2215 A URL string pointing to the trace in the Langfuse UI, 2216 or None if the project ID couldn't be retrieved or no trace ID is available. 
2217 2218 Example: 2219 ```python 2220 # Get URL for the current trace 2221 with langfuse.start_as_current_observation(name="process-request") as span: 2222 trace_url = langfuse.get_trace_url() 2223 log.info(f"Processing trace: {trace_url}") 2224 2225 # Get URL for a specific trace 2226 specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef") 2227 send_notification(f"Review needed for trace: {specific_trace_url}") 2228 ``` 2229 """ 2230 final_trace_id = trace_id or self.get_current_trace_id() 2231 if not final_trace_id: 2232 return None 2233 2234 project_id = self._get_project_id() 2235 2236 return ( 2237 f"{self._base_url}/project/{project_id}/traces/{final_trace_id}" 2238 if project_id and final_trace_id 2239 else None 2240 ) 2241 2242 def get_dataset( 2243 self, 2244 name: str, 2245 *, 2246 fetch_items_page_size: Optional[int] = 50, 2247 version: Optional[datetime] = None, 2248 ) -> "DatasetClient": 2249 """Fetch a dataset by its name. 2250 2251 Args: 2252 name (str): The name of the dataset to fetch. 2253 fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50. 2254 version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC). 2255 If provided, returns the state of items at the specified UTC timestamp. 2256 If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC. 2257 2258 Returns: 2259 DatasetClient: The dataset with the given name. 2260 """ 2261 try: 2262 langfuse_logger.debug(f"Getting datasets {name}") 2263 dataset = self.api.datasets.get(dataset_name=self._url_encode(name)) 2264 2265 dataset_items = [] 2266 page = 1 2267 2268 while True: 2269 new_items = self.api.dataset_items.list( 2270 dataset_name=self._url_encode(name, is_url_param=True), 2271 page=page, 2272 limit=fetch_items_page_size, 2273 version=version, 2274 ) 2275 dataset_items.extend(new_items.data) 2276 2277 if new_items.meta.total_pages <= page: 2278 break 2279 2280 page += 1 2281 2282 return DatasetClient( 2283 dataset=dataset, 2284 items=dataset_items, 2285 version=version, 2286 langfuse_client=self, 2287 ) 2288 2289 except Error as e: 2290 handle_fern_exception(e) 2291 raise e 2292 2293 def get_dataset_run( 2294 self, *, dataset_name: str, run_name: str 2295 ) -> DatasetRunWithItems: 2296 """Fetch a dataset run by dataset name and run name. 2297 2298 Args: 2299 dataset_name (str): The name of the dataset. 2300 run_name (str): The name of the run. 2301 2302 Returns: 2303 DatasetRunWithItems: The dataset run with its items. 2304 """ 2305 try: 2306 return cast( 2307 DatasetRunWithItems, 2308 self.api.datasets.get_run( 2309 dataset_name=self._url_encode(dataset_name), 2310 run_name=self._url_encode(run_name), 2311 request_options=None, 2312 ), 2313 ) 2314 except Error as e: 2315 handle_fern_exception(e) 2316 raise e 2317 2318 def get_dataset_runs( 2319 self, 2320 *, 2321 dataset_name: str, 2322 page: Optional[int] = None, 2323 limit: Optional[int] = None, 2324 ) -> PaginatedDatasetRuns: 2325 """Fetch all runs for a dataset. 2326 2327 Args: 2328 dataset_name (str): The name of the dataset. 2329 page (Optional[int]): Page number, starts at 1. 2330 limit (Optional[int]): Limit of items per page. 2331 2332 Returns: 2333 PaginatedDatasetRuns: Paginated list of dataset runs. 
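Example:
    A minimal sketch (illustrative; assumes a dataset named "my-eval-dataset" exists):

    ```python
    runs = langfuse.get_dataset_runs(dataset_name="my-eval-dataset", page=1, limit=10)

    for run in runs.data:
        print(run.name)
    ```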
2334 """ 2335 try: 2336 return cast( 2337 PaginatedDatasetRuns, 2338 self.api.datasets.get_runs( 2339 dataset_name=self._url_encode(dataset_name), 2340 page=page, 2341 limit=limit, 2342 request_options=None, 2343 ), 2344 ) 2345 except Error as e: 2346 handle_fern_exception(e) 2347 raise e 2348 2349 def delete_dataset_run( 2350 self, *, dataset_name: str, run_name: str 2351 ) -> DeleteDatasetRunResponse: 2352 """Delete a dataset run and all its run items. This action is irreversible. 2353 2354 Args: 2355 dataset_name (str): The name of the dataset. 2356 run_name (str): The name of the run. 2357 2358 Returns: 2359 DeleteDatasetRunResponse: Confirmation of deletion. 2360 """ 2361 try: 2362 return cast( 2363 DeleteDatasetRunResponse, 2364 self.api.datasets.delete_run( 2365 dataset_name=self._url_encode(dataset_name), 2366 run_name=self._url_encode(run_name), 2367 request_options=None, 2368 ), 2369 ) 2370 except Error as e: 2371 handle_fern_exception(e) 2372 raise e 2373 2374 def run_experiment( 2375 self, 2376 *, 2377 name: str, 2378 run_name: Optional[str] = None, 2379 description: Optional[str] = None, 2380 data: ExperimentData, 2381 task: TaskFunction, 2382 evaluators: List[EvaluatorFunction] = [], 2383 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 2384 run_evaluators: List[RunEvaluatorFunction] = [], 2385 max_concurrency: int = 50, 2386 metadata: Optional[Dict[str, str]] = None, 2387 _dataset_version: Optional[datetime] = None, 2388 ) -> ExperimentResult: 2389 """Run an experiment on a dataset with automatic tracing and evaluation. 2390 2391 This method executes a task function on each item in the provided dataset, 2392 automatically traces all executions with Langfuse for observability, runs 2393 item-level and run-level evaluators on the outputs, and returns comprehensive 2394 results with evaluation metrics. 2395 2396 The experiment system provides: 2397 - Automatic tracing of all task executions 2398 - Concurrent processing with configurable limits 2399 - Comprehensive error handling that isolates failures 2400 - Integration with Langfuse datasets for experiment tracking 2401 - Flexible evaluation framework supporting both sync and async evaluators 2402 2403 Args: 2404 name: Human-readable name for the experiment. Used for identification 2405 in the Langfuse UI. 2406 run_name: Optional exact name for the experiment run. If provided, this will be 2407 used as the exact dataset run name if the `data` contains Langfuse dataset items. 2408 If not provided, this will default to the experiment name appended with an ISO timestamp. 2409 description: Optional description explaining the experiment's purpose, 2410 methodology, or expected outcomes. 2411 data: Array of data items to process. Can be either: 2412 - List of dict-like items with 'input', 'expected_output', 'metadata' keys 2413 - List of Langfuse DatasetItem objects from dataset.items 2414 task: Function that processes each data item and returns output. 2415 Must accept 'item' as keyword argument and can return sync or async results. 2416 The task function signature should be: task(*, item, **kwargs) -> Any 2417 evaluators: List of functions to evaluate each item's output individually. 2418 Each evaluator receives input, output, expected_output, and metadata. 2419 Can return single Evaluation dict or list of Evaluation dicts. 2420 composite_evaluator: Optional function that creates composite scores from item-level evaluations. 
2421 Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) 2422 plus the list of evaluations from item-level evaluators. Useful for weighted averages, 2423 pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics. 2424 run_evaluators: List of functions to evaluate the entire experiment run. 2425 Each run evaluator receives all item_results and can compute aggregate metrics. 2426 Useful for calculating averages, distributions, or cross-item comparisons. 2427 max_concurrency: Maximum number of concurrent task executions (default: 50). 2428 Controls the number of items processed simultaneously. Adjust based on 2429 API rate limits and system resources. 2430 metadata: Optional metadata dictionary to attach to all experiment traces. 2431 This metadata will be included in every trace created during the experiment. 2432 If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too. 2433 2434 Returns: 2435 ExperimentResult containing: 2436 - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset. 2437 - item_results: List of results for each processed item with outputs and evaluations 2438 - run_evaluations: List of aggregate evaluation results for the entire run 2439 - experiment_id: Stable identifier for the experiment run across all items 2440 - dataset_run_id: ID of the dataset run (if using Langfuse datasets) 2441 - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable) 2442 2443 Raises: 2444 ValueError: If required parameters are missing or invalid 2445 Exception: If experiment setup fails (individual item failures are handled gracefully) 2446 2447 Examples: 2448 Basic experiment with local data: 2449 ```python 2450 def summarize_text(*, item, **kwargs): 2451 return f"Summary: {item['input'][:50]}..." 
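# Tasks receive each data item via the keyword-only "item" argument.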
2452 2453 def length_evaluator(*, input, output, expected_output=None, **kwargs): 2454 return { 2455 "name": "output_length", 2456 "value": len(output), 2457 "comment": f"Output contains {len(output)} characters" 2458 } 2459 2460 result = langfuse.run_experiment( 2461 name="Text Summarization Test", 2462 description="Evaluate summarization quality and length", 2463 data=[ 2464 {"input": "Long article text...", "expected_output": "Expected summary"}, 2465 {"input": "Another article...", "expected_output": "Another summary"} 2466 ], 2467 task=summarize_text, 2468 evaluators=[length_evaluator] 2469 ) 2470 2471 print(f"Processed {len(result.item_results)} items") 2472 for item_result in result.item_results: 2473 print(f"Input: {item_result.item['input']}") 2474 print(f"Output: {item_result.output}") 2475 print(f"Evaluations: {item_result.evaluations}") 2476 ``` 2477 2478 Advanced experiment with async task and multiple evaluators: 2479 ```python 2480 async def llm_task(*, item, **kwargs): 2481 # Simulate async LLM call 2482 response = await openai_client.chat.completions.create( 2483 model="gpt-4", 2484 messages=[{"role": "user", "content": item["input"]}] 2485 ) 2486 return response.choices[0].message.content 2487 2488 def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): 2489 if expected_output and expected_output.lower() in output.lower(): 2490 return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} 2491 return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} 2492 2493 def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): 2494 # Simulate toxicity check 2495 toxicity_score = check_toxicity(output) # Your toxicity checker 2496 return { 2497 "name": "toxicity", 2498 "value": toxicity_score, 2499 "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" 2500 } 2501 2502 def average_accuracy(*, item_results, **kwargs): 2503 accuracies = [ 2504 eval.value for result in item_results 2505 for eval in result.evaluations 2506 if eval.name == "accuracy" 2507 ] 2508 return { 2509 "name": "average_accuracy", 2510 "value": sum(accuracies) / len(accuracies) if accuracies else 0, 2511 "comment": f"Average accuracy across {len(accuracies)} items" 2512 } 2513 2514 result = langfuse.run_experiment( 2515 name="LLM Safety and Accuracy Test", 2516 description="Evaluate model accuracy and safety across diverse prompts", 2517 data=test_dataset, # Your dataset items 2518 task=llm_task, 2519 evaluators=[accuracy_evaluator, toxicity_evaluator], 2520 run_evaluators=[average_accuracy], 2521 max_concurrency=5, # Limit concurrent API calls 2522 metadata={"model": "gpt-4", "temperature": 0.7} 2523 ) 2524 ``` 2525 2526 Using with Langfuse datasets: 2527 ```python 2528 # Get dataset from Langfuse 2529 dataset = langfuse.get_dataset("my-eval-dataset") 2530 2531 result = dataset.run_experiment( 2532 name="Production Model Evaluation", 2533 description="Monthly evaluation of production model performance", 2534 task=my_production_task, 2535 evaluators=[accuracy_evaluator, latency_evaluator] 2536 ) 2537 2538 # Results automatically linked to dataset in Langfuse UI 2539 print(f"View results: {result.dataset_run_url}") 2540 ``` 2541 2542 Note: 2543 - Task and evaluator functions can be either synchronous or asynchronous 2544 - Individual item failures are logged but don't stop the experiment 2545 - All executions are automatically traced and visible in Langfuse UI 2546 - When using Langfuse datasets, results are automatically linked for easy 
comparison 2547 - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) 2548 - Async execution is handled automatically with smart event loop detection 2549 """ 2550 return cast( 2551 ExperimentResult, 2552 run_async_safely( 2553 self._run_experiment_async( 2554 name=name, 2555 run_name=self._create_experiment_run_name( 2556 name=name, run_name=run_name 2557 ), 2558 description=description, 2559 data=data, 2560 task=task, 2561 evaluators=evaluators or [], 2562 composite_evaluator=composite_evaluator, 2563 run_evaluators=run_evaluators or [], 2564 max_concurrency=max_concurrency, 2565 metadata=metadata, 2566 dataset_version=_dataset_version, 2567 ), 2568 ), 2569 ) 2570 2571 async def _run_experiment_async( 2572 self, 2573 *, 2574 name: str, 2575 run_name: str, 2576 description: Optional[str], 2577 data: ExperimentData, 2578 task: TaskFunction, 2579 evaluators: List[EvaluatorFunction], 2580 composite_evaluator: Optional[CompositeEvaluatorFunction], 2581 run_evaluators: List[RunEvaluatorFunction], 2582 max_concurrency: int, 2583 metadata: Optional[Dict[str, Any]] = None, 2584 dataset_version: Optional[datetime] = None, 2585 ) -> ExperimentResult: 2586 langfuse_logger.debug( 2587 f"Starting experiment '{name}' run '{run_name}' with {len(data)} items" 2588 ) 2589 2590 shared_fallback_experiment_id = self._create_observation_id() 2591 2592 # Set up concurrency control 2593 semaphore = asyncio.Semaphore(max_concurrency) 2594 2595 # Process all items 2596 async def process_item(item: ExperimentItem) -> ExperimentItemResult: 2597 async with semaphore: 2598 return await self._process_experiment_item( 2599 item, 2600 task, 2601 evaluators, 2602 composite_evaluator, 2603 shared_fallback_experiment_id, 2604 name, 2605 run_name, 2606 description, 2607 metadata, 2608 dataset_version, 2609 ) 2610 2611 # Run all items concurrently 2612 tasks = [process_item(item) for item in data] 2613 item_results = await asyncio.gather(*tasks, return_exceptions=True) 2614 2615 # Filter out any exceptions and log errors 2616 valid_results: List[ExperimentItemResult] = [] 2617 for i, result in enumerate(item_results): 2618 if isinstance(result, Exception): 2619 langfuse_logger.error(f"Item {i} failed: {result}") 2620 elif isinstance(result, ExperimentItemResult): 2621 valid_results.append(result) # type: ignore 2622 2623 # Run experiment-level evaluators 2624 run_evaluations: List[Evaluation] = [] 2625 for run_evaluator in run_evaluators: 2626 try: 2627 evaluations = await _run_evaluator( 2628 run_evaluator, item_results=valid_results 2629 ) 2630 run_evaluations.extend(evaluations) 2631 except Exception as e: 2632 langfuse_logger.error(f"Run evaluator failed: {e}") 2633 2634 # Generate dataset run URL if applicable 2635 dataset_run_id = next( 2636 ( 2637 result.dataset_run_id 2638 for result in valid_results 2639 if result.dataset_run_id 2640 ), 2641 None, 2642 ) 2643 dataset_run_url = None 2644 if dataset_run_id and data: 2645 try: 2646 # Check if the first item has dataset_id (for DatasetItem objects) 2647 first_item = data[0] 2648 dataset_id = None 2649 2650 if hasattr(first_item, "dataset_id"): 2651 dataset_id = getattr(first_item, "dataset_id", None) 2652 2653 if dataset_id: 2654 project_id = self._get_project_id() 2655 2656 if project_id: 2657 dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}" 2658 2659 except Exception: 2660 pass # URL generation is optional 2661 2662 # Store run-level evaluations as scores 2663 for evaluation in 
run_evaluations: 2664 try: 2665 if dataset_run_id: 2666 self.create_score( 2667 dataset_run_id=dataset_run_id, 2668 name=evaluation.name or "<unknown>", 2669 value=evaluation.value, # type: ignore 2670 comment=evaluation.comment, 2671 metadata=evaluation.metadata, 2672 data_type=evaluation.data_type, # type: ignore 2673 config_id=evaluation.config_id, 2674 ) 2675 2676 except Exception as e: 2677 langfuse_logger.error(f"Failed to store run evaluation: {e}") 2678 2679 # Flush scores and traces 2680 self.flush() 2681 2682 return ExperimentResult( 2683 name=name, 2684 run_name=run_name, 2685 description=description, 2686 item_results=valid_results, 2687 run_evaluations=run_evaluations, 2688 experiment_id=dataset_run_id or shared_fallback_experiment_id, 2689 dataset_run_id=dataset_run_id, 2690 dataset_run_url=dataset_run_url, 2691 ) 2692 2693 async def _process_experiment_item( 2694 self, 2695 item: ExperimentItem, 2696 task: Callable, 2697 evaluators: List[Callable], 2698 composite_evaluator: Optional[CompositeEvaluatorFunction], 2699 fallback_experiment_id: str, 2700 experiment_name: str, 2701 experiment_run_name: str, 2702 experiment_description: Optional[str], 2703 experiment_metadata: Optional[Dict[str, Any]] = None, 2704 dataset_version: Optional[datetime] = None, 2705 ) -> ExperimentItemResult: 2706 span_name = "experiment-item-run" 2707 2708 with self.start_as_current_observation(name=span_name) as span: 2709 try: 2710 input_data = ( 2711 item.get("input") 2712 if isinstance(item, dict) 2713 else getattr(item, "input", None) 2714 ) 2715 2716 if input_data is None: 2717 raise ValueError("Experiment Item is missing input. Skipping item.") 2718 2719 expected_output = ( 2720 item.get("expected_output") 2721 if isinstance(item, dict) 2722 else getattr(item, "expected_output", None) 2723 ) 2724 2725 item_metadata = ( 2726 item.get("metadata") 2727 if isinstance(item, dict) 2728 else getattr(item, "metadata", None) 2729 ) 2730 2731 final_observation_metadata = { 2732 "experiment_name": experiment_name, 2733 "experiment_run_name": experiment_run_name, 2734 **(experiment_metadata or {}), 2735 } 2736 2737 trace_id = span.trace_id 2738 dataset_id = None 2739 dataset_item_id = None 2740 dataset_run_id = None 2741 2742 # Link to dataset run if this is a dataset item 2743 if hasattr(item, "id") and hasattr(item, "dataset_id"): 2744 try: 2745 # Use sync API to avoid event loop issues when run_async_safely 2746 # creates multiple event loops across different threads 2747 dataset_run_item = await asyncio.to_thread( 2748 self.api.dataset_run_items.create, 2749 run_name=experiment_run_name, 2750 run_description=experiment_description, 2751 metadata=experiment_metadata, 2752 dataset_item_id=item.id, # type: ignore 2753 trace_id=trace_id, 2754 observation_id=span.id, 2755 dataset_version=dataset_version, 2756 ) 2757 2758 dataset_run_id = dataset_run_item.dataset_run_id 2759 2760 except Exception as e: 2761 langfuse_logger.error(f"Failed to create dataset run item: {e}") 2762 2763 if ( 2764 not isinstance(item, dict) 2765 and hasattr(item, "dataset_id") 2766 and hasattr(item, "id") 2767 ): 2768 dataset_id = item.dataset_id 2769 dataset_item_id = item.id 2770 2771 final_observation_metadata.update( 2772 {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id} 2773 ) 2774 2775 if isinstance(item_metadata, dict): 2776 final_observation_metadata.update(item_metadata) 2777 2778 experiment_id = dataset_run_id or fallback_experiment_id 2779 experiment_item_id = ( 2780 dataset_item_id or 
get_sha256_hash_hex(_serialize(input_data))[:16] 2781 ) 2782 span._otel_span.set_attributes( 2783 { 2784 k: v 2785 for k, v in { 2786 LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT, 2787 LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description, 2788 LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize( 2789 expected_output 2790 ), 2791 }.items() 2792 if v is not None 2793 } 2794 ) 2795 2796 propagated_experiment_attributes = PropagatedExperimentAttributes( 2797 experiment_id=experiment_id, 2798 experiment_name=experiment_run_name, 2799 experiment_metadata=_flatten_and_serialize_metadata_values( 2800 experiment_metadata 2801 ), 2802 experiment_dataset_id=dataset_id, 2803 experiment_item_id=experiment_item_id, 2804 experiment_item_metadata=_flatten_and_serialize_metadata_values( 2805 item_metadata if isinstance(item_metadata, dict) else None 2806 ), 2807 experiment_item_root_observation_id=span.id, 2808 ) 2809 2810 with _propagate_attributes(experiment=propagated_experiment_attributes): 2811 output = await _run_task(task, item) 2812 2813 span.update( 2814 input=input_data, 2815 output=output, 2816 metadata=final_observation_metadata, 2817 ) 2818 2819 except Exception as e: 2820 span.update( 2821 output=f"Error: {str(e)}", level="ERROR", status_message=str(e) 2822 ) 2823 raise e 2824 2825 # Run evaluators 2826 evaluations = [] 2827 2828 for evaluator in evaluators: 2829 try: 2830 eval_metadata: Optional[Dict[str, Any]] = None 2831 2832 if isinstance(item, dict): 2833 eval_metadata = item.get("metadata") 2834 elif hasattr(item, "metadata"): 2835 eval_metadata = item.metadata 2836 2837 with _propagate_attributes( 2838 experiment=propagated_experiment_attributes 2839 ): 2840 eval_results = await _run_evaluator( 2841 evaluator, 2842 input=input_data, 2843 output=output, 2844 expected_output=expected_output, 2845 metadata=eval_metadata, 2846 ) 2847 evaluations.extend(eval_results) 2848 2849 # Store evaluations as scores 2850 for evaluation in eval_results: 2851 self.create_score( 2852 trace_id=trace_id, 2853 observation_id=span.id, 2854 name=evaluation.name, 2855 value=evaluation.value, # type: ignore 2856 comment=evaluation.comment, 2857 metadata=evaluation.metadata, 2858 config_id=evaluation.config_id, 2859 data_type=evaluation.data_type, # type: ignore 2860 ) 2861 2862 except Exception as e: 2863 langfuse_logger.error(f"Evaluator failed: {e}") 2864 2865 # Run composite evaluator if provided and we have evaluations 2866 if composite_evaluator and evaluations: 2867 try: 2868 composite_eval_metadata: Optional[Dict[str, Any]] = None 2869 if isinstance(item, dict): 2870 composite_eval_metadata = item.get("metadata") 2871 elif hasattr(item, "metadata"): 2872 composite_eval_metadata = item.metadata 2873 2874 with _propagate_attributes( 2875 experiment=propagated_experiment_attributes 2876 ): 2877 result = composite_evaluator( 2878 input=input_data, 2879 output=output, 2880 expected_output=expected_output, 2881 metadata=composite_eval_metadata, 2882 evaluations=evaluations, 2883 ) 2884 2885 # Handle async composite evaluators 2886 if asyncio.iscoroutine(result): 2887 result = await result 2888 2889 # Normalize to list 2890 composite_evals: List[Evaluation] = [] 2891 if isinstance(result, (dict, Evaluation)): 2892 composite_evals = [result] # type: ignore 2893 elif isinstance(result, list): 2894 composite_evals = result # type: ignore 2895 2896 # Store composite evaluations as scores and add to evaluations list 2897 for composite_evaluation 
in composite_evals: 2898 self.create_score( 2899 trace_id=trace_id, 2900 observation_id=span.id, 2901 name=composite_evaluation.name, 2902 value=composite_evaluation.value, # type: ignore 2903 comment=composite_evaluation.comment, 2904 metadata=composite_evaluation.metadata, 2905 config_id=composite_evaluation.config_id, 2906 data_type=composite_evaluation.data_type, # type: ignore 2907 ) 2908 evaluations.append(composite_evaluation) 2909 2910 except Exception as e: 2911 langfuse_logger.error(f"Composite evaluator failed: {e}") 2912 2913 return ExperimentItemResult( 2914 item=item, 2915 output=output, 2916 evaluations=evaluations, 2917 trace_id=trace_id, 2918 dataset_run_id=dataset_run_id, 2919 ) 2920 2921 def _create_experiment_run_name( 2922 self, *, name: Optional[str] = None, run_name: Optional[str] = None 2923 ) -> str: 2924 if run_name: 2925 return run_name 2926 2927 iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z") 2928 2929 return f"{name} - {iso_timestamp}" 2930 2931 def run_batched_evaluation( 2932 self, 2933 *, 2934 scope: Literal["traces", "observations"], 2935 mapper: MapperFunction, 2936 filter: Optional[str] = None, 2937 fetch_batch_size: int = 50, 2938 fetch_trace_fields: Optional[str] = None, 2939 max_items: Optional[int] = None, 2940 max_retries: int = 3, 2941 evaluators: List[EvaluatorFunction], 2942 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 2943 max_concurrency: int = 5, 2944 metadata: Optional[Dict[str, Any]] = None, 2945 _add_observation_scores_to_trace: bool = False, 2946 _additional_trace_tags: Optional[List[str]] = None, 2947 resume_from: Optional[BatchEvaluationResumeToken] = None, 2948 verbose: bool = False, 2949 ) -> BatchEvaluationResult: 2950 """Fetch traces or observations and run evaluations on each item. 2951 2952 This method provides a powerful way to evaluate existing data in Langfuse at scale. 2953 It fetches items based on filters, transforms them using a mapper function, runs 2954 evaluators on each item, and creates scores that are linked back to the original 2955 entities. This is ideal for: 2956 2957 - Running evaluations on production traces after deployment 2958 - Backtesting new evaluation metrics on historical data 2959 - Batch scoring of observations for quality monitoring 2960 - Periodic evaluation runs on recent data 2961 2962 The method uses a streaming/pipeline approach to process items in batches, making 2963 it memory-efficient for large datasets. It includes comprehensive error handling, 2964 retry logic, and resume capability for long-running evaluations. 2965 2966 Args: 2967 scope: The type of items to evaluate. Must be one of: 2968 - "traces": Evaluate complete traces with all their observations 2969 - "observations": Evaluate individual observations (spans, generations, events) 2970 mapper: Function that transforms API response objects into evaluator inputs. 2971 Receives a trace/observation object and returns an EvaluatorInputs 2972 instance with input, output, expected_output, and metadata fields. 2973 Can be sync or async. 2974 evaluators: List of evaluation functions to run on each item. Each evaluator 2975 receives the mapped inputs and returns Evaluation object(s). Evaluator 2976 failures are logged but don't stop the batch evaluation. 2977 filter: Optional JSON filter string for querying items (same format as Langfuse API). 
Examples: 2978 - '{"tags": ["production"]}' 2979 - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' 2980 Default: None (fetches all items). 2981 fetch_batch_size: Number of items to fetch per API call and hold in memory. 2982 Larger values may be faster but use more memory. Default: 50. 2983 fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'. 2984 max_items: Maximum total number of items to process. If None, processes all 2985 items matching the filter. Useful for testing or limiting evaluation runs. 2986 Default: None (process all). 2987 max_concurrency: Maximum number of items to evaluate concurrently. Controls 2988 parallelism and resource usage. Default: 5. 2989 composite_evaluator: Optional function that creates a composite score from 2990 item-level evaluations. Receives the original item and its evaluations, 2991 returns a single Evaluation. Useful for weighted averages or combined metrics. 2992 Default: None. 2993 metadata: Optional metadata dict to add to all created scores. Useful for 2994 tracking evaluation runs, versions, or other context. Default: None. 2995 max_retries: Maximum number of retry attempts for failed batch fetches. 2996 Uses exponential backoff (1s, 2s, 4s). Default: 3. 2997 verbose: If True, logs progress information to console. Useful for monitoring 2998 long-running evaluations. Default: False. 2999 resume_from: Optional resume token from a previous incomplete run. Allows 3000 continuing evaluation after interruption or failure. Default: None. 3001 3002 3003 Returns: 3004 BatchEvaluationResult containing: 3005 - total_items_fetched: Number of items fetched from API 3006 - total_items_processed: Number of items successfully evaluated 3007 - total_items_failed: Number of items that failed evaluation 3008 - total_scores_created: Scores created by item-level evaluators 3009 - total_composite_scores_created: Scores created by composite evaluator 3010 - total_evaluations_failed: Individual evaluator failures 3011 - evaluator_stats: Per-evaluator statistics (success rate, scores created) 3012 - resume_token: Token for resuming if incomplete (None if completed) 3013 - completed: True if all items processed 3014 - duration_seconds: Total execution time 3015 - failed_item_ids: IDs of items that failed 3016 - error_summary: Error types and counts 3017 - has_more_items: True if max_items reached but more exist 3018 3019 Raises: 3020 ValueError: If invalid scope is provided. 
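For scope="observations", the mapper receives observation objects instead of traces. A minimal sketch of such a mapper (illustrative; it matches the `obs_mapper` referenced in the resume example below):

```python
def obs_mapper(observation):
    return EvaluatorInputs(
        input=observation.input,
        output=observation.output,
        expected_output=None,
        metadata={"observation_id": observation.id},
    )
```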
3021 3022 Examples: 3023 Basic trace evaluation: 3024 ```python 3025 from langfuse import Langfuse, EvaluatorInputs, Evaluation 3026 3027 client = Langfuse() 3028 3029 # Define mapper to extract fields from traces 3030 def trace_mapper(trace): 3031 return EvaluatorInputs( 3032 input=trace.input, 3033 output=trace.output, 3034 expected_output=None, 3035 metadata={"trace_id": trace.id} 3036 ) 3037 3038 # Define evaluator 3039 def length_evaluator(*, input, output, expected_output, metadata): 3040 return Evaluation( 3041 name="output_length", 3042 value=len(output) if output else 0 3043 ) 3044 3045 # Run batch evaluation 3046 result = client.run_batched_evaluation( 3047 scope="traces", 3048 mapper=trace_mapper, 3049 evaluators=[length_evaluator], 3050 filter='{"tags": ["production"]}', 3051 max_items=1000, 3052 verbose=True 3053 ) 3054 3055 print(f"Processed {result.total_items_processed} traces") 3056 print(f"Created {result.total_scores_created} scores") 3057 ``` 3058 3059 Evaluation with composite scorer: 3060 ```python 3061 def accuracy_evaluator(*, input, output, expected_output, metadata): 3062 # ... evaluation logic 3063 return Evaluation(name="accuracy", value=0.85) 3064 3065 def relevance_evaluator(*, input, output, expected_output, metadata): 3066 # ... evaluation logic 3067 return Evaluation(name="relevance", value=0.92) 3068 3069 def composite_evaluator(*, item, evaluations): 3070 # Weighted average of evaluations 3071 weights = {"accuracy": 0.6, "relevance": 0.4} 3072 total = sum( 3073 e.value * weights.get(e.name, 0) 3074 for e in evaluations 3075 if isinstance(e.value, (int, float)) 3076 ) 3077 return Evaluation( 3078 name="composite_score", 3079 value=total, 3080 comment=f"Weighted average of {len(evaluations)} metrics" 3081 ) 3082 3083 result = client.run_batched_evaluation( 3084 scope="traces", 3085 mapper=trace_mapper, 3086 evaluators=[accuracy_evaluator, relevance_evaluator], 3087 composite_evaluator=composite_evaluator, 3088 filter='{"user_id": "important_user"}', 3089 verbose=True 3090 ) 3091 ``` 3092 3093 Handling incomplete runs with resume: 3094 ```python 3095 # Initial run that may fail or timeout 3096 result = client.run_batched_evaluation( 3097 scope="observations", 3098 mapper=obs_mapper, 3099 evaluators=[my_evaluator], 3100 max_items=10000, 3101 verbose=True 3102 ) 3103 3104 # Check if incomplete 3105 if not result.completed and result.resume_token: 3106 print(f"Processed {result.resume_token.items_processed} items before interruption") 3107 3108 # Resume from where it left off 3109 result = client.run_batched_evaluation( 3110 scope="observations", 3111 mapper=obs_mapper, 3112 evaluators=[my_evaluator], 3113 resume_from=result.resume_token, 3114 verbose=True 3115 ) 3116 3117 print(f"Total items processed: {result.total_items_processed}") 3118 ``` 3119 3120 Monitoring evaluator performance: 3121 ```python 3122 result = client.run_batched_evaluation(...) 
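# (sketch) each entry in result.evaluator_stats summarizes one evaluator's runs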
3123 3124 for stats in result.evaluator_stats: 3125 success_rate = stats.successful_runs / stats.total_runs 3126 print(f"{stats.name}:") 3127 print(f" Success rate: {success_rate:.1%}") 3128 print(f" Scores created: {stats.total_scores_created}") 3129 3130 if stats.failed_runs > 0: 3131 print(f" ⚠️ Failed {stats.failed_runs} times") 3132 ``` 3133 3134 Note: 3135 - Evaluator failures are logged but don't stop the batch evaluation 3136 - Individual item failures are tracked but don't stop processing 3137 - Fetch failures are retried with exponential backoff 3138 - All scores are automatically flushed to Langfuse at the end 3139 - The resume mechanism uses timestamp-based filtering to avoid duplicates 3140 """ 3141 runner = BatchEvaluationRunner(self) 3142 3143 return cast( 3144 BatchEvaluationResult, 3145 run_async_safely( 3146 runner.run_async( 3147 scope=scope, 3148 mapper=mapper, 3149 evaluators=evaluators, 3150 filter=filter, 3151 fetch_batch_size=fetch_batch_size, 3152 fetch_trace_fields=fetch_trace_fields, 3153 max_items=max_items, 3154 max_concurrency=max_concurrency, 3155 composite_evaluator=composite_evaluator, 3156 metadata=metadata, 3157 _add_observation_scores_to_trace=_add_observation_scores_to_trace, 3158 _additional_trace_tags=_additional_trace_tags, 3159 max_retries=max_retries, 3160 verbose=verbose, 3161 resume_from=resume_from, 3162 ) 3163 ), 3164 ) 3165 3166 def auth_check(self) -> bool: 3167 """Check if the provided credentials (public and secret key) are valid. 3168 3169 Raises: 3170 Exception: If no projects were found for the provided credentials. 3171 3172 Note: 3173 This method is blocking; avoid calling it in production code. 3174 """ 3175 try: 3176 projects = self.api.projects.get() 3177 langfuse_logger.debug( 3178 f"Auth check successful, found {len(projects.data)} projects" 3179 ) 3180 if len(projects.data) == 0: 3181 raise Exception( 3182 "Auth check failed, no project found for the keys provided." 3183 ) 3184 return True 3185 3186 except AttributeError as e: 3187 langfuse_logger.warning( 3188 f"Auth check failed: Client not properly initialized. Error: {e}" 3189 ) 3190 return False 3191 3192 except Error as e: 3193 handle_fern_exception(e) 3194 raise e 3195 3196 def create_dataset( 3197 self, 3198 *, 3199 name: str, 3200 description: Optional[str] = None, 3201 metadata: Optional[Any] = None, 3202 input_schema: Optional[Any] = None, 3203 expected_output_schema: Optional[Any] = None, 3204 ) -> Dataset: 3205 """Create a dataset with the given name on Langfuse. 3206 3207 Args: 3208 name: Name of the dataset to create. 3209 description: Description of the dataset. Defaults to None. 3210 metadata: Additional metadata. Defaults to None. 3211 input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema. 3212 expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema. 3213 3214 Returns: 3215 Dataset: The created dataset as returned by the Langfuse API. 
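Example:
    A minimal sketch (illustrative; the schema arguments accept standard JSON Schema documents):

    ```python
    dataset = langfuse.create_dataset(
        name="capital_cities",
        description="Country -> capital QA pairs",
        expected_output_schema={"type": "string"},
    )
    ```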
3216 """ 3217 try: 3218 langfuse_logger.debug(f"Creating datasets {name}") 3219 3220 result = self.api.datasets.create( 3221 name=name, 3222 description=description, 3223 metadata=metadata, 3224 input_schema=input_schema, 3225 expected_output_schema=expected_output_schema, 3226 ) 3227 3228 return cast(Dataset, result) 3229 3230 except Error as e: 3231 handle_fern_exception(e) 3232 raise e 3233 3234 def create_dataset_item( 3235 self, 3236 *, 3237 dataset_name: str, 3238 input: Optional[Any] = None, 3239 expected_output: Optional[Any] = None, 3240 metadata: Optional[Any] = None, 3241 source_trace_id: Optional[str] = None, 3242 source_observation_id: Optional[str] = None, 3243 status: Optional[DatasetStatus] = None, 3244 id: Optional[str] = None, 3245 ) -> DatasetItem: 3246 """Create a dataset item. 3247 3248 Upserts if an item with id already exists. 3249 3250 Args: 3251 dataset_name: Name of the dataset in which the dataset item should be created. 3252 input: Input data. Defaults to None. Can contain any dict, list or scalar. 3253 expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar. 3254 metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar. 3255 source_trace_id: Id of the source trace. Defaults to None. 3256 source_observation_id: Id of the source observation. Defaults to None. 3257 status: Status of the dataset item. Defaults to ACTIVE for newly created items. 3258 id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets. 3259 3260 Returns: 3261 DatasetItem: The created dataset item as returned by the Langfuse API. 3262 3263 Example: 3264 ```python 3265 from langfuse import Langfuse 3266 3267 langfuse = Langfuse() 3268 3269 # Uploading items to the Langfuse dataset named "capital_cities" 3270 langfuse.create_dataset_item( 3271 dataset_name="capital_cities", 3272 input={"input": {"country": "Italy"}}, 3273 expected_output={"expected_output": "Rome"}, 3274 metadata={"foo": "bar"} 3275 ) 3276 ``` 3277 """ 3278 try: 3279 langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}") 3280 3281 result = self.api.dataset_items.create( 3282 dataset_name=dataset_name, 3283 input=input, 3284 expected_output=expected_output, 3285 metadata=metadata, 3286 source_trace_id=source_trace_id, 3287 source_observation_id=source_observation_id, 3288 status=status, 3289 id=id, 3290 ) 3291 3292 return cast(DatasetItem, result) 3293 except Error as e: 3294 handle_fern_exception(e) 3295 raise e 3296 3297 def resolve_media_references( 3298 self, 3299 *, 3300 obj: Any, 3301 resolve_with: Literal["base64_data_uri"], 3302 max_depth: int = 10, 3303 content_fetch_timeout_seconds: int = 5, 3304 ) -> Any: 3305 """Replace media reference strings in an object with base64 data URIs. 3306 3307 This method recursively traverses an object (up to max_depth) looking for media reference strings 3308 in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using 3309 the provided Langfuse client and replaces the reference string with a base64 data URI. 3310 3311 If fetching media content fails for a reference string, a warning is logged and the reference 3312 string is left unchanged. 3313 3314 Args: 3315 obj: The object to process. Can be a primitive value, array, or nested object. 3316 If the object has a __dict__ attribute, a dict will be returned instead of the original object type. 
3317 resolve_with: The representation of the media content to replace the media reference string with. 3318 Currently only "base64_data_uri" is supported. 3319 max_depth: int: The maximum depth to traverse the object. Default is 10. 3320 content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5. 3321 3322 Returns: 3323 A deep copy of the input object with all media references replaced with base64 data URIs where possible. 3324 If the input object has a __dict__ attribute, a dict will be returned instead of the original object type. 3325 3326 Example: 3327 obj = { 3328 "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", 3329 "nested": { 3330 "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" 3331 } 3332 } 3333 3334 result = langfuse.resolve_media_references(obj=obj, resolve_with="base64_data_uri") 3335 3336 # Result: 3337 # { 3338 # "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...", 3339 # "nested": { 3340 # "pdf": "data:application/pdf;base64,JVBERi0xLjcK..." 3341 # } 3342 # } 3343 """ 3344 return LangfuseMedia.resolve_media_references( 3345 langfuse_client=self, 3346 obj=obj, 3347 resolve_with=resolve_with, 3348 max_depth=max_depth, 3349 content_fetch_timeout_seconds=content_fetch_timeout_seconds, 3350 ) 3351 3352 @overload 3353 def get_prompt( 3354 self, 3355 name: str, 3356 *, 3357 version: Optional[int] = None, 3358 label: Optional[str] = None, 3359 type: Literal["chat"], 3360 cache_ttl_seconds: Optional[int] = None, 3361 fallback: Optional[List[ChatMessageDict]] = None, 3362 max_retries: Optional[int] = None, 3363 fetch_timeout_seconds: Optional[int] = None, 3364 ) -> ChatPromptClient: ... 3365 3366 @overload 3367 def get_prompt( 3368 self, 3369 name: str, 3370 *, 3371 version: Optional[int] = None, 3372 label: Optional[str] = None, 3373 type: Literal["text"] = "text", 3374 cache_ttl_seconds: Optional[int] = None, 3375 fallback: Optional[str] = None, 3376 max_retries: Optional[int] = None, 3377 fetch_timeout_seconds: Optional[int] = None, 3378 ) -> TextPromptClient: ... 3379 3380 def get_prompt( 3381 self, 3382 name: str, 3383 *, 3384 version: Optional[int] = None, 3385 label: Optional[str] = None, 3386 type: Literal["chat", "text"] = "text", 3387 cache_ttl_seconds: Optional[int] = None, 3388 fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None, 3389 max_retries: Optional[int] = None, 3390 fetch_timeout_seconds: Optional[int] = None, 3391 ) -> PromptClient: 3392 """Get a prompt. 3393 3394 This method attempts to fetch the requested prompt from the local cache. If the prompt is not found 3395 in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again 3396 and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will 3397 return the expired prompt as a fallback. 3398 3399 Args: 3400 name (str): The name of the prompt to retrieve. 3401 3402 Keyword Args: 3403 version (Optional[int]): The version of the prompt to retrieve. If neither label nor version is specified, the `production` label is returned. Specify either version or label, not both. 3404 label: Optional[str]: The label of the prompt to retrieve. If neither label nor version is specified, the `production` label is returned. Specify either version or label, not both. 3405 cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a 3406 keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0. 
3407                type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3408                fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3409                max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3410                fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds by default.
3411 
3412            Returns:
3413                The prompt object retrieved from the cache or directly fetched if not cached or expired, of type
3414                - TextPromptClient, if type argument is 'text'.
3415                - ChatPromptClient, if type argument is 'chat'.
3416 
3417            Raises:
3418                Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3419                    expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3420            """
3421            if self._resources is None:
3422                raise Error(
3423                    "SDK is not correctly initialized. Check the init logs for more details."
3424                )
3425            if version is not None and label is not None:
3426                raise ValueError("Cannot specify both version and label at the same time.")
3427 
3428            if not name:
3429                raise ValueError("Prompt name cannot be empty.")
3430 
3431            cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3432            bounded_max_retries = self._get_bounded_max_retries(
3433                max_retries, default_max_retries=2, max_retries_upper_bound=4
3434            )
3435 
3436            langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3437            cached_prompt = self._resources.prompt_cache.get(cache_key)
3438 
3439            if cached_prompt is None or cache_ttl_seconds == 0:
3440                langfuse_logger.debug(
3441                    f"Prompt '{cache_key}' not found in cache or caching disabled."
3442 ) 3443 try: 3444 return self._fetch_prompt_and_update_cache( 3445 name, 3446 version=version, 3447 label=label, 3448 ttl_seconds=cache_ttl_seconds, 3449 max_retries=bounded_max_retries, 3450 fetch_timeout_seconds=fetch_timeout_seconds, 3451 ) 3452 except Exception as e: 3453 if fallback: 3454 langfuse_logger.warning( 3455 f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}" 3456 ) 3457 3458 fallback_client_args: Dict[str, Any] = { 3459 "name": name, 3460 "prompt": fallback, 3461 "type": type, 3462 "version": version or 0, 3463 "config": {}, 3464 "labels": [label] if label else [], 3465 "tags": [], 3466 } 3467 3468 if type == "text": 3469 return TextPromptClient( 3470 prompt=Prompt_Text(**fallback_client_args), 3471 is_fallback=True, 3472 ) 3473 3474 if type == "chat": 3475 return ChatPromptClient( 3476 prompt=Prompt_Chat(**fallback_client_args), 3477 is_fallback=True, 3478 ) 3479 3480 raise e 3481 3482 if cached_prompt.is_expired(): 3483 langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.") 3484 try: 3485 # refresh prompt in background thread, refresh_prompt deduplicates tasks 3486 langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.") 3487 3488 def refresh_task() -> None: 3489 self._fetch_prompt_and_update_cache( 3490 name, 3491 version=version, 3492 label=label, 3493 ttl_seconds=cache_ttl_seconds, 3494 max_retries=bounded_max_retries, 3495 fetch_timeout_seconds=fetch_timeout_seconds, 3496 ) 3497 3498 self._resources.prompt_cache.add_refresh_prompt_task_if_current( 3499 cache_key, 3500 cached_prompt, 3501 refresh_task, 3502 ) 3503 langfuse_logger.debug( 3504 f"Returning stale prompt '{cache_key}' from cache." 3505 ) 3506 # return stale prompt 3507 return cached_prompt.value 3508 3509 except Exception as e: 3510 langfuse_logger.warning( 3511 f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}" 3512 ) 3513 # creation of refresh prompt task failed, return stale prompt 3514 return cached_prompt.value 3515 3516 return cached_prompt.value 3517 3518 def _fetch_prompt_and_update_cache( 3519 self, 3520 name: str, 3521 *, 3522 version: Optional[int] = None, 3523 label: Optional[str] = None, 3524 ttl_seconds: Optional[int] = None, 3525 max_retries: int, 3526 fetch_timeout_seconds: Optional[int], 3527 ) -> PromptClient: 3528 cache_key = PromptCache.generate_cache_key(name, version=version, label=label) 3529 langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...") 3530 3531 try: 3532 3533 @backoff.on_exception( 3534 backoff.constant, Exception, max_tries=max_retries + 1, logger=None 3535 ) 3536 def fetch_prompts() -> Any: 3537 return self.api.prompts.get( 3538 self._url_encode(name), 3539 version=version, 3540 label=label, 3541 request_options={ 3542 "timeout_in_seconds": fetch_timeout_seconds, 3543 } 3544 if fetch_timeout_seconds is not None 3545 else None, 3546 ) 3547 3548 prompt_response = fetch_prompts() 3549 3550 prompt: PromptClient 3551 if prompt_response.type == "chat": 3552 prompt = ChatPromptClient(prompt_response) 3553 else: 3554 prompt = TextPromptClient(prompt_response) 3555 3556 if self._resources is not None: 3557 self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds) 3558 3559 return prompt 3560 3561 except NotFoundError as not_found_error: 3562 langfuse_logger.warning( 3563 f"Prompt '{cache_key}' not found during refresh, evicting from cache." 
3564 ) 3565 if self._resources is not None: 3566 self._resources.prompt_cache.delete(cache_key) 3567 raise not_found_error 3568 3569 except Exception as e: 3570 langfuse_logger.error( 3571 f"Error while fetching prompt '{cache_key}': {str(e)}" 3572 ) 3573 raise e 3574 3575 def _get_bounded_max_retries( 3576 self, 3577 max_retries: Optional[int], 3578 *, 3579 default_max_retries: int = 2, 3580 max_retries_upper_bound: int = 4, 3581 ) -> int: 3582 if max_retries is None: 3583 return default_max_retries 3584 3585 bounded_max_retries = min( 3586 max(max_retries, 0), 3587 max_retries_upper_bound, 3588 ) 3589 3590 return bounded_max_retries 3591 3592 @overload 3593 def create_prompt( 3594 self, 3595 *, 3596 name: str, 3597 prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]], 3598 labels: List[str] = [], 3599 tags: Optional[List[str]] = None, 3600 type: Optional[Literal["chat"]], 3601 config: Optional[Any] = None, 3602 commit_message: Optional[str] = None, 3603 ) -> ChatPromptClient: ... 3604 3605 @overload 3606 def create_prompt( 3607 self, 3608 *, 3609 name: str, 3610 prompt: str, 3611 labels: List[str] = [], 3612 tags: Optional[List[str]] = None, 3613 type: Optional[Literal["text"]] = "text", 3614 config: Optional[Any] = None, 3615 commit_message: Optional[str] = None, 3616 ) -> TextPromptClient: ... 3617 3618 def create_prompt( 3619 self, 3620 *, 3621 name: str, 3622 prompt: Union[ 3623 str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]] 3624 ], 3625 labels: List[str] = [], 3626 tags: Optional[List[str]] = None, 3627 type: Optional[Literal["chat", "text"]] = "text", 3628 config: Optional[Any] = None, 3629 commit_message: Optional[str] = None, 3630 ) -> PromptClient: 3631 """Create a new prompt in Langfuse. 3632 3633 Keyword Args: 3634 name : The name of the prompt to be created. 3635 prompt : The content of the prompt to be created. 3636 is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead. 3637 labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label. 3638 tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt. 3639 config: Additional structured data to be saved with the prompt. Defaults to None. 3640 type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text". 3641 commit_message: Optional string describing the change. 3642 3643 Returns: 3644 TextPromptClient: The prompt if type argument is 'text'. 3645 ChatPromptClient: The prompt if type argument is 'chat'. 3646 """ 3647 try: 3648 langfuse_logger.debug(f"Creating prompt {name=}, {labels=}") 3649 3650 if type == "chat": 3651 if not isinstance(prompt, list): 3652 raise ValueError( 3653 "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes." 
3654                    )
3655                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
3656                    CreateChatPromptRequest(
3657                        name=name,
3658                        prompt=cast(Any, prompt),
3659                        labels=labels,
3660                        tags=tags,
3661                        config=config or {},
3662                        commit_message=commit_message,
3663                        type=CreateChatPromptType.CHAT,
3664                    )
3665                )
3666                server_prompt = self.api.prompts.create(request=request)
3667 
3668                if self._resources is not None:
3669                    self._resources.prompt_cache.invalidate(name)
3670 
3671                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3672 
3673            if not isinstance(prompt, str):
3674                raise ValueError("For 'text' type, 'prompt' must be a string.")
3675 
3676            request = CreateTextPromptRequest(
3677                name=name,
3678                prompt=prompt,
3679                labels=labels,
3680                tags=tags,
3681                config=config or {},
3682                commit_message=commit_message,
3683            )
3684 
3685            server_prompt = self.api.prompts.create(request=request)
3686 
3687            if self._resources is not None:
3688                self._resources.prompt_cache.invalidate(name)
3689 
3690            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3691 
3692        except Error as e:
3693            handle_fern_exception(e)
3694            raise e
3695 
3696    def update_prompt(
3697        self,
3698        *,
3699        name: str,
3700        version: int,
3701        new_labels: List[str] = [],
3702    ) -> Any:
3703        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
3704 
3705        Args:
3706            name (str): The name of the prompt to update.
3707            version (int): The version number of the prompt to update.
3708            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3709 
3710        Returns:
3711            Prompt: The updated prompt from the Langfuse API.
3712 
3713        """
3714        updated_prompt = self.api.prompt_version.update(
3715            name=self._url_encode(name),
3716            version=version,
3717            new_labels=new_labels,
3718        )
3719 
3720        if self._resources is not None:
3721            self._resources.prompt_cache.invalidate(name)
3722 
3723        return updated_prompt
3724 
3725    def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str:
3726        # httpx ≥ 0.28 does its own WHATWG-compliant quoting (e.g. encodes bare
3727        # “%”, “?”, “#”, “|”, … in query/path parts). Re-quoting here would
3728        # double-encode, so we skip when the value is about to be sent straight
3729        # to httpx (`is_url_param=True`) and the installed version is ≥ 0.28.
3730        if is_url_param and Version(httpx.__version__) >= Version("0.28.0"):
3731            return url
3732 
3733        # urllib.parse.quote does not escape slashes "/" by default; we add
3734        # safe="" to force escaping of slashes.
3735        # This is necessary for prompts in prompt folders.
3736        return urllib.parse.quote(url, safe="")
3737 
3738    def clear_prompt_cache(self) -> None:
3739        """Clear the entire prompt cache, removing all cached prompts.
3740 
3741        This method is useful when you want to force a complete refresh of all
3742        cached prompts, for example after major updates or when you need to
3743        ensure the latest versions are fetched from the server.
3744        """
3745        if self._resources is not None:
3746            self._resources.prompt_cache.clear()
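The prompt-management methods above combine caching, fallbacks, and label-based versioning. A minimal usage sketch, assuming a configured client; the prompt name and template are illustrative, and `compile` substitutes the double-curly-brace variables:

```python
from langfuse import Langfuse

langfuse = Langfuse()  # credentials read from LANGFUSE_* env vars

# Create a text prompt version and serve it by default via the 'production' label
langfuse.create_prompt(
    name="movie-critic",  # illustrative prompt name
    prompt="Critique the movie {{title}} in two sentences.",
    labels=["production"],
    type="text",
)

# Fetch the production-labeled prompt; cache it for 5 minutes and fall back to
# an inline template if the very first fetch fails (no cached copy exists yet)
prompt = langfuse.get_prompt(
    "movie-critic",
    cache_ttl_seconds=300,
    fallback="Critique the movie {{title}} in two sentences.",
)

compiled = prompt.compile(title="Dune")  # fill in {{title}}
```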
Main client for Langfuse tracing and platform features.
This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.
The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.
Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.
Attributes:
- api: Synchronous API client for Langfuse backend communication
- async_api: Asynchronous API client for Langfuse backend communication
- _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
- public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
- secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
- base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
- host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
- timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
- httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
- debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
- tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
- flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
- flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
- environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
- release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
- media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
- sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
- mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
- blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use should_export_span instead. Equivalent behavior: `from langfuse.span_filter import is_default_export_span; blocked = {"sqlite", "requests"}; should_export_span = lambda span: is_default_export_span(span) and (span.instrumentation_scope is None or span.instrumentation_scope.name not in blocked)`
- should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with gen_ai.* attributes, and known LLM instrumentation scopes).
- additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If span_exporter is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
- tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. Useful for keeping tracing between Langfuse and other OpenTelemetry-span-emitting libraries disconnected. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
- span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire base_url, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include x-langfuse-ingestion-version=4 on the exporter to enable real-time processing of exported spans.
Example:
```python
from langfuse import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    base_url="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_observation(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
```
232 def __init__( 233 self, 234 *, 235 public_key: Optional[str] = None, 236 secret_key: Optional[str] = None, 237 base_url: Optional[str] = None, 238 host: Optional[str] = None, 239 timeout: Optional[int] = None, 240 httpx_client: Optional[httpx.Client] = None, 241 debug: bool = False, 242 tracing_enabled: Optional[bool] = True, 243 flush_at: Optional[int] = None, 244 flush_interval: Optional[float] = None, 245 environment: Optional[str] = None, 246 release: Optional[str] = None, 247 media_upload_thread_count: Optional[int] = None, 248 sample_rate: Optional[float] = None, 249 mask: Optional[MaskFunction] = None, 250 blocked_instrumentation_scopes: Optional[List[str]] = None, 251 should_export_span: Optional[Callable[[ReadableSpan], bool]] = None, 252 additional_headers: Optional[Dict[str, str]] = None, 253 tracer_provider: Optional[TracerProvider] = None, 254 span_exporter: Optional[SpanExporter] = None, 255 ): 256 self._base_url = ( 257 base_url 258 or os.environ.get(LANGFUSE_BASE_URL) 259 or host 260 or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com") 261 ) 262 self._environment = environment or cast( 263 str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT) 264 ) 265 self._release = ( 266 release 267 or os.environ.get(LANGFUSE_RELEASE, None) 268 or get_common_release_envs() 269 ) 270 self._project_id: Optional[str] = None 271 sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0)) 272 if not 0.0 <= sample_rate <= 1.0: 273 raise ValueError( 274 f"Sample rate must be between 0.0 and 1.0, got {sample_rate}" 275 ) 276 277 timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5)) 278 279 self._tracing_enabled = ( 280 tracing_enabled 281 and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false" 282 ) 283 if not self._tracing_enabled: 284 langfuse_logger.info( 285 "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API." 286 ) 287 288 debug = ( 289 debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true") 290 ) 291 if debug: 292 logging.basicConfig( 293 format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 294 ) 295 langfuse_logger.setLevel(logging.DEBUG) 296 297 public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY) 298 if public_key is None: 299 langfuse_logger.warning( 300 "Authentication error: Langfuse client initialized without public_key. Client will be disabled. " 301 "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. " 302 ) 303 self._otel_tracer = otel_trace_api.NoOpTracer() 304 return 305 306 secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY) 307 if secret_key is None: 308 langfuse_logger.warning( 309 "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. " 310 "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. " 311 ) 312 self._otel_tracer = otel_trace_api.NoOpTracer() 313 return 314 315 if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true": 316 langfuse_logger.warning( 317 "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI." 318 ) 319 320 if blocked_instrumentation_scopes is not None: 321 warnings.warn( 322 "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. " 323 "Use `should_export_span` instead. 
Example: " 324 "from langfuse.span_filter import is_default_export_span; " 325 'blocked={"scope"}; should_export_span=lambda span: ' 326 "is_default_export_span(span) and (span.instrumentation_scope is None or " 327 "span.instrumentation_scope.name not in blocked).", 328 DeprecationWarning, 329 stacklevel=2, 330 ) 331 332 # Initialize api and tracer if requirements are met 333 self._resources = LangfuseResourceManager( 334 public_key=public_key, 335 secret_key=secret_key, 336 base_url=self._base_url, 337 timeout=timeout, 338 environment=self._environment, 339 release=release, 340 flush_at=flush_at, 341 flush_interval=flush_interval, 342 httpx_client=httpx_client, 343 media_upload_thread_count=media_upload_thread_count, 344 sample_rate=sample_rate, 345 mask=mask, 346 tracing_enabled=self._tracing_enabled, 347 blocked_instrumentation_scopes=blocked_instrumentation_scopes, 348 should_export_span=should_export_span, 349 additional_headers=additional_headers, 350 tracer_provider=tracer_provider, 351 span_exporter=span_exporter, 352 ) 353 self._mask = self._resources.mask 354 355 self._otel_tracer = ( 356 self._resources.tracer 357 if self._tracing_enabled and self._resources.tracer is not None 358 else otel_trace_api.NoOpTracer() 359 ) 360 self.api = self._resources.api 361 self.async_api = self._resources.async_api
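As a sketch of the `should_export_span` replacement described by the deprecation warning above (the blocked scope names are illustrative):

```python
from langfuse import Langfuse
from langfuse.span_filter import is_default_export_span

blocked = {"sqlite", "requests"}  # illustrative instrumentation scopes to drop

langfuse = Langfuse(
    should_export_span=lambda span: (
        # Keep Langfuse's default export filtering behavior...
        is_default_export_span(span)
        # ...but additionally drop spans from blocked instrumentation scopes
        and (
            span.instrumentation_scope is None
            or span.instrumentation_scope.name not in blocked
        )
    ),
)
```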
510 def start_observation( 511 self, 512 *, 513 trace_context: Optional[TraceContext] = None, 514 name: str, 515 as_type: ObservationTypeLiteralNoEvent = "span", 516 input: Optional[Any] = None, 517 output: Optional[Any] = None, 518 metadata: Optional[Any] = None, 519 version: Optional[str] = None, 520 level: Optional[SpanLevel] = None, 521 status_message: Optional[str] = None, 522 completion_start_time: Optional[datetime] = None, 523 model: Optional[str] = None, 524 model_parameters: Optional[Dict[str, MapValue]] = None, 525 usage_details: Optional[Dict[str, int]] = None, 526 cost_details: Optional[Dict[str, float]] = None, 527 prompt: Optional[PromptClient] = None, 528 ) -> Union[ 529 LangfuseSpan, 530 LangfuseGeneration, 531 LangfuseAgent, 532 LangfuseTool, 533 LangfuseChain, 534 LangfuseRetriever, 535 LangfuseEvaluator, 536 LangfuseEmbedding, 537 LangfuseGuardrail, 538 ]: 539 """Create a new observation of the specified type. 540 541 This method creates a new observation but does not set it as the current span in the 542 context. To create and use an observation within a context, use start_as_current_observation(). 543 544 Args: 545 trace_context: Optional context for connecting to an existing trace 546 name: Name of the observation 547 as_type: Type of observation to create (defaults to "span") 548 input: Input data for the operation 549 output: Output data from the operation 550 metadata: Additional metadata to associate with the observation 551 version: Version identifier for the code or component 552 level: Importance level of the observation 553 status_message: Optional status message for the observation 554 completion_start_time: When the model started generating (for generation types) 555 model: Name/identifier of the AI model used (for generation types) 556 model_parameters: Parameters used for the model (for generation types) 557 usage_details: Token usage information (for generation types) 558 cost_details: Cost information (for generation types) 559 prompt: Associated prompt template (for generation types) 560 561 Returns: 562 An observation object of the appropriate type that must be ended with .end() 563 """ 564 if trace_context: 565 trace_id = trace_context.get("trace_id", None) 566 parent_span_id = trace_context.get("parent_span_id", None) 567 568 if trace_id: 569 remote_parent_span = self._create_remote_parent_span( 570 trace_id=trace_id, parent_span_id=parent_span_id 571 ) 572 573 with otel_trace_api.use_span( 574 cast(otel_trace_api.Span, remote_parent_span) 575 ): 576 otel_span = self._otel_tracer.start_span(name=name) 577 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 578 579 return self._create_observation_from_otel_span( 580 otel_span=otel_span, 581 as_type=as_type, 582 input=input, 583 output=output, 584 metadata=metadata, 585 version=version, 586 level=level, 587 status_message=status_message, 588 completion_start_time=completion_start_time, 589 model=model, 590 model_parameters=model_parameters, 591 usage_details=usage_details, 592 cost_details=cost_details, 593 prompt=prompt, 594 ) 595 596 otel_span = self._otel_tracer.start_span(name=name) 597 598 return self._create_observation_from_otel_span( 599 otel_span=otel_span, 600 as_type=as_type, 601 input=input, 602 output=output, 603 metadata=metadata, 604 version=version, 605 level=level, 606 status_message=status_message, 607 completion_start_time=completion_start_time, 608 model=model, 609 model_parameters=model_parameters, 610 usage_details=usage_details, 611 cost_details=cost_details, 612 
prompt=prompt, 613 )
Create a new observation of the specified type.
This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation
- status_message: Optional status message for the observation
- completion_start_time: When the model started generating (for generation types)
- model: Name/identifier of the AI model used (for generation types)
- model_parameters: Parameters used for the model (for generation types)
- usage_details: Token usage information (for generation types)
- cost_details: Cost information (for generation types)
- prompt: Associated prompt template (for generation types)
Returns:
An observation object of the appropriate type that must be ended with .end()
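Because `start_observation` does not enter a context manager, the returned observation must be ended explicitly. A minimal sketch (names and payloads are illustrative):

```python
# Create a generation observation without making it the current span in context
generation = langfuse.start_observation(
    name="summarize",
    as_type="generation",
    model="gpt-4",
    input={"text": "..."},
)
try:
    generation.update(output="A short summary.")
finally:
    generation.end()  # required: the observation is not ended automatically
```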
843 def start_as_current_observation( 844 self, 845 *, 846 trace_context: Optional[TraceContext] = None, 847 name: str, 848 as_type: ObservationTypeLiteralNoEvent = "span", 849 input: Optional[Any] = None, 850 output: Optional[Any] = None, 851 metadata: Optional[Any] = None, 852 version: Optional[str] = None, 853 level: Optional[SpanLevel] = None, 854 status_message: Optional[str] = None, 855 completion_start_time: Optional[datetime] = None, 856 model: Optional[str] = None, 857 model_parameters: Optional[Dict[str, MapValue]] = None, 858 usage_details: Optional[Dict[str, int]] = None, 859 cost_details: Optional[Dict[str, float]] = None, 860 prompt: Optional[PromptClient] = None, 861 end_on_exit: Optional[bool] = None, 862 ) -> Union[ 863 _AgnosticContextManager[LangfuseGeneration], 864 _AgnosticContextManager[LangfuseSpan], 865 _AgnosticContextManager[LangfuseAgent], 866 _AgnosticContextManager[LangfuseTool], 867 _AgnosticContextManager[LangfuseChain], 868 _AgnosticContextManager[LangfuseRetriever], 869 _AgnosticContextManager[LangfuseEvaluator], 870 _AgnosticContextManager[LangfuseEmbedding], 871 _AgnosticContextManager[LangfuseGuardrail], 872 ]: 873 """Create a new observation and set it as the current span in a context manager. 874 875 This method creates a new observation of the specified type and sets it as the 876 current span within a context manager. Use this method with a 'with' statement to 877 automatically handle the observation lifecycle within a code block. 878 879 The created observation will be the child of the current span in the context. 880 881 Args: 882 trace_context: Optional context for connecting to an existing trace 883 name: Name of the observation (e.g., function or operation name) 884 as_type: Type of observation to create (defaults to "span") 885 input: Input data for the operation (can be any JSON-serializable object) 886 output: Output data from the operation (can be any JSON-serializable object) 887 metadata: Additional metadata to associate with the observation 888 version: Version identifier for the code or component 889 level: Importance level of the observation (info, warning, error) 890 status_message: Optional status message for the observation 891 end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks. 892 893 The following parameters are available when as_type is: "generation" or "embedding". 
894 completion_start_time: When the model started generating the response 895 model: Name/identifier of the AI model used (e.g., "gpt-4") 896 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 897 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 898 cost_details: Cost information for the model call 899 prompt: Associated prompt template from Langfuse prompt management 900 901 Returns: 902 A context manager that yields the appropriate observation type based on as_type 903 904 Example: 905 ```python 906 # Create a span 907 with langfuse.start_as_current_observation(name="process-query", as_type="span") as span: 908 # Do work 909 result = process_data() 910 span.update(output=result) 911 912 # Create a child span automatically 913 with span.start_as_current_observation(name="sub-operation") as child_span: 914 # Do sub-operation work 915 child_span.update(output="sub-result") 916 917 # Create a tool observation 918 with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool: 919 # Do tool work 920 results = search_web(query) 921 tool.update(output=results) 922 923 # Create a generation observation 924 with langfuse.start_as_current_observation( 925 name="answer-generation", 926 as_type="generation", 927 model="gpt-4" 928 ) as generation: 929 # Generate answer 930 response = llm.generate(...) 931 generation.update(output=response) 932 ``` 933 """ 934 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 935 if trace_context: 936 trace_id = trace_context.get("trace_id", None) 937 parent_span_id = trace_context.get("parent_span_id", None) 938 939 if trace_id: 940 remote_parent_span = self._create_remote_parent_span( 941 trace_id=trace_id, parent_span_id=parent_span_id 942 ) 943 944 return cast( 945 Union[ 946 _AgnosticContextManager[LangfuseGeneration], 947 _AgnosticContextManager[LangfuseEmbedding], 948 ], 949 self._create_span_with_parent_context( 950 as_type=as_type, 951 name=name, 952 remote_parent_span=remote_parent_span, 953 parent=None, 954 end_on_exit=end_on_exit, 955 input=input, 956 output=output, 957 metadata=metadata, 958 version=version, 959 level=level, 960 status_message=status_message, 961 completion_start_time=completion_start_time, 962 model=model, 963 model_parameters=model_parameters, 964 usage_details=usage_details, 965 cost_details=cost_details, 966 prompt=prompt, 967 ), 968 ) 969 970 return cast( 971 Union[ 972 _AgnosticContextManager[LangfuseGeneration], 973 _AgnosticContextManager[LangfuseEmbedding], 974 ], 975 self._start_as_current_otel_span_with_processed_media( 976 as_type=as_type, 977 name=name, 978 end_on_exit=end_on_exit, 979 input=input, 980 output=output, 981 metadata=metadata, 982 version=version, 983 level=level, 984 status_message=status_message, 985 completion_start_time=completion_start_time, 986 model=model, 987 model_parameters=model_parameters, 988 usage_details=usage_details, 989 cost_details=cost_details, 990 prompt=prompt, 991 ), 992 ) 993 994 if as_type in get_observation_types_list(ObservationTypeSpanLike): 995 if trace_context: 996 trace_id = trace_context.get("trace_id", None) 997 parent_span_id = trace_context.get("parent_span_id", None) 998 999 if trace_id: 1000 remote_parent_span = self._create_remote_parent_span( 1001 trace_id=trace_id, parent_span_id=parent_span_id 1002 ) 1003 1004 return cast( 1005 Union[ 1006 _AgnosticContextManager[LangfuseSpan], 1007 _AgnosticContextManager[LangfuseAgent], 1008 _AgnosticContextManager[LangfuseTool], 
1009 _AgnosticContextManager[LangfuseChain], 1010 _AgnosticContextManager[LangfuseRetriever], 1011 _AgnosticContextManager[LangfuseEvaluator], 1012 _AgnosticContextManager[LangfuseGuardrail], 1013 ], 1014 self._create_span_with_parent_context( 1015 as_type=as_type, 1016 name=name, 1017 remote_parent_span=remote_parent_span, 1018 parent=None, 1019 end_on_exit=end_on_exit, 1020 input=input, 1021 output=output, 1022 metadata=metadata, 1023 version=version, 1024 level=level, 1025 status_message=status_message, 1026 ), 1027 ) 1028 1029 return cast( 1030 Union[ 1031 _AgnosticContextManager[LangfuseSpan], 1032 _AgnosticContextManager[LangfuseAgent], 1033 _AgnosticContextManager[LangfuseTool], 1034 _AgnosticContextManager[LangfuseChain], 1035 _AgnosticContextManager[LangfuseRetriever], 1036 _AgnosticContextManager[LangfuseEvaluator], 1037 _AgnosticContextManager[LangfuseGuardrail], 1038 ], 1039 self._start_as_current_otel_span_with_processed_media( 1040 as_type=as_type, 1041 name=name, 1042 end_on_exit=end_on_exit, 1043 input=input, 1044 output=output, 1045 metadata=metadata, 1046 version=version, 1047 level=level, 1048 status_message=status_message, 1049 ), 1050 ) 1051 1052 # This should never be reached since all valid types are handled above 1053 langfuse_logger.warning( 1054 f"Unknown observation type: {as_type}, falling back to span" 1055 ) 1056 return self._start_as_current_otel_span_with_processed_media( 1057 as_type="span", 1058 name=name, 1059 end_on_exit=end_on_exit, 1060 input=input, 1061 output=output, 1062 metadata=metadata, 1063 version=version, 1064 level=level, 1065 status_message=status_message, 1066 )
Create a new observation and set it as the current span in a context manager.
This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.
The created observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation (e.g., function or operation name)
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation (info, warning, error)
- status_message: Optional status message for the observation
- end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
The following parameters are available when as_type is "generation" or "embedding":
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Returns:
A context manager that yields the appropriate observation type based on as_type
Example:
```python
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_observation(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
```
1228 def update_current_generation( 1229 self, 1230 *, 1231 name: Optional[str] = None, 1232 input: Optional[Any] = None, 1233 output: Optional[Any] = None, 1234 metadata: Optional[Any] = None, 1235 version: Optional[str] = None, 1236 level: Optional[SpanLevel] = None, 1237 status_message: Optional[str] = None, 1238 completion_start_time: Optional[datetime] = None, 1239 model: Optional[str] = None, 1240 model_parameters: Optional[Dict[str, MapValue]] = None, 1241 usage_details: Optional[Dict[str, int]] = None, 1242 cost_details: Optional[Dict[str, float]] = None, 1243 prompt: Optional[PromptClient] = None, 1244 ) -> None: 1245 """Update the current active generation span with new information. 1246 1247 This method updates the current generation span in the active context with 1248 additional information. It's useful for adding output, usage stats, or other 1249 details that become available during or after model generation. 1250 1251 Args: 1252 name: The generation name 1253 input: Updated input data for the model 1254 output: Output from the model (e.g., completions) 1255 metadata: Additional metadata to associate with the generation 1256 version: Version identifier for the model or component 1257 level: Importance level of the generation (info, warning, error) 1258 status_message: Optional status message for the generation 1259 completion_start_time: When the model started generating the response 1260 model: Name/identifier of the AI model used (e.g., "gpt-4") 1261 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1262 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1263 cost_details: Cost information for the model call 1264 prompt: Associated prompt template from Langfuse prompt management 1265 1266 Example: 1267 ```python 1268 with langfuse.start_as_current_generation(name="answer-query") as generation: 1269 # Initial setup and API call 1270 response = llm.generate(...) 1271 1272 # Update with results that weren't available at creation time 1273 langfuse.update_current_generation( 1274 output=response.text, 1275 usage_details={ 1276 "prompt_tokens": response.usage.prompt_tokens, 1277 "completion_tokens": response.usage.completion_tokens 1278 } 1279 ) 1280 ``` 1281 """ 1282 if not self._tracing_enabled: 1283 langfuse_logger.debug( 1284 "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode." 1285 ) 1286 return 1287 1288 current_otel_span = self._get_current_otel_span() 1289 1290 if current_otel_span is not None: 1291 generation = LangfuseGeneration( 1292 otel_span=current_otel_span, langfuse_client=self 1293 ) 1294 1295 if name: 1296 current_otel_span.update_name(name) 1297 1298 generation.update( 1299 input=input, 1300 output=output, 1301 metadata=metadata, 1302 version=version, 1303 level=level, 1304 status_message=status_message, 1305 completion_start_time=completion_start_time, 1306 model=model, 1307 model_parameters=model_parameters, 1308 usage_details=usage_details, 1309 cost_details=cost_details, 1310 prompt=prompt, 1311 )
Update the current active generation span with new information.
This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.
Arguments:
- name: The generation name
- input: Updated input data for the model
- output: Output from the model (e.g., completions)
- metadata: Additional metadata to associate with the generation
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Example:
```python
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
```
1313 def update_current_span( 1314 self, 1315 *, 1316 name: Optional[str] = None, 1317 input: Optional[Any] = None, 1318 output: Optional[Any] = None, 1319 metadata: Optional[Any] = None, 1320 version: Optional[str] = None, 1321 level: Optional[SpanLevel] = None, 1322 status_message: Optional[str] = None, 1323 ) -> None: 1324 """Update the current active span with new information. 1325 1326 This method updates the current span in the active context with 1327 additional information. It's useful for adding outputs or metadata 1328 that become available during execution. 1329 1330 Args: 1331 name: The span name 1332 input: Updated input data for the operation 1333 output: Output data from the operation 1334 metadata: Additional metadata to associate with the span 1335 version: Version identifier for the code or component 1336 level: Importance level of the span (info, warning, error) 1337 status_message: Optional status message for the span 1338 1339 Example: 1340 ```python 1341 with langfuse.start_as_current_observation(name="process-data") as span: 1342 # Initial processing 1343 result = process_first_part() 1344 1345 # Update with intermediate results 1346 langfuse.update_current_span(metadata={"intermediate_result": result}) 1347 1348 # Continue processing 1349 final_result = process_second_part(result) 1350 1351 # Final update 1352 langfuse.update_current_span(output=final_result) 1353 ``` 1354 """ 1355 if not self._tracing_enabled: 1356 langfuse_logger.debug( 1357 "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode." 1358 ) 1359 return 1360 1361 current_otel_span = self._get_current_otel_span() 1362 1363 if current_otel_span is not None: 1364 span = LangfuseSpan( 1365 otel_span=current_otel_span, 1366 langfuse_client=self, 1367 environment=self._environment, 1368 release=self._release, 1369 ) 1370 1371 if name: 1372 current_otel_span.update_name(name) 1373 1374 span.update( 1375 input=input, 1376 output=output, 1377 metadata=metadata, 1378 version=version, 1379 level=level, 1380 status_message=status_message, 1381 )
Update the current active span with new information.
This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.
Arguments:
- name: The span name
- input: Updated input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Example:
```python
with langfuse.start_as_current_observation(name="process-data") as span:
    # Initial processing
    result = process_first_part()

    # Update with intermediate results
    langfuse.update_current_span(metadata={"intermediate_result": result})

    # Continue processing
    final_result = process_second_part(result)

    # Final update
    langfuse.update_current_span(output=final_result)
```
1383 @deprecated( 1384 "Trace-level input/output is deprecated. " 1385 "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. " 1386 "This method will be removed in a future major version." 1387 ) 1388 def set_current_trace_io( 1389 self, 1390 *, 1391 input: Optional[Any] = None, 1392 output: Optional[Any] = None, 1393 ) -> None: 1394 """Set trace-level input and output for the current span's trace. 1395 1396 .. deprecated:: 1397 This is a legacy method for backward compatibility with Langfuse platform 1398 features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge 1399 evaluators). It will be removed in a future major version. 1400 1401 For setting other trace attributes (user_id, session_id, metadata, tags, version), 1402 use :meth:`propagate_attributes` instead. 1403 1404 Args: 1405 input: Input data to associate with the trace. 1406 output: Output data to associate with the trace. 1407 """ 1408 if not self._tracing_enabled: 1409 langfuse_logger.debug( 1410 "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode." 1411 ) 1412 return 1413 1414 current_otel_span = self._get_current_otel_span() 1415 1416 if current_otel_span is not None and current_otel_span.is_recording(): 1417 existing_observation_type = current_otel_span.attributes.get( # type: ignore[attr-defined] 1418 LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span" 1419 ) 1420 # We need to preserve the class to keep the correct observation type 1421 span_class = self._get_span_class(existing_observation_type) 1422 span = span_class( 1423 otel_span=current_otel_span, 1424 langfuse_client=self, 1425 environment=self._environment, 1426 release=self._release, 1427 ) 1428 1429 span.set_trace_io( 1430 input=input, 1431 output=output, 1432 )
Set trace-level input and output for the current span's trace.
Deprecated: This is a legacy method for backward compatibility with Langfuse platform features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future major version.
For setting other trace attributes (user_id, session_id, metadata, tags, version), use propagate_attributes() instead.
Arguments:
- input: Input data to associate with the trace.
- output: Output data to associate with the trace.
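A minimal sketch of the deprecated call inside an active span context (the span name and payloads are illustrative):

```python
with langfuse.start_as_current_observation(name="handle-request") as span:
    answer = compute_answer()  # illustrative application code
    # Attach trace-level input/output for legacy platform features
    # (e.g., legacy LLM-as-a-judge evaluators)
    langfuse.set_current_trace_io(
        input={"question": "What is the answer?"},
        output={"answer": answer},
    )
```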
1434 def set_current_trace_as_public(self) -> None: 1435 """Make the current trace publicly accessible via its URL. 1436 1437 When a trace is published, anyone with the trace link can view the full trace 1438 without needing to be logged in to Langfuse. This action cannot be undone 1439 programmatically - once published, the entire trace becomes public. 1440 1441 This is a convenience method that publishes the trace from the currently 1442 active span context. Use this when you want to make a trace public from 1443 within a traced function without needing direct access to the span object. 1444 """ 1445 if not self._tracing_enabled: 1446 langfuse_logger.debug( 1447 "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode." 1448 ) 1449 return 1450 1451 current_otel_span = self._get_current_otel_span() 1452 1453 if current_otel_span is not None and current_otel_span.is_recording(): 1454 existing_observation_type = current_otel_span.attributes.get( # type: ignore[attr-defined] 1455 LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span" 1456 ) 1457 # We need to preserve the class to keep the correct observation type 1458 span_class = self._get_span_class(existing_observation_type) 1459 span = span_class( 1460 otel_span=current_otel_span, 1461 langfuse_client=self, 1462 environment=self._environment, 1463 ) 1464 1465 span.set_trace_as_public()
Make the current trace publicly accessible via its URL.
When a trace is published, anyone with the trace link can view the full trace without needing to be logged in to Langfuse. This action cannot be undone programmatically - once published, the entire trace becomes public.
This is a convenience method that publishes the trace from the currently active span context. Use this when you want to make a trace public from within a traced function without needing direct access to the span object.
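For example, to publish the enclosing trace from within a traced block (a sketch; the span name is illustrative):

```python
with langfuse.start_as_current_observation(name="share-demo") as span:
    # Makes the whole trace viewable via its URL without login.
    # Note: this cannot be undone programmatically.
    langfuse.set_current_trace_as_public()
```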
1467 def create_event( 1468 self, 1469 *, 1470 trace_context: Optional[TraceContext] = None, 1471 name: str, 1472 input: Optional[Any] = None, 1473 output: Optional[Any] = None, 1474 metadata: Optional[Any] = None, 1475 version: Optional[str] = None, 1476 level: Optional[SpanLevel] = None, 1477 status_message: Optional[str] = None, 1478 ) -> LangfuseEvent: 1479 """Create a new Langfuse observation of type 'EVENT'. 1480 1481 The created Langfuse Event observation will be the child of the current span in the context. 1482 1483 Args: 1484 trace_context: Optional context for connecting to an existing trace 1485 name: Name of the span (e.g., function or operation name) 1486 input: Input data for the operation (can be any JSON-serializable object) 1487 output: Output data from the operation (can be any JSON-serializable object) 1488 metadata: Additional metadata to associate with the span 1489 version: Version identifier for the code or component 1490 level: Importance level of the span (info, warning, error) 1491 status_message: Optional status message for the span 1492 1493 Returns: 1494 The Langfuse Event object 1495 1496 Example: 1497 ```python 1498 event = langfuse.create_event(name="process-event") 1499 ``` 1500 """ 1501 timestamp = time_ns() 1502 1503 if trace_context: 1504 trace_id = trace_context.get("trace_id", None) 1505 parent_span_id = trace_context.get("parent_span_id", None) 1506 1507 if trace_id: 1508 remote_parent_span = self._create_remote_parent_span( 1509 trace_id=trace_id, parent_span_id=parent_span_id 1510 ) 1511 1512 with otel_trace_api.use_span( 1513 cast(otel_trace_api.Span, remote_parent_span) 1514 ): 1515 otel_span = self._otel_tracer.start_span( 1516 name=name, start_time=timestamp 1517 ) 1518 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 1519 1520 return cast( 1521 LangfuseEvent, 1522 LangfuseEvent( 1523 otel_span=otel_span, 1524 langfuse_client=self, 1525 environment=self._environment, 1526 release=self._release, 1527 input=input, 1528 output=output, 1529 metadata=metadata, 1530 version=version, 1531 level=level, 1532 status_message=status_message, 1533 ).end(end_time=timestamp), 1534 ) 1535 1536 otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp) 1537 1538 return cast( 1539 LangfuseEvent, 1540 LangfuseEvent( 1541 otel_span=otel_span, 1542 langfuse_client=self, 1543 environment=self._environment, 1544 release=self._release, 1545 input=input, 1546 output=output, 1547 metadata=metadata, 1548 version=version, 1549 level=level, 1550 status_message=status_message, 1551 ).end(end_time=timestamp), 1552 )
Create a new Langfuse observation of type 'EVENT'.
The created Langfuse Event observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the span (e.g., function or operation name)
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Returns:
The Langfuse Event object
Example:
```python
event = langfuse.create_event(name="process-event")
```
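Events accept the same descriptive fields as spans, so a slightly fuller sketch (the payloads are illustrative) is:

```python
event = langfuse.create_event(
    name="cache-hit",
    input={"key": "user:123"},        # illustrative payload
    metadata={"store": "redis"},      # illustrative metadata
    status_message="served from cache",
)
```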
1641 @staticmethod 1642 def create_trace_id(*, seed: Optional[str] = None) -> str: 1643 """Create a unique trace ID for use with Langfuse. 1644 1645 This method generates a unique trace ID for use with various Langfuse APIs. 1646 It can either generate a random ID or create a deterministic ID based on 1647 a seed string. 1648 1649 Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. 1650 This method ensures the generated ID meets this requirement. If you need to 1651 correlate an external ID with a Langfuse trace ID, use the external ID as the 1652 seed to get a valid, deterministic Langfuse trace ID. 1653 1654 Args: 1655 seed: Optional string to use as a seed for deterministic ID generation. 1656 If provided, the same seed will always produce the same ID. 1657 If not provided, a random ID will be generated. 1658 1659 Returns: 1660 A 32-character lowercase hexadecimal string representing the Langfuse trace ID. 1661 1662 Example: 1663 ```python 1664 # Generate a random trace ID 1665 trace_id = langfuse.create_trace_id() 1666 1667 # Generate a deterministic ID based on a seed 1668 session_trace_id = langfuse.create_trace_id(seed="session-456") 1669 1670 # Correlate an external ID with a Langfuse trace ID 1671 external_id = "external-system-123456" 1672 correlated_trace_id = langfuse.create_trace_id(seed=external_id) 1673 1674 # Use the ID with trace context 1675 with langfuse.start_as_current_observation( 1676 name="process-request", 1677 trace_context={"trace_id": trace_id} 1678 ) as span: 1679 # Operation will be part of the specific trace 1680 pass 1681 ``` 1682 """ 1683 if not seed: 1684 trace_id_int = RandomIdGenerator().generate_trace_id() 1685 1686 return Langfuse._format_otel_trace_id(trace_id_int) 1687 1688 return sha256(seed.encode("utf-8")).digest()[:16].hex()
Create a unique trace ID for use with Langfuse.
This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.
Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.
Arguments:
- seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:
A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
Example:
```python
# Generate a random trace ID
trace_id = langfuse.create_trace_id()

# Generate a deterministic ID based on a seed
session_trace_id = langfuse.create_trace_id(seed="session-456")

# Correlate an external ID with a Langfuse trace ID
external_id = "external-system-123456"
correlated_trace_id = langfuse.create_trace_id(seed=external_id)

# Use the ID with trace context
with langfuse.start_as_current_observation(
    name="process-request",
    trace_context={"trace_id": trace_id}
) as span:
    # Operation will be part of the specific trace
    pass
```
1766 def create_score( 1767 self, 1768 *, 1769 name: str, 1770 value: Union[float, str], 1771 session_id: Optional[str] = None, 1772 dataset_run_id: Optional[str] = None, 1773 trace_id: Optional[str] = None, 1774 observation_id: Optional[str] = None, 1775 score_id: Optional[str] = None, 1776 data_type: Optional[ScoreDataType] = None, 1777 comment: Optional[str] = None, 1778 config_id: Optional[str] = None, 1779 metadata: Optional[Any] = None, 1780 timestamp: Optional[datetime] = None, 1781 ) -> None: 1782 """Create a score for a specific trace or observation. 1783 1784 This method creates a score for evaluating a Langfuse trace or observation. Scores can be 1785 used to track quality metrics, user feedback, or automated evaluations. 1786 1787 Args: 1788 name: Name of the score (e.g., "relevance", "accuracy") 1789 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT) 1790 session_id: ID of the Langfuse session to associate the score with 1791 dataset_run_id: ID of the Langfuse dataset run to associate the score with 1792 trace_id: ID of the Langfuse trace to associate the score with 1793 observation_id: Optional ID of the specific observation to score. Trace ID must be provided too. 1794 score_id: Optional custom ID for the score (auto-generated if not provided) 1795 data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT) 1796 comment: Optional comment or explanation for the score 1797 config_id: Optional ID of a score config defined in Langfuse 1798 metadata: Optional metadata to be attached to the score 1799 timestamp: Optional timestamp for the score (defaults to current UTC time) 1800 1801 Example: 1802 ```python 1803 # Create a numeric score for accuracy 1804 langfuse.create_score( 1805 name="accuracy", 1806 value=0.92, 1807 trace_id="abcdef1234567890abcdef1234567890", 1808 data_type="NUMERIC", 1809 comment="High accuracy with minor irrelevant details" 1810 ) 1811 1812 # Create a categorical score for sentiment 1813 langfuse.create_score( 1814 name="sentiment", 1815 value="positive", 1816 trace_id="abcdef1234567890abcdef1234567890", 1817 observation_id="abcdef1234567890", 1818 data_type="CATEGORICAL" 1819 ) 1820 ``` 1821 """ 1822 if not self._tracing_enabled: 1823 return 1824 1825 score_id = score_id or self._create_observation_id() 1826 1827 try: 1828 new_body = ScoreBody( 1829 id=score_id, 1830 sessionId=session_id, 1831 datasetRunId=dataset_run_id, 1832 traceId=trace_id, 1833 observationId=observation_id, 1834 name=name, 1835 value=value, 1836 dataType=data_type, # type: ignore 1837 comment=comment, 1838 configId=config_id, 1839 environment=self._environment, 1840 metadata=metadata, 1841 ) 1842 1843 event = { 1844 "id": self.create_trace_id(), 1845 "type": "score-create", 1846 "timestamp": timestamp or _get_timestamp(), 1847 "body": new_body, 1848 } 1849 1850 if self._resources is not None: 1851 # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar 1852 force_sample = ( 1853 not self._is_valid_trace_id(trace_id) if trace_id else True 1854 ) 1855 1856 self._resources.add_score_task( 1857 event, 1858 force_sample=force_sample, 1859 ) 1860 1861 except Exception as e: 1862 langfuse_logger.exception( 1863 f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}" 1864 )
Create a score for a specific trace or observation.
This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
- session_id: ID of the Langfuse session to associate the score with
- dataset_run_id: ID of the Langfuse dataset run to associate the score with
- trace_id: ID of the Langfuse trace to associate the score with
- observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
- timestamp: Optional timestamp for the score (defaults to current UTC time)
Example:
```python
# Create a numeric score for accuracy
langfuse.create_score(
    name="accuracy",
    value=0.92,
    trace_id="abcdef1234567890abcdef1234567890",
    data_type="NUMERIC",
    comment="High accuracy with minor irrelevant details"
)

# Create a categorical score for sentiment
langfuse.create_score(
    name="sentiment",
    value="positive",
    trace_id="abcdef1234567890abcdef1234567890",
    observation_id="abcdef1234567890",
    data_type="CATEGORICAL"
)
```
1925 def score_current_span( 1926 self, 1927 *, 1928 name: str, 1929 value: Union[float, str], 1930 score_id: Optional[str] = None, 1931 data_type: Optional[ScoreDataType] = None, 1932 comment: Optional[str] = None, 1933 config_id: Optional[str] = None, 1934 metadata: Optional[Any] = None, 1935 ) -> None: 1936 """Create a score for the current active span. 1937 1938 This method scores the currently active span in the context. It's a convenient 1939 way to score the current operation without needing to know its trace and span IDs. 1940 1941 Args: 1942 name: Name of the score (e.g., "relevance", "accuracy") 1943 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT) 1944 score_id: Optional custom ID for the score (auto-generated if not provided) 1945 data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT) 1946 comment: Optional comment or explanation for the score 1947 config_id: Optional ID of a score config defined in Langfuse 1948 metadata: Optional metadata to be attached to the score 1949 1950 Example: 1951 ```python 1952 with langfuse.start_as_current_generation(name="answer-query") as generation: 1953 # Generate answer 1954 response = generate_answer(...) 1955 generation.update(output=response) 1956 1957 # Score the generation 1958 langfuse.score_current_span( 1959 name="relevance", 1960 value=0.85, 1961 data_type="NUMERIC", 1962 comment="Mostly relevant but contains some tangential information", 1963 metadata={"model": "gpt-4", "prompt_version": "v2"} 1964 ) 1965 ``` 1966 """ 1967 current_span = self._get_current_otel_span() 1968 1969 if current_span is not None: 1970 trace_id = self._get_otel_trace_id(current_span) 1971 observation_id = self._get_otel_span_id(current_span) 1972 1973 langfuse_logger.info( 1974 f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}" 1975 ) 1976 1977 self.create_score( 1978 trace_id=trace_id, 1979 observation_id=observation_id, 1980 name=name, 1981 value=cast(str, value), 1982 score_id=score_id, 1983 data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type), 1984 comment=comment, 1985 config_id=config_id, 1986 metadata=metadata, 1987 )
Create a score for the current active span.
This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
Example:
```python
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Generate answer
    response = generate_answer(...)
    generation.update(output=response)

    # Score the generation
    langfuse.score_current_span(
        name="relevance",
        value=0.85,
        data_type="NUMERIC",
        comment="Mostly relevant but contains some tangential information",
        metadata={"model": "gpt-4", "prompt_version": "v2"}
    )
```
2015 def score_current_trace( 2016 self, 2017 *, 2018 name: str, 2019 value: Union[float, str], 2020 score_id: Optional[str] = None, 2021 data_type: Optional[ScoreDataType] = None, 2022 comment: Optional[str] = None, 2023 config_id: Optional[str] = None, 2024 metadata: Optional[Any] = None, 2025 ) -> None: 2026 """Create a score for the current trace. 2027 2028 This method scores the trace of the currently active span. Unlike score_current_span, 2029 this method associates the score with the entire trace rather than a specific span. 2030 It's useful for scoring overall performance or quality of the entire operation. 2031 2032 Args: 2033 name: Name of the score (e.g., "user_satisfaction", "overall_quality") 2034 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT) 2035 score_id: Optional custom ID for the score (auto-generated if not provided) 2036 data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT) 2037 comment: Optional comment or explanation for the score 2038 config_id: Optional ID of a score config defined in Langfuse 2039 metadata: Optional metadata to be attached to the score 2040 2041 Example: 2042 ```python 2043 with langfuse.start_as_current_observation(name="process-user-request") as span: 2044 # Process request 2045 result = process_complete_request() 2046 span.update(output=result) 2047 2048 # Score the overall trace 2049 langfuse.score_current_trace( 2050 name="overall_quality", 2051 value=0.95, 2052 data_type="NUMERIC", 2053 comment="High quality end-to-end response", 2054 metadata={"evaluator": "gpt-4", "criteria": "comprehensive"} 2055 ) 2056 ``` 2057 """ 2058 current_span = self._get_current_otel_span() 2059 2060 if current_span is not None: 2061 trace_id = self._get_otel_trace_id(current_span) 2062 2063 langfuse_logger.info( 2064 f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}" 2065 ) 2066 2067 self.create_score( 2068 trace_id=trace_id, 2069 name=name, 2070 value=cast(str, value), 2071 score_id=score_id, 2072 data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type), 2073 comment=comment, 2074 config_id=config_id, 2075 metadata=metadata, 2076 )
Create a score for the current trace.
This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation.
Arguments:
- name: Name of the score (e.g., "user_satisfaction", "overall_quality")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_observation(name="process-user-request") as span: # Process request result = process_complete_request() span.update(output=result) # Score the overall trace langfuse.score_current_trace( name="overall_quality", value=0.95, data_type="NUMERIC", comment="High quality end-to-end response", metadata={"evaluator": "gpt-4", "criteria": "comprehensive"} )
2078 def flush(self) -> None: 2079 """Force flush all pending spans and events to the Langfuse API. 2080 2081 This method manually flushes any pending spans, scores, and other events to the 2082 Langfuse API. It's useful in scenarios where you want to ensure all data is sent 2083 before proceeding, without waiting for the automatic flush interval. 2084 2085 Example: 2086 ```python 2087 # Record some spans and scores 2088 with langfuse.start_as_current_observation(name="operation") as span: 2089 # Do work... 2090 pass 2091 2092 # Ensure all data is sent to Langfuse before proceeding 2093 langfuse.flush() 2094 2095 # Continue with other work 2096 ``` 2097 """ 2098 if self._resources is not None: 2099 self._resources.flush()
Force flush all pending spans and events to the Langfuse API.
This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.
Example:
```python
# Record some spans and scores
with langfuse.start_as_current_observation(name="operation") as span:
    # Do work...
    pass

# Ensure all data is sent to Langfuse before proceeding
langfuse.flush()

# Continue with other work
```
2101 def shutdown(self) -> None: 2102 """Shut down the Langfuse client and flush all pending data. 2103 2104 This method cleanly shuts down the Langfuse client, ensuring all pending data 2105 is flushed to the API and all background threads are properly terminated. 2106 2107 It's important to call this method when your application is shutting down to 2108 prevent data loss and resource leaks. For most applications, using the client 2109 as a context manager or relying on the automatic shutdown via atexit is sufficient. 2110 2111 Example: 2112 ```python 2113 # Initialize Langfuse 2114 langfuse = Langfuse(public_key="...", secret_key="...") 2115 2116 # Use Langfuse throughout your application 2117 # ... 2118 2119 # When application is shutting down 2120 langfuse.shutdown() 2121 ``` 2122 """ 2123 if self._resources is not None: 2124 self._resources.shutdown()
Shut down the Langfuse client and flush all pending data.
This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.
It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.
Example:
```python
# Initialize Langfuse
langfuse = Langfuse(public_key="...", secret_key="...")

# Use Langfuse throughout your application
# ...

# When application is shutting down
langfuse.shutdown()
```
2126 def get_current_trace_id(self) -> Optional[str]: 2127 """Get the trace ID of the current active span. 2128 2129 This method retrieves the trace ID from the currently active span in the context. 2130 It can be used to get the trace ID for referencing in logs, external systems, 2131 or for creating related operations. 2132 2133 Returns: 2134 The current trace ID as a 32-character lowercase hexadecimal string, 2135 or None if there is no active span. 2136 2137 Example: 2138 ```python 2139 with langfuse.start_as_current_observation(name="process-request") as span: 2140 # Get the current trace ID for reference 2141 trace_id = langfuse.get_current_trace_id() 2142 2143 # Use it for external correlation 2144 log.info(f"Processing request with trace_id: {trace_id}") 2145 2146 # Or pass to another system 2147 external_system.process(data, trace_id=trace_id) 2148 ``` 2149 """ 2150 if not self._tracing_enabled: 2151 langfuse_logger.debug( 2152 "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode." 2153 ) 2154 return None 2155 2156 current_otel_span = self._get_current_otel_span() 2157 2158 return self._get_otel_trace_id(current_otel_span) if current_otel_span else None
Get the trace ID of the current active span.
This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.
Returns:
The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.
Example:
with langfuse.start_as_current_observation(name="process-request") as span: # Get the current trace ID for reference trace_id = langfuse.get_current_trace_id() # Use it for external correlation log.info(f"Processing request with trace_id: {trace_id}") # Or pass to another system external_system.process(data, trace_id=trace_id)
2160 def get_current_observation_id(self) -> Optional[str]: 2161 """Get the observation ID (span ID) of the current active span. 2162 2163 This method retrieves the observation ID from the currently active span in the context. 2164 It can be used to get the observation ID for referencing in logs, external systems, 2165 or for creating scores or other related operations. 2166 2167 Returns: 2168 The current observation ID as a 16-character lowercase hexadecimal string, 2169 or None if there is no active span. 2170 2171 Example: 2172 ```python 2173 with langfuse.start_as_current_observation(name="process-user-query") as span: 2174 # Get the current observation ID 2175 observation_id = langfuse.get_current_observation_id() 2176 2177 # Store it for later reference 2178 cache.set(f"query_{query_id}_observation", observation_id) 2179 2180 # Process the query... 2181 ``` 2182 """ 2183 if not self._tracing_enabled: 2184 langfuse_logger.debug( 2185 "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode." 2186 ) 2187 return None 2188 2189 current_otel_span = self._get_current_otel_span() 2190 2191 return self._get_otel_span_id(current_otel_span) if current_otel_span else None
Get the observation ID (span ID) of the current active span.
This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.
Returns:
The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.
Example:
with langfuse.start_as_current_observation(name="process-user-query") as span: # Get the current observation ID observation_id = langfuse.get_current_observation_id() # Store it for later reference cache.set(f"query_{query_id}_observation", observation_id) # Process the query...
2204 def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]: 2205 """Get the URL to view a trace in the Langfuse UI. 2206 2207 This method generates a URL that links directly to a trace in the Langfuse UI. 2208 It's useful for providing links in logs, notifications, or debugging tools. 2209 2210 Args: 2211 trace_id: Optional trace ID to generate a URL for. If not provided, 2212 the trace ID of the current active span will be used. 2213 2214 Returns: 2215 A URL string pointing to the trace in the Langfuse UI, 2216 or None if the project ID couldn't be retrieved or no trace ID is available. 2217 2218 Example: 2219 ```python 2220 # Get URL for the current trace 2221 with langfuse.start_as_current_observation(name="process-request") as span: 2222 trace_url = langfuse.get_trace_url() 2223 log.info(f"Processing trace: {trace_url}") 2224 2225 # Get URL for a specific trace 2226 specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef") 2227 send_notification(f"Review needed for trace: {specific_trace_url}") 2228 ``` 2229 """ 2230 final_trace_id = trace_id or self.get_current_trace_id() 2231 if not final_trace_id: 2232 return None 2233 2234 project_id = self._get_project_id() 2235 2236 return ( 2237 f"{self._base_url}/project/{project_id}/traces/{final_trace_id}" 2238 if project_id and final_trace_id 2239 else None 2240 )
Get the URL to view a trace in the Langfuse UI.
This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.
Arguments:
- trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:
A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.
Example:
```python
# Get URL for the current trace
with langfuse.start_as_current_observation(name="process-request") as span:
    trace_url = langfuse.get_trace_url()
    log.info(f"Processing trace: {trace_url}")

# Get URL for a specific trace
specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
send_notification(f"Review needed for trace: {specific_trace_url}")
```
2242 def get_dataset( 2243 self, 2244 name: str, 2245 *, 2246 fetch_items_page_size: Optional[int] = 50, 2247 version: Optional[datetime] = None, 2248 ) -> "DatasetClient": 2249 """Fetch a dataset by its name. 2250 2251 Args: 2252 name (str): The name of the dataset to fetch. 2253 fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50. 2254 version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC). 2255 If provided, returns the state of items at the specified UTC timestamp. 2256 If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC. 2257 2258 Returns: 2259 DatasetClient: The dataset with the given name. 2260 """ 2261 try: 2262 langfuse_logger.debug(f"Getting datasets {name}") 2263 dataset = self.api.datasets.get(dataset_name=self._url_encode(name)) 2264 2265 dataset_items = [] 2266 page = 1 2267 2268 while True: 2269 new_items = self.api.dataset_items.list( 2270 dataset_name=self._url_encode(name, is_url_param=True), 2271 page=page, 2272 limit=fetch_items_page_size, 2273 version=version, 2274 ) 2275 dataset_items.extend(new_items.data) 2276 2277 if new_items.meta.total_pages <= page: 2278 break 2279 2280 page += 1 2281 2282 return DatasetClient( 2283 dataset=dataset, 2284 items=dataset_items, 2285 version=version, 2286 langfuse_client=self, 2287 ) 2288 2289 except Error as e: 2290 handle_fern_exception(e) 2291 raise e
Fetch a dataset by its name.
Arguments:
- name (str): The name of the dataset to fetch.
- fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
- version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC). If provided, returns the state of items at the specified UTC timestamp. If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
Returns:
DatasetClient: The dataset with the given name.
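Example (a minimal sketch; the dataset name and timestamp are illustrative):
```python
from datetime import datetime, timezone

from langfuse import Langfuse

langfuse = Langfuse()

# Fetch the latest version of the dataset and iterate its items
dataset = langfuse.get_dataset("my-eval-dataset")
for item in dataset.items:
    print(item.input, item.expected_output)

# Or pin the items to a point in time (must be a timezone-aware UTC datetime)
snapshot = langfuse.get_dataset(
    "my-eval-dataset",
    version=datetime(2024, 6, 1, tzinfo=timezone.utc),
)
```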
2293 def get_dataset_run( 2294 self, *, dataset_name: str, run_name: str 2295 ) -> DatasetRunWithItems: 2296 """Fetch a dataset run by dataset name and run name. 2297 2298 Args: 2299 dataset_name (str): The name of the dataset. 2300 run_name (str): The name of the run. 2301 2302 Returns: 2303 DatasetRunWithItems: The dataset run with its items. 2304 """ 2305 try: 2306 return cast( 2307 DatasetRunWithItems, 2308 self.api.datasets.get_run( 2309 dataset_name=self._url_encode(dataset_name), 2310 run_name=self._url_encode(run_name), 2311 request_options=None, 2312 ), 2313 ) 2314 except Error as e: 2315 handle_fern_exception(e) 2316 raise e
Fetch a dataset run by dataset name and run name.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DatasetRunWithItems: The dataset run with its items.
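Example (a hedged sketch; the dataset and run names are illustrative):
```python
# Fetch a run together with its run items
run = langfuse.get_dataset_run(
    dataset_name="my-eval-dataset",
    run_name="baseline-2024-06-01",
)
print(run)  # DatasetRunWithItems
```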
2318 def get_dataset_runs( 2319 self, 2320 *, 2321 dataset_name: str, 2322 page: Optional[int] = None, 2323 limit: Optional[int] = None, 2324 ) -> PaginatedDatasetRuns: 2325 """Fetch all runs for a dataset. 2326 2327 Args: 2328 dataset_name (str): The name of the dataset. 2329 page (Optional[int]): Page number, starts at 1. 2330 limit (Optional[int]): Limit of items per page. 2331 2332 Returns: 2333 PaginatedDatasetRuns: Paginated list of dataset runs. 2334 """ 2335 try: 2336 return cast( 2337 PaginatedDatasetRuns, 2338 self.api.datasets.get_runs( 2339 dataset_name=self._url_encode(dataset_name), 2340 page=page, 2341 limit=limit, 2342 request_options=None, 2343 ), 2344 ) 2345 except Error as e: 2346 handle_fern_exception(e) 2347 raise e
Fetch all runs for a dataset.
Arguments:
- dataset_name (str): The name of the dataset.
- page (Optional[int]): Page number, starts at 1.
- limit (Optional[int]): Limit of items per page.
Returns:
PaginatedDatasetRuns: Paginated list of dataset runs.
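Example (a sketch assuming the paginated response exposes `data` and `meta.total_pages`, like the dataset item listing used by `get_dataset`; names are illustrative):
```python
# Page through all runs of a dataset
page = 1
while True:
    runs = langfuse.get_dataset_runs(
        dataset_name="my-eval-dataset", page=page, limit=50
    )
    for run in runs.data:
        print(run.name)
    if page >= runs.meta.total_pages:
        break
    page += 1
```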
2349 def delete_dataset_run( 2350 self, *, dataset_name: str, run_name: str 2351 ) -> DeleteDatasetRunResponse: 2352 """Delete a dataset run and all its run items. This action is irreversible. 2353 2354 Args: 2355 dataset_name (str): The name of the dataset. 2356 run_name (str): The name of the run. 2357 2358 Returns: 2359 DeleteDatasetRunResponse: Confirmation of deletion. 2360 """ 2361 try: 2362 return cast( 2363 DeleteDatasetRunResponse, 2364 self.api.datasets.delete_run( 2365 dataset_name=self._url_encode(dataset_name), 2366 run_name=self._url_encode(run_name), 2367 request_options=None, 2368 ), 2369 ) 2370 except Error as e: 2371 handle_fern_exception(e) 2372 raise e
Delete a dataset run and all its run items. This action is irreversible.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DeleteDatasetRunResponse: Confirmation of deletion.
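Example (illustrative names; remember the deletion cannot be undone):
```python
# Irreversibly delete a run and all of its run items
response = langfuse.delete_dataset_run(
    dataset_name="my-eval-dataset",
    run_name="baseline-2024-06-01",
)
print(response)  # DeleteDatasetRunResponse confirming the deletion
```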
2374 def run_experiment( 2375 self, 2376 *, 2377 name: str, 2378 run_name: Optional[str] = None, 2379 description: Optional[str] = None, 2380 data: ExperimentData, 2381 task: TaskFunction, 2382 evaluators: List[EvaluatorFunction] = [], 2383 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 2384 run_evaluators: List[RunEvaluatorFunction] = [], 2385 max_concurrency: int = 50, 2386 metadata: Optional[Dict[str, str]] = None, 2387 _dataset_version: Optional[datetime] = None, 2388 ) -> ExperimentResult: 2389 """Run an experiment on a dataset with automatic tracing and evaluation. 2390 2391 This method executes a task function on each item in the provided dataset, 2392 automatically traces all executions with Langfuse for observability, runs 2393 item-level and run-level evaluators on the outputs, and returns comprehensive 2394 results with evaluation metrics. 2395 2396 The experiment system provides: 2397 - Automatic tracing of all task executions 2398 - Concurrent processing with configurable limits 2399 - Comprehensive error handling that isolates failures 2400 - Integration with Langfuse datasets for experiment tracking 2401 - Flexible evaluation framework supporting both sync and async evaluators 2402 2403 Args: 2404 name: Human-readable name for the experiment. Used for identification 2405 in the Langfuse UI. 2406 run_name: Optional exact name for the experiment run. If provided, this will be 2407 used as the exact dataset run name if the `data` contains Langfuse dataset items. 2408 If not provided, this will default to the experiment name appended with an ISO timestamp. 2409 description: Optional description explaining the experiment's purpose, 2410 methodology, or expected outcomes. 2411 data: Array of data items to process. Can be either: 2412 - List of dict-like items with 'input', 'expected_output', 'metadata' keys 2413 - List of Langfuse DatasetItem objects from dataset.items 2414 task: Function that processes each data item and returns output. 2415 Must accept 'item' as keyword argument and can return sync or async results. 2416 The task function signature should be: task(*, item, **kwargs) -> Any 2417 evaluators: List of functions to evaluate each item's output individually. 2418 Each evaluator receives input, output, expected_output, and metadata. 2419 Can return single Evaluation dict or list of Evaluation dicts. 2420 composite_evaluator: Optional function that creates composite scores from item-level evaluations. 2421 Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) 2422 plus the list of evaluations from item-level evaluators. Useful for weighted averages, 2423 pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics. 2424 run_evaluators: List of functions to evaluate the entire experiment run. 2425 Each run evaluator receives all item_results and can compute aggregate metrics. 2426 Useful for calculating averages, distributions, or cross-item comparisons. 2427 max_concurrency: Maximum number of concurrent task executions (default: 50). 2428 Controls the number of items processed simultaneously. Adjust based on 2429 API rate limits and system resources. 2430 metadata: Optional metadata dictionary to attach to all experiment traces. 2431 This metadata will be included in every trace created during the experiment. 2432 If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too. 
2433 2434 Returns: 2435 ExperimentResult containing: 2436 - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset. 2437 - item_results: List of results for each processed item with outputs and evaluations 2438 - run_evaluations: List of aggregate evaluation results for the entire run 2439 - experiment_id: Stable identifier for the experiment run across all items 2440 - dataset_run_id: ID of the dataset run (if using Langfuse datasets) 2441 - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable) 2442 2443 Raises: 2444 ValueError: If required parameters are missing or invalid 2445 Exception: If experiment setup fails (individual item failures are handled gracefully) 2446 2447 Examples: 2448 Basic experiment with local data: 2449 ```python 2450 def summarize_text(*, item, **kwargs): 2451 return f"Summary: {item['input'][:50]}..." 2452 2453 def length_evaluator(*, input, output, expected_output=None, **kwargs): 2454 return { 2455 "name": "output_length", 2456 "value": len(output), 2457 "comment": f"Output contains {len(output)} characters" 2458 } 2459 2460 result = langfuse.run_experiment( 2461 name="Text Summarization Test", 2462 description="Evaluate summarization quality and length", 2463 data=[ 2464 {"input": "Long article text...", "expected_output": "Expected summary"}, 2465 {"input": "Another article...", "expected_output": "Another summary"} 2466 ], 2467 task=summarize_text, 2468 evaluators=[length_evaluator] 2469 ) 2470 2471 print(f"Processed {len(result.item_results)} items") 2472 for item_result in result.item_results: 2473 print(f"Input: {item_result.item['input']}") 2474 print(f"Output: {item_result.output}") 2475 print(f"Evaluations: {item_result.evaluations}") 2476 ``` 2477 2478 Advanced experiment with async task and multiple evaluators: 2479 ```python 2480 async def llm_task(*, item, **kwargs): 2481 # Simulate async LLM call 2482 response = await openai_client.chat.completions.create( 2483 model="gpt-4", 2484 messages=[{"role": "user", "content": item["input"]}] 2485 ) 2486 return response.choices[0].message.content 2487 2488 def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): 2489 if expected_output and expected_output.lower() in output.lower(): 2490 return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} 2491 return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} 2492 2493 def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): 2494 # Simulate toxicity check 2495 toxicity_score = check_toxicity(output) # Your toxicity checker 2496 return { 2497 "name": "toxicity", 2498 "value": toxicity_score, 2499 "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" 2500 } 2501 2502 def average_accuracy(*, item_results, **kwargs): 2503 accuracies = [ 2504 eval.value for result in item_results 2505 for eval in result.evaluations 2506 if eval.name == "accuracy" 2507 ] 2508 return { 2509 "name": "average_accuracy", 2510 "value": sum(accuracies) / len(accuracies) if accuracies else 0, 2511 "comment": f"Average accuracy across {len(accuracies)} items" 2512 } 2513 2514 result = langfuse.run_experiment( 2515 name="LLM Safety and Accuracy Test", 2516 description="Evaluate model accuracy and safety across diverse prompts", 2517 data=test_dataset, # Your dataset items 2518 task=llm_task, 2519 evaluators=[accuracy_evaluator, toxicity_evaluator], 2520 run_evaluators=[average_accuracy], 2521 max_concurrency=5, # Limit concurrent API 
calls 2522 metadata={"model": "gpt-4", "temperature": 0.7} 2523 ) 2524 ``` 2525 2526 Using with Langfuse datasets: 2527 ```python 2528 # Get dataset from Langfuse 2529 dataset = langfuse.get_dataset("my-eval-dataset") 2530 2531 result = dataset.run_experiment( 2532 name="Production Model Evaluation", 2533 description="Monthly evaluation of production model performance", 2534 task=my_production_task, 2535 evaluators=[accuracy_evaluator, latency_evaluator] 2536 ) 2537 2538 # Results automatically linked to dataset in Langfuse UI 2539 print(f"View results: {result['dataset_run_url']}") 2540 ``` 2541 2542 Note: 2543 - Task and evaluator functions can be either synchronous or asynchronous 2544 - Individual item failures are logged but don't stop the experiment 2545 - All executions are automatically traced and visible in Langfuse UI 2546 - When using Langfuse datasets, results are automatically linked for easy comparison 2547 - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) 2548 - Async execution is handled automatically with smart event loop detection 2549 """ 2550 return cast( 2551 ExperimentResult, 2552 run_async_safely( 2553 self._run_experiment_async( 2554 name=name, 2555 run_name=self._create_experiment_run_name( 2556 name=name, run_name=run_name 2557 ), 2558 description=description, 2559 data=data, 2560 task=task, 2561 evaluators=evaluators or [], 2562 composite_evaluator=composite_evaluator, 2563 run_evaluators=run_evaluators or [], 2564 max_concurrency=max_concurrency, 2565 metadata=metadata, 2566 dataset_version=_dataset_version, 2567 ), 2568 ), 2569 )
Run an experiment on a dataset with automatic tracing and evaluation.
This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.
The experiment system provides:
- Automatic tracing of all task executions
- Concurrent processing with configurable limits
- Comprehensive error handling that isolates failures
- Integration with Langfuse datasets for experiment tracking
- Flexible evaluation framework supporting both sync and async evaluators
Arguments:
- name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
- run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the `data` contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
- description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
- data: Array of data items to process. Can be either:
- List of dict-like items with 'input', 'expected_output', 'metadata' keys
- List of Langfuse DatasetItem objects from dataset.items
- task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
- evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
- composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
- run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
- max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
- metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:
ExperimentResult containing:
- run_name: The experiment run name. This is equal to the dataset run name if the experiment was run on a Langfuse dataset.
- item_results: List of results for each processed item with outputs and evaluations
- run_evaluations: List of aggregate evaluation results for the entire run
- experiment_id: Stable identifier for the experiment run across all items
- dataset_run_id: ID of the dataset run (if using Langfuse datasets)
- dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
- ValueError: If required parameters are missing or invalid
- Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:
Basic experiment with local data:
```python
def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")
```
Advanced experiment with async task and multiple evaluators:
```python
async def llm_task(*, item, **kwargs):
    # Simulate async LLM call
    response = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": item["input"]}]
    )
    return response.choices[0].message.content

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output and expected_output.lower() in output.lower():
        return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
    return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
    # Simulate toxicity check
    toxicity_score = check_toxicity(output)  # Your toxicity checker
    return {
        "name": "toxicity",
        "value": toxicity_score,
        "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
    }

def average_accuracy(*, item_results, **kwargs):
    accuracies = [
        eval.value for result in item_results
        for eval in result.evaluations
        if eval.name == "accuracy"
    ]
    return {
        "name": "average_accuracy",
        "value": sum(accuracies) / len(accuracies) if accuracies else 0,
        "comment": f"Average accuracy across {len(accuracies)} items"
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    description="Evaluate model accuracy and safety across diverse prompts",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    run_evaluators=[average_accuracy],
    max_concurrency=5,  # Limit concurrent API calls
    metadata={"model": "gpt-4", "temperature": 0.7}
)
```
Using with Langfuse datasets:
```python
# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result['dataset_run_url']}")
```
Note:
- Task and evaluator functions can be either synchronous or asynchronous
- Individual item failures are logged but don't stop the experiment
- All executions are automatically traced and visible in Langfuse UI
- When using Langfuse datasets, results are automatically linked for easy comparison
- This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
- Async execution is handled automatically with smart event loop detection
2931 def run_batched_evaluation( 2932 self, 2933 *, 2934 scope: Literal["traces", "observations"], 2935 mapper: MapperFunction, 2936 filter: Optional[str] = None, 2937 fetch_batch_size: int = 50, 2938 fetch_trace_fields: Optional[str] = None, 2939 max_items: Optional[int] = None, 2940 max_retries: int = 3, 2941 evaluators: List[EvaluatorFunction], 2942 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 2943 max_concurrency: int = 5, 2944 metadata: Optional[Dict[str, Any]] = None, 2945 _add_observation_scores_to_trace: bool = False, 2946 _additional_trace_tags: Optional[List[str]] = None, 2947 resume_from: Optional[BatchEvaluationResumeToken] = None, 2948 verbose: bool = False, 2949 ) -> BatchEvaluationResult: 2950 """Fetch traces or observations and run evaluations on each item. 2951 2952 This method provides a powerful way to evaluate existing data in Langfuse at scale. 2953 It fetches items based on filters, transforms them using a mapper function, runs 2954 evaluators on each item, and creates scores that are linked back to the original 2955 entities. This is ideal for: 2956 2957 - Running evaluations on production traces after deployment 2958 - Backtesting new evaluation metrics on historical data 2959 - Batch scoring of observations for quality monitoring 2960 - Periodic evaluation runs on recent data 2961 2962 The method uses a streaming/pipeline approach to process items in batches, making 2963 it memory-efficient for large datasets. It includes comprehensive error handling, 2964 retry logic, and resume capability for long-running evaluations. 2965 2966 Args: 2967 scope: The type of items to evaluate. Must be one of: 2968 - "traces": Evaluate complete traces with all their observations 2969 - "observations": Evaluate individual observations (spans, generations, events) 2970 mapper: Function that transforms API response objects into evaluator inputs. 2971 Receives a trace/observation object and returns an EvaluatorInputs 2972 instance with input, output, expected_output, and metadata fields. 2973 Can be sync or async. 2974 evaluators: List of evaluation functions to run on each item. Each evaluator 2975 receives the mapped inputs and returns Evaluation object(s). Evaluator 2976 failures are logged but don't stop the batch evaluation. 2977 filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples: 2978 - '{"tags": ["production"]}' 2979 - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' 2980 Default: None (fetches all items). 2981 fetch_batch_size: Number of items to fetch per API call and hold in memory. 2982 Larger values may be faster but use more memory. Default: 50. 2983 fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'. 2984 max_items: Maximum total number of items to process. If None, processes all 2985 items matching the filter. Useful for testing or limiting evaluation runs. 2986 Default: None (process all). 2987 max_concurrency: Maximum number of items to evaluate concurrently. Controls 2988 parallelism and resource usage. Default: 5. 
2989 composite_evaluator: Optional function that creates a composite score from 2990 item-level evaluations. Receives the original item and its evaluations, 2991 returns a single Evaluation. Useful for weighted averages or combined metrics. 2992 Default: None. 2993 metadata: Optional metadata dict to add to all created scores. Useful for 2994 tracking evaluation runs, versions, or other context. Default: None. 2995 max_retries: Maximum number of retry attempts for failed batch fetches. 2996 Uses exponential backoff (1s, 2s, 4s). Default: 3. 2997 verbose: If True, logs progress information to console. Useful for monitoring 2998 long-running evaluations. Default: False. 2999 resume_from: Optional resume token from a previous incomplete run. Allows 3000 continuing evaluation after interruption or failure. Default: None. 3001 3002 3003 Returns: 3004 BatchEvaluationResult containing: 3005 - total_items_fetched: Number of items fetched from API 3006 - total_items_processed: Number of items successfully evaluated 3007 - total_items_failed: Number of items that failed evaluation 3008 - total_scores_created: Scores created by item-level evaluators 3009 - total_composite_scores_created: Scores created by composite evaluator 3010 - total_evaluations_failed: Individual evaluator failures 3011 - evaluator_stats: Per-evaluator statistics (success rate, scores created) 3012 - resume_token: Token for resuming if incomplete (None if completed) 3013 - completed: True if all items processed 3014 - duration_seconds: Total execution time 3015 - failed_item_ids: IDs of items that failed 3016 - error_summary: Error types and counts 3017 - has_more_items: True if max_items reached but more exist 3018 3019 Raises: 3020 ValueError: If invalid scope is provided. 3021 3022 Examples: 3023 Basic trace evaluation: 3024 ```python 3025 from langfuse import Langfuse, EvaluatorInputs, Evaluation 3026 3027 client = Langfuse() 3028 3029 # Define mapper to extract fields from traces 3030 def trace_mapper(trace): 3031 return EvaluatorInputs( 3032 input=trace.input, 3033 output=trace.output, 3034 expected_output=None, 3035 metadata={"trace_id": trace.id} 3036 ) 3037 3038 # Define evaluator 3039 def length_evaluator(*, input, output, expected_output, metadata): 3040 return Evaluation( 3041 name="output_length", 3042 value=len(output) if output else 0 3043 ) 3044 3045 # Run batch evaluation 3046 result = client.run_batched_evaluation( 3047 scope="traces", 3048 mapper=trace_mapper, 3049 evaluators=[length_evaluator], 3050 filter='{"tags": ["production"]}', 3051 max_items=1000, 3052 verbose=True 3053 ) 3054 3055 print(f"Processed {result.total_items_processed} traces") 3056 print(f"Created {result.total_scores_created} scores") 3057 ``` 3058 3059 Evaluation with composite scorer: 3060 ```python 3061 def accuracy_evaluator(*, input, output, expected_output, metadata): 3062 # ... evaluation logic 3063 return Evaluation(name="accuracy", value=0.85) 3064 3065 def relevance_evaluator(*, input, output, expected_output, metadata): 3066 # ... 
evaluation logic 3067 return Evaluation(name="relevance", value=0.92) 3068 3069 def composite_evaluator(*, item, evaluations): 3070 # Weighted average of evaluations 3071 weights = {"accuracy": 0.6, "relevance": 0.4} 3072 total = sum( 3073 e.value * weights.get(e.name, 0) 3074 for e in evaluations 3075 if isinstance(e.value, (int, float)) 3076 ) 3077 return Evaluation( 3078 name="composite_score", 3079 value=total, 3080 comment=f"Weighted average of {len(evaluations)} metrics" 3081 ) 3082 3083 result = client.run_batched_evaluation( 3084 scope="traces", 3085 mapper=trace_mapper, 3086 evaluators=[accuracy_evaluator, relevance_evaluator], 3087 composite_evaluator=composite_evaluator, 3088 filter='{"user_id": "important_user"}', 3089 verbose=True 3090 ) 3091 ``` 3092 3093 Handling incomplete runs with resume: 3094 ```python 3095 # Initial run that may fail or timeout 3096 result = client.run_batched_evaluation( 3097 scope="observations", 3098 mapper=obs_mapper, 3099 evaluators=[my_evaluator], 3100 max_items=10000, 3101 verbose=True 3102 ) 3103 3104 # Check if incomplete 3105 if not result.completed and result.resume_token: 3106 print(f"Processed {result.resume_token.items_processed} items before interruption") 3107 3108 # Resume from where it left off 3109 result = client.run_batched_evaluation( 3110 scope="observations", 3111 mapper=obs_mapper, 3112 evaluators=[my_evaluator], 3113 resume_from=result.resume_token, 3114 verbose=True 3115 ) 3116 3117 print(f"Total items processed: {result.total_items_processed}") 3118 ``` 3119 3120 Monitoring evaluator performance: 3121 ```python 3122 result = client.run_batched_evaluation(...) 3123 3124 for stats in result.evaluator_stats: 3125 success_rate = stats.successful_runs / stats.total_runs 3126 print(f"{stats.name}:") 3127 print(f" Success rate: {success_rate:.1%}") 3128 print(f" Scores created: {stats.total_scores_created}") 3129 3130 if stats.failed_runs > 0: 3131 print(f" â ī¸ Failed {stats.failed_runs} times") 3132 ``` 3133 3134 Note: 3135 - Evaluator failures are logged but don't stop the batch evaluation 3136 - Individual item failures are tracked but don't stop processing 3137 - Fetch failures are retried with exponential backoff 3138 - All scores are automatically flushed to Langfuse at the end 3139 - The resume mechanism uses timestamp-based filtering to avoid duplicates 3140 """ 3141 runner = BatchEvaluationRunner(self) 3142 3143 return cast( 3144 BatchEvaluationResult, 3145 run_async_safely( 3146 runner.run_async( 3147 scope=scope, 3148 mapper=mapper, 3149 evaluators=evaluators, 3150 filter=filter, 3151 fetch_batch_size=fetch_batch_size, 3152 fetch_trace_fields=fetch_trace_fields, 3153 max_items=max_items, 3154 max_concurrency=max_concurrency, 3155 composite_evaluator=composite_evaluator, 3156 metadata=metadata, 3157 _add_observation_scores_to_trace=_add_observation_scores_to_trace, 3158 _additional_trace_tags=_additional_trace_tags, 3159 max_retries=max_retries, 3160 verbose=verbose, 3161 resume_from=resume_from, 3162 ) 3163 ), 3164 )
Fetch traces or observations and run evaluations on each item.
This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:
- Running evaluations on production traces after deployment
- Backtesting new evaluation metrics on historical data
- Batch scoring of observations for quality monitoring
- Periodic evaluation runs on recent data
The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.
Arguments:
- scope: The type of items to evaluate. Must be one of:
- "traces": Evaluate complete traces with all their observations
- "observations": Evaluate individual observations (spans, generations, events)
- mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
- evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
- filter: Optional JSON filter string for querying items (same format as Langfuse API). Default: None (fetches all items). Examples:
- '{"tags": ["production"]}'
- '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
- fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
- fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
- max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
- max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 5.
- composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
- metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
- max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
- verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
- resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:
BatchEvaluationResult containing:
- total_items_fetched: Number of items fetched from API
- total_items_processed: Number of items successfully evaluated
- total_items_failed: Number of items that failed evaluation
- total_scores_created: Scores created by item-level evaluators
- total_composite_scores_created: Scores created by composite evaluator
- total_evaluations_failed: Individual evaluator failures
- evaluator_stats: Per-evaluator statistics (success rate, scores created)
- resume_token: Token for resuming if incomplete (None if completed)
- completed: True if all items processed
- duration_seconds: Total execution time
- failed_item_ids: IDs of items that failed
- error_summary: Error types and counts
- has_more_items: True if max_items reached but more exist
Raises:
- ValueError: If invalid scope is provided.
Examples:
Basic trace evaluation:
```python
from langfuse import Langfuse, EvaluatorInputs, Evaluation

client = Langfuse()

# Define mapper to extract fields from traces
def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,
        metadata={"trace_id": trace.id}
    )

# Define evaluator
def length_evaluator(*, input, output, expected_output, metadata):
    return Evaluation(
        name="output_length",
        value=len(output) if output else 0
    )

# Run batch evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    filter='{"tags": ["production"]}',
    max_items=1000,
    verbose=True
)

print(f"Processed {result.total_items_processed} traces")
print(f"Created {result.total_scores_created} scores")
```
Evaluation with composite scorer:
```python
def accuracy_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="accuracy", value=0.85)

def relevance_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="relevance", value=0.92)

def composite_evaluator(*, item, evaluations):
    # Weighted average of evaluations
    weights = {"accuracy": 0.6, "relevance": 0.4}
    total = sum(
        e.value * weights.get(e.name, 0)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(
        name="composite_score",
        value=total,
        comment=f"Weighted average of {len(evaluations)} metrics"
    )

result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[accuracy_evaluator, relevance_evaluator],
    composite_evaluator=composite_evaluator,
    filter='{"user_id": "important_user"}',
    verbose=True
)
```
Handling incomplete runs with resume:
```python
# Initial run that may fail or timeout
result = client.run_batched_evaluation(
    scope="observations",
    mapper=obs_mapper,
    evaluators=[my_evaluator],
    max_items=10000,
    verbose=True
)

# Check if incomplete
if not result.completed and result.resume_token:
    print(f"Processed {result.resume_token.items_processed} items before interruption")

    # Resume from where it left off
    result = client.run_batched_evaluation(
        scope="observations",
        mapper=obs_mapper,
        evaluators=[my_evaluator],
        resume_from=result.resume_token,
        verbose=True
    )

print(f"Total items processed: {result.total_items_processed}")
```
Monitoring evaluator performance:
```python
result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs
    print(f"{stats.name}:")
    print(f"  Success rate: {success_rate:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️ Failed {stats.failed_runs} times")
```
Note:
- Evaluator failures are logged but don't stop the batch evaluation
- Individual item failures are tracked but don't stop processing
- Fetch failures are retried with exponential backoff
- All scores are automatically flushed to Langfuse at the end
- The resume mechanism uses timestamp-based filtering to avoid duplicates
3166 def auth_check(self) -> bool: 3167 """Check if the provided credentials (public and secret key) are valid. 3168 3169 Raises: 3170 Exception: If no projects were found for the provided credentials. 3171 3172 Note: 3173 This method is blocking. It is discouraged to use it in production code. 3174 """ 3175 try: 3176 projects = self.api.projects.get() 3177 langfuse_logger.debug( 3178 f"Auth check successful, found {len(projects.data)} projects" 3179 ) 3180 if len(projects.data) == 0: 3181 raise Exception( 3182 "Auth check failed, no project found for the keys provided." 3183 ) 3184 return True 3185 3186 except AttributeError as e: 3187 langfuse_logger.warning( 3188 f"Auth check failed: Client not properly initialized. Error: {e}" 3189 ) 3190 return False 3191 3192 except Error as e: 3193 handle_fern_exception(e) 3194 raise e
Check if the provided credentials (public and secret key) are valid.
Raises:
- Exception: If no projects were found for the provided credentials.
Note:
This method is blocking. Using it in production code is discouraged.
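Example (a minimal sketch; the keys are placeholders):
```python
from langfuse import Langfuse

langfuse = Langfuse(public_key="pk-lf-...", secret_key="sk-lf-...")

# Verify credentials once at startup, not on the request path
if langfuse.auth_check():
    print("Langfuse credentials are valid")
```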
3196 def create_dataset( 3197 self, 3198 *, 3199 name: str, 3200 description: Optional[str] = None, 3201 metadata: Optional[Any] = None, 3202 input_schema: Optional[Any] = None, 3203 expected_output_schema: Optional[Any] = None, 3204 ) -> Dataset: 3205 """Create a dataset with the given name on Langfuse. 3206 3207 Args: 3208 name: Name of the dataset to create. 3209 description: Description of the dataset. Defaults to None. 3210 metadata: Additional metadata. Defaults to None. 3211 input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema. 3212 expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema. 3213 3214 Returns: 3215 Dataset: The created dataset as returned by the Langfuse API. 3216 """ 3217 try: 3218 langfuse_logger.debug(f"Creating datasets {name}") 3219 3220 result = self.api.datasets.create( 3221 name=name, 3222 description=description, 3223 metadata=metadata, 3224 input_schema=input_schema, 3225 expected_output_schema=expected_output_schema, 3226 ) 3227 3228 return cast(Dataset, result) 3229 3230 except Error as e: 3231 handle_fern_exception(e) 3232 raise e
Create a dataset with the given name on Langfuse.
Arguments:
- name: Name of the dataset to create.
- description: Description of the dataset. Defaults to None.
- metadata: Additional metadata. Defaults to None.
- input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
- expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:
Dataset: The created dataset as returned by the Langfuse API.
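Example (a sketch; the schemas shown are illustrative JSON Schema documents):
```python
dataset = langfuse.create_dataset(
    name="capital_cities",
    description="Country to capital QA pairs",
    metadata={"owner": "eval-team"},
    # All new items will be validated against these schemas
    input_schema={
        "type": "object",
        "properties": {"country": {"type": "string"}},
        "required": ["country"],
    },
    expected_output_schema={"type": "string"},
)
```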
3234 def create_dataset_item( 3235 self, 3236 *, 3237 dataset_name: str, 3238 input: Optional[Any] = None, 3239 expected_output: Optional[Any] = None, 3240 metadata: Optional[Any] = None, 3241 source_trace_id: Optional[str] = None, 3242 source_observation_id: Optional[str] = None, 3243 status: Optional[DatasetStatus] = None, 3244 id: Optional[str] = None, 3245 ) -> DatasetItem: 3246 """Create a dataset item. 3247 3248 Upserts if an item with id already exists. 3249 3250 Args: 3251 dataset_name: Name of the dataset in which the dataset item should be created. 3252 input: Input data. Defaults to None. Can contain any dict, list or scalar. 3253 expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar. 3254 metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar. 3255 source_trace_id: Id of the source trace. Defaults to None. 3256 source_observation_id: Id of the source observation. Defaults to None. 3257 status: Status of the dataset item. Defaults to ACTIVE for newly created items. 3258 id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets. 3259 3260 Returns: 3261 DatasetItem: The created dataset item as returned by the Langfuse API. 3262 3263 Example: 3264 ```python 3265 from langfuse import Langfuse 3266 3267 langfuse = Langfuse() 3268 3269 # Uploading items to the Langfuse dataset named "capital_cities" 3270 langfuse.create_dataset_item( 3271 dataset_name="capital_cities", 3272 input={"input": {"country": "Italy"}}, 3273 expected_output={"expected_output": "Rome"}, 3274 metadata={"foo": "bar"} 3275 ) 3276 ``` 3277 """ 3278 try: 3279 langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}") 3280 3281 result = self.api.dataset_items.create( 3282 dataset_name=dataset_name, 3283 input=input, 3284 expected_output=expected_output, 3285 metadata=metadata, 3286 source_trace_id=source_trace_id, 3287 source_observation_id=source_observation_id, 3288 status=status, 3289 id=id, 3290 ) 3291 3292 return cast(DatasetItem, result) 3293 except Error as e: 3294 handle_fern_exception(e) 3295 raise e
Create a dataset item.
Upserts if an item with id already exists.
Arguments:
- dataset_name: Name of the dataset in which the dataset item should be created.
- input: Input data. Defaults to None. Can contain any dict, list or scalar.
- expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
- metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
- source_trace_id: Id of the source trace. Defaults to None.
- source_observation_id: Id of the source observation. Defaults to None.
- status: Status of the dataset item. Defaults to ACTIVE for newly created items.
- id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:
DatasetItem: The created dataset item as returned by the Langfuse API.
Example:
```python
from langfuse import Langfuse

langfuse = Langfuse()

# Uploading items to the Langfuse dataset named "capital_cities"
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"input": {"country": "Italy"}},
    expected_output={"expected_output": "Rome"},
    metadata={"foo": "bar"}
)
```
3297    def resolve_media_references(
3298        self,
3299        *,
3300        obj: Any,
3301        resolve_with: Literal["base64_data_uri"],
3302        max_depth: int = 10,
3303        content_fetch_timeout_seconds: int = 5,
3304    ) -> Any:
3305        """Replace media reference strings in an object with base64 data URIs.
3306
3307        This method recursively traverses an object (up to max_depth) looking for media reference strings
3308        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3309        the provided Langfuse client and replaces the reference string with a base64 data URI.
3310
3311        If fetching media content fails for a reference string, a warning is logged and the reference
3312        string is left unchanged.
3313
3314        Args:
3315            obj: The object to process. Can be a primitive value, array, or nested object.
3316                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3317            resolve_with: The representation of the media content to replace the media reference string with.
3318                Currently only "base64_data_uri" is supported.
3319            max_depth: int: The maximum depth to traverse the object. Default is 10.
3320            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3321
3322        Returns:
3323            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3324            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3325
3326        Example:
3327            obj = {
3328                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3329                "nested": {
3330                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3331                }
3332            }
3333
3334            result = langfuse_client.resolve_media_references(obj=obj, resolve_with="base64_data_uri")
3335
3336            # Result:
3337            # {
3338            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3339            #     "nested": {
3340            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3341            #     }
3342            # }
3343        """
3344        return LangfuseMedia.resolve_media_references(
3345            langfuse_client=self,
3346            obj=obj,
3347            resolve_with=resolve_with,
3348            max_depth=max_depth,
3349            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3350        )
Replace media reference strings in an object with base64 data URIs.
This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.
If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.
Arguments:
- obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
- resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
- max_depth: int: The maximum depth to traverse the object. Default is 10.
- content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:
A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
Example:
obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }
result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
Result:
{
"image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
"nested": {
"pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
}
}
3380    def get_prompt(
3381        self,
3382        name: str,
3383        *,
3384        version: Optional[int] = None,
3385        label: Optional[str] = None,
3386        type: Literal["chat", "text"] = "text",
3387        cache_ttl_seconds: Optional[int] = None,
3388        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3389        max_retries: Optional[int] = None,
3390        fetch_timeout_seconds: Optional[int] = None,
3391    ) -> PromptClient:
3392        """Get a prompt.
3393
3394        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3395        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3396        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3397        return the expired prompt as a fallback.
3398
3399        Args:
3400            name (str): The name of the prompt to retrieve.
3401
3402        Keyword Args:
3403            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3404            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3405            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3406                keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3407            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3408            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3409            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3410            fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK (5 seconds).
3411
3412        Returns:
3413            The prompt object retrieved from the cache, or fetched directly if not cached or expired. The return type depends on the type argument:
3414            - TextPromptClient, if type argument is 'text'.
3415            - ChatPromptClient, if type argument is 'chat'.
3416
3417        Raises:
3418            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3419                expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3420        """
3421        if self._resources is None:
3422            raise Error(
3423                "SDK is not correctly initialized. Check the init logs for more details."
3424            )
3425        if version is not None and label is not None:
3426            raise ValueError("Cannot specify both version and label at the same time.")
3427
3428        if not name:
3429            raise ValueError("Prompt name cannot be empty.")
3430
3431        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3432        bounded_max_retries = self._get_bounded_max_retries(
3433            max_retries, default_max_retries=2, max_retries_upper_bound=4
3434        )
3435
3436        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3437        cached_prompt = self._resources.prompt_cache.get(cache_key)
3438
3439        if cached_prompt is None or cache_ttl_seconds == 0:
3440            langfuse_logger.debug(
3441                f"Prompt '{cache_key}' not found in cache or caching disabled."
3442 ) 3443 try: 3444 return self._fetch_prompt_and_update_cache( 3445 name, 3446 version=version, 3447 label=label, 3448 ttl_seconds=cache_ttl_seconds, 3449 max_retries=bounded_max_retries, 3450 fetch_timeout_seconds=fetch_timeout_seconds, 3451 ) 3452 except Exception as e: 3453 if fallback: 3454 langfuse_logger.warning( 3455 f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}" 3456 ) 3457 3458 fallback_client_args: Dict[str, Any] = { 3459 "name": name, 3460 "prompt": fallback, 3461 "type": type, 3462 "version": version or 0, 3463 "config": {}, 3464 "labels": [label] if label else [], 3465 "tags": [], 3466 } 3467 3468 if type == "text": 3469 return TextPromptClient( 3470 prompt=Prompt_Text(**fallback_client_args), 3471 is_fallback=True, 3472 ) 3473 3474 if type == "chat": 3475 return ChatPromptClient( 3476 prompt=Prompt_Chat(**fallback_client_args), 3477 is_fallback=True, 3478 ) 3479 3480 raise e 3481 3482 if cached_prompt.is_expired(): 3483 langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.") 3484 try: 3485 # refresh prompt in background thread, refresh_prompt deduplicates tasks 3486 langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.") 3487 3488 def refresh_task() -> None: 3489 self._fetch_prompt_and_update_cache( 3490 name, 3491 version=version, 3492 label=label, 3493 ttl_seconds=cache_ttl_seconds, 3494 max_retries=bounded_max_retries, 3495 fetch_timeout_seconds=fetch_timeout_seconds, 3496 ) 3497 3498 self._resources.prompt_cache.add_refresh_prompt_task_if_current( 3499 cache_key, 3500 cached_prompt, 3501 refresh_task, 3502 ) 3503 langfuse_logger.debug( 3504 f"Returning stale prompt '{cache_key}' from cache." 3505 ) 3506 # return stale prompt 3507 return cached_prompt.value 3508 3509 except Exception as e: 3510 langfuse_logger.warning( 3511 f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}" 3512 ) 3513 # creation of refresh prompt task failed, return stale prompt 3514 return cached_prompt.value 3515 3516 return cached_prompt.value
Get a prompt.
This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.
Arguments:
- name (str): The name of the prompt to retrieve.
Keyword Args:
- version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
- label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
- cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
- type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
- fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
- max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
- fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK (5 seconds).
Returns:
The prompt object retrieved from the cache, or fetched directly if not cached or expired. The return type depends on the type argument:
- TextPromptClient, if type argument is 'text'.
- ChatPromptClient, if type argument is 'chat'.
Raises:
- Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
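A short usage sketch; the prompt name and fallback string are illustrative, and compile() is the PromptClient helper for substituting {{variables}}:

```python
from langfuse import Langfuse

langfuse = Langfuse()

# Fetch the production-labeled version; the fallback covers the first call
# when nothing is cached yet and the API is unreachable
prompt = langfuse.get_prompt(
    "movie-critic",
    cache_ttl_seconds=300,
    fallback="Critique the movie {{movie}} in one paragraph.",
)

text = prompt.compile(movie="Dune")
```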
3618 def create_prompt( 3619 self, 3620 *, 3621 name: str, 3622 prompt: Union[ 3623 str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]] 3624 ], 3625 labels: List[str] = [], 3626 tags: Optional[List[str]] = None, 3627 type: Optional[Literal["chat", "text"]] = "text", 3628 config: Optional[Any] = None, 3629 commit_message: Optional[str] = None, 3630 ) -> PromptClient: 3631 """Create a new prompt in Langfuse. 3632 3633 Keyword Args: 3634 name : The name of the prompt to be created. 3635 prompt : The content of the prompt to be created. 3636 is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead. 3637 labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label. 3638 tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt. 3639 config: Additional structured data to be saved with the prompt. Defaults to None. 3640 type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text". 3641 commit_message: Optional string describing the change. 3642 3643 Returns: 3644 TextPromptClient: The prompt if type argument is 'text'. 3645 ChatPromptClient: The prompt if type argument is 'chat'. 3646 """ 3647 try: 3648 langfuse_logger.debug(f"Creating prompt {name=}, {labels=}") 3649 3650 if type == "chat": 3651 if not isinstance(prompt, list): 3652 raise ValueError( 3653 "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes." 3654 ) 3655 request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = ( 3656 CreateChatPromptRequest( 3657 name=name, 3658 prompt=cast(Any, prompt), 3659 labels=labels, 3660 tags=tags, 3661 config=config or {}, 3662 commit_message=commit_message, 3663 type=CreateChatPromptType.CHAT, 3664 ) 3665 ) 3666 server_prompt = self.api.prompts.create(request=request) 3667 3668 if self._resources is not None: 3669 self._resources.prompt_cache.invalidate(name) 3670 3671 return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt)) 3672 3673 if not isinstance(prompt, str): 3674 raise ValueError("For 'text' type, 'prompt' must be a string.") 3675 3676 request = CreateTextPromptRequest( 3677 name=name, 3678 prompt=prompt, 3679 labels=labels, 3680 tags=tags, 3681 config=config or {}, 3682 commit_message=commit_message, 3683 ) 3684 3685 server_prompt = self.api.prompts.create(request=request) 3686 3687 if self._resources is not None: 3688 self._resources.prompt_cache.invalidate(name) 3689 3690 return TextPromptClient(prompt=cast(Prompt_Text, server_prompt)) 3691 3692 except Error as e: 3693 handle_fern_exception(e) 3694 raise e
Create a new prompt in Langfuse.
Keyword Args:
- name: The name of the prompt to be created.
- prompt: The content of the prompt to be created.
- is_active [DEPRECATED]: A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
- labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
- tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
- config: Additional structured data to be saved with the prompt. Defaults to None.
- type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
- commit_message: Optional string describing the change.
Returns:
- TextPromptClient: The prompt if type argument is 'text'.
- ChatPromptClient: The prompt if type argument is 'chat'.
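A sketch of creating a chat prompt that is served by default via the 'production' label; all names and config values are illustrative:

```python
from langfuse import Langfuse

langfuse = Langfuse()

chat_prompt = langfuse.create_prompt(
    name="movie-critic-chat",
    type="chat",
    prompt=[
        {"role": "system", "content": "You are a film critic."},
        {"role": "user", "content": "Critique {{movie}}."},
    ],
    labels=["production"],  # serve this version by default
    config={"model": "gpt-4o", "temperature": 0.7},
    commit_message="initial version",
)
```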
3696    def update_prompt(
3697        self,
3698        *,
3699        name: str,
3700        version: int,
3701        new_labels: List[str] = [],
3702    ) -> Any:
3703        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
3704
3705        Args:
3706            name (str): The name of the prompt to update.
3707            version (int): The version number of the prompt to update.
3708            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3709
3710        Returns:
3711            Prompt: The updated prompt from the Langfuse API.
3712
3713        """
3714        updated_prompt = self.api.prompt_version.update(
3715            name=self._url_encode(name),
3716            version=version,
3717            new_labels=new_labels,
3718        )
3719
3720        if self._resources is not None:
3721            self._resources.prompt_cache.invalidate(name)
3722
3723        return updated_prompt
Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
Arguments:
- name (str): The name of the prompt to update.
- version (int): The version number of the prompt to update.
- new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:
Prompt: The updated prompt from the Langfuse API.
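A sketch of promoting an existing version; the prompt name and version number are illustrative:

```python
from langfuse import Langfuse

langfuse = Langfuse()

# Move the 'production' label to version 2; the SDK cache for this prompt
# name is invalidated, so the next get_prompt call fetches fresh data
updated = langfuse.update_prompt(
    name="movie-critic-chat",
    version=2,
    new_labels=["production"],
)
```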
3738 def clear_prompt_cache(self) -> None: 3739 """Clear the entire prompt cache, removing all cached prompts. 3740 3741 This method is useful when you want to force a complete refresh of all 3742 cached prompts, for example after major updates or when you need to 3743 ensure the latest versions are fetched from the server. 3744 """ 3745 if self._resources is not None: 3746 self._resources.prompt_cache.clear()
Clear the entire prompt cache, removing all cached prompts.
This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.
63def get_client(*, public_key: Optional[str] = None) -> Langfuse: 64 """Get or create a Langfuse client instance. 65 66 Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, 67 providing a public_key is required. Multi-project support is experimental - see Langfuse docs. 68 69 Behavior: 70 - Single project: Returns existing client or creates new one 71 - Multi-project: Requires public_key to return specific client 72 - No public_key in multi-project: Returns disabled client to prevent data leakage 73 74 The function uses a singleton pattern per public_key to conserve resources and maintain state. 75 76 Args: 77 public_key (Optional[str]): Project identifier 78 - With key: Returns client for that project 79 - Without key: Returns single client or disabled client if multiple exist 80 81 Returns: 82 Langfuse: Client instance in one of three states: 83 1. Client for specified public_key 84 2. Default client for single-project setup 85 3. Disabled client when multiple projects exist without key 86 87 Security: 88 Disables tracing when multiple projects exist without explicit key to prevent 89 cross-project data leakage. Multi-project setups are experimental. 90 91 Example: 92 ```python 93 # Single project 94 client = get_client() # Default client 95 96 # In multi-project usage: 97 client_a = get_client(public_key="project_a_key") # Returns project A's client 98 client_b = get_client(public_key="project_b_key") # Returns project B's client 99 100 # Without specific key in multi-project setup: 101 client = get_client() # Returns disabled client for safety 102 ``` 103 """ 104 with LangfuseResourceManager._lock: 105 active_instances = LangfuseResourceManager._instances 106 107 # If no explicit public_key provided, check execution context 108 if not public_key: 109 public_key = _current_public_key.get(None) 110 111 if not public_key: 112 if len(active_instances) == 0: 113 # No clients initialized yet, create default instance 114 return Langfuse() 115 116 if len(active_instances) == 1: 117 # Only one client exists, safe to use without specifying key 118 instance = list(active_instances.values())[0] 119 120 # Initialize with the credentials bound to the instance 121 # This is important if the original instance was instantiated 122 # via constructor arguments 123 return _create_client_from_instance(instance) 124 125 else: 126 # Multiple clients exist but no key specified - disable tracing 127 # to prevent cross-project data leakage 128 langfuse_logger.warning( 129 "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage." 130 ) 131 return Langfuse( 132 tracing_enabled=False, public_key="fake", secret_key="fake" 133 ) 134 135 else: 136 # Specific key provided, look up existing instance 137 target_instance: Optional[LangfuseResourceManager] = active_instances.get( 138 public_key, None 139 ) 140 141 if target_instance is None: 142 # No instance found with this key - client not initialized properly 143 langfuse_logger.warning( 144 f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function." 145 ) 146 return Langfuse( 147 tracing_enabled=False, public_key="fake", secret_key="fake" 148 ) 149 150 # target_instance is guaranteed to be not None at this point 151 return _create_client_from_instance(target_instance, public_key)
Get or create a Langfuse client instance.
Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
Behavior:
- Single project: Returns existing client or creates new one
- Multi-project: Requires public_key to return specific client
- No public_key in multi-project: Returns disabled client to prevent data leakage
The function uses a singleton pattern per public_key to conserve resources and maintain state.
Arguments:
- public_key (Optional[str]): Project identifier
- With key: Returns client for that project
- Without key: Returns single client or disabled client if multiple exist
Returns:
Langfuse: Client instance in one of three states:
1. Client for specified public_key
2. Default client for single-project setup
3. Disabled client when multiple projects exist without key
Security:
Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.
Example:
```python
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
```
88 def observe( 89 self, 90 func: Optional[F] = None, 91 *, 92 name: Optional[str] = None, 93 as_type: Optional[ObservationTypeLiteralNoEvent] = None, 94 capture_input: Optional[bool] = None, 95 capture_output: Optional[bool] = None, 96 transform_to_string: Optional[Callable[[Iterable], str]] = None, 97 ) -> Union[F, Callable[[F], F]]: 98 """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions. 99 100 This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates 101 spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator 102 intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints. 103 104 Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, 105 enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details. 106 107 Args: 108 func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None. 109 name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used. 110 as_type (Optional[Literal]): Set the observation type. Supported values: 111 "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". 112 Observation types are highlighted in the Langfuse UI for filtering and visualization. 113 The types "generation" and "embedding" create a span on which additional attributes such as model metrics 114 can be set. 115 116 Returns: 117 Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans. 118 119 Example: 120 For general function tracing with automatic naming: 121 ```python 122 @observe() 123 def process_user_request(user_id, query): 124 # Function is automatically traced with name "process_user_request" 125 return get_response(query) 126 ``` 127 128 For language model generation tracking: 129 ```python 130 @observe(name="answer-generation", as_type="generation") 131 async def generate_answer(query): 132 # Creates a generation-type span with extended LLM metrics 133 response = await openai.chat.completions.create( 134 model="gpt-4", 135 messages=[{"role": "user", "content": query}] 136 ) 137 return response.choices[0].message.content 138 ``` 139 140 For trace context propagation between functions: 141 ```python 142 @observe() 143 def main_process(): 144 # Parent span is created 145 return sub_process() # Child span automatically connected to parent 146 147 @observe() 148 def sub_process(): 149 # Automatically becomes a child span of main_process 150 return "result" 151 ``` 152 153 Raises: 154 Exception: Propagates any exceptions from the wrapped function after logging them in the trace. 155 156 Notes: 157 - The decorator preserves the original function's signature, docstring, and return type. 158 - Proper parent-child relationships between spans are automatically maintained. 159 - Special keyword arguments can be passed to control tracing: 160 - langfuse_trace_id: Explicitly set the trace ID for this function call 161 - langfuse_parent_observation_id: Explicitly set the parent span ID 162 - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist) 163 - For async functions, the decorator returns an async function wrapper. 
164 - For sync functions, the decorator returns a synchronous wrapper. 165 """ 166 valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent)) 167 if as_type is not None and as_type not in valid_types: 168 logger.warning( 169 f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'." 170 ) 171 as_type = "span" 172 173 function_io_capture_enabled = os.environ.get( 174 LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True" 175 ).lower() not in ("false", "0") 176 177 should_capture_input = ( 178 capture_input if capture_input is not None else function_io_capture_enabled 179 ) 180 181 should_capture_output = ( 182 capture_output 183 if capture_output is not None 184 else function_io_capture_enabled 185 ) 186 187 def decorator(func: F) -> F: 188 return ( 189 self._async_observe( 190 func, 191 name=name, 192 as_type=as_type, 193 capture_input=should_capture_input, 194 capture_output=should_capture_output, 195 transform_to_string=transform_to_string, 196 ) 197 if asyncio.iscoroutinefunction(func) 198 else self._sync_observe( 199 func, 200 name=name, 201 as_type=as_type, 202 capture_input=should_capture_input, 203 capture_output=should_capture_output, 204 transform_to_string=transform_to_string, 205 ) 206 ) 207 208 """Handle decorator with or without parentheses. 209 210 This logic enables the decorator to work both with and without parentheses: 211 - @observe - Python passes the function directly to the decorator 212 - @observe() - Python calls the decorator first, which must return a function decorator 213 214 When called without arguments (@observe), the func parameter contains the function to decorate, 215 so we directly apply the decorator to it. When called with parentheses (@observe()), 216 func is None, so we return the decorator function itself for Python to apply in the next step. 217 """ 218 if func is None: 219 return decorator 220 else: 221 return decorator(func)
Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
Arguments:
- func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
- name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
- as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:
Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
Example:
For general function tracing with automatic naming:
```python
@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)
```

For language model generation tracking:

```python
@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content
```

For trace context propagation between functions:

```python
@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
```
Raises:
- Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
- The decorator preserves the original function's signature, docstring, and return type.
- Proper parent-child relationships between spans are automatically maintained.
- Special keyword arguments can be passed to control tracing:
- langfuse_trace_id: Explicitly set the trace ID for this function call
- langfuse_parent_observation_id: Explicitly set the parent span ID
- langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
- For async functions, the decorator returns an async function wrapper.
- For sync functions, the decorator returns a synchronous wrapper.
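To make the special keyword arguments above concrete, a sketch; it assumes Langfuse.create_trace_id() is available for minting a valid trace ID (treat it as hypothetical otherwise):

```python
from langfuse import Langfuse, observe

@observe()
def handle_request(payload):
    return {"ok": True, "payload": payload}

# The decorator consumes these special kwargs; they are not forwarded
# to the wrapped function itself
trace_id = Langfuse.create_trace_id()  # assumed helper returning a valid trace ID
handle_request({"q": "hi"}, langfuse_trace_id=trace_id)
```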
95def propagate_attributes(
96    *,
97    user_id: Optional[str] = None,
98    session_id: Optional[str] = None,
99    metadata: Optional[Dict[str, str]] = None,
100    version: Optional[str] = None,
101    tags: Optional[List[str]] = None,
102    trace_name: Optional[str] = None,
103    as_baggage: bool = False,
104) -> _AgnosticContextManager[Any]:
105    """Propagate trace-level attributes to all spans created within this context.
106
107    This context manager sets attributes on the currently active span AND automatically
108    propagates them to all new child spans created within the context. This is the
109    recommended way to set trace-level attributes like user_id, session_id, and metadata
110    dimensions that should be consistently applied across all observations in a trace.
111
112    **IMPORTANT**: Call this as early as possible within your trace/workflow. Only the
113    currently active span and spans created after entering this context will have these
114    attributes. Pre-existing spans will NOT be retroactively updated.
115
116    **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id,
117    filtering by session_id) only include observations that have the attribute set.
118    If you call `propagate_attributes` late in your workflow, earlier spans won't be
119    included in aggregations for that attribute.
120
121    Args:
122        user_id: User identifier to associate with all spans in this context.
123            Must be US-ASCII string, ≤200 characters. Use this to track which user
124            generated each trace and enable e.g. per-user cost/performance analysis.
125        session_id: Session identifier to associate with all spans in this context.
126            Must be US-ASCII string, ≤200 characters. Use this to group related traces
127            within a user session (e.g., a conversation thread, multi-turn interaction).
128        metadata: Additional key-value metadata to propagate to all spans.
129            - Keys and values must be US-ASCII strings
130            - All values must be ≤200 characters
131            - Use for dimensions like internal correlating identifiers
132            - AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
133        version: Version identifier for parts of your application that are independently versioned, e.g. agents
134        tags: List of tags to categorize the group of observations
135        trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters.
136            Use this to set a consistent trace name for all spans created within this context.
137        as_baggage: If True, propagates attributes using OpenTelemetry baggage for
138            cross-process/service propagation. **Security warning**: When enabled,
139            attribute values are added to HTTP headers on ALL outbound requests.
140            Only enable if values are safe to transmit via HTTP headers and you need
141            cross-service tracing. Default: False.
142
143    Returns:
144        Context manager that propagates attributes to all child spans.
145
146    Example:
147        Basic usage with user and session tracking:
148
149        ```python
150        from langfuse import Langfuse
151
152        langfuse = Langfuse()
153
154        # Set attributes early in the trace
155        with langfuse.start_as_current_observation(name="user_workflow") as span:
156            with langfuse.propagate_attributes(
157                user_id="user_123",
158                session_id="session_abc",
159                metadata={"experiment": "variant_a", "environment": "production"}
160            ):
161                # All spans created here will have user_id, session_id, and metadata
162                with langfuse.start_observation(name="llm_call") as llm_span:
163                    # This span inherits: user_id, session_id, experiment, environment
164                    ...
165
166                with langfuse.start_generation(name="completion") as gen:
167                    # This span also inherits all attributes
168                    ...
169        ```
170
171        Late propagation (anti-pattern):
172
173        ```python
174        with langfuse.start_as_current_observation(name="workflow") as span:
175            # These spans WON'T have user_id
176            early_span = langfuse.start_observation(name="early_work")
177            early_span.end()
178
179            # Set attributes in the middle
180            with langfuse.propagate_attributes(user_id="user_123"):
181                # Only spans created AFTER this point will have user_id
182                late_span = langfuse.start_observation(name="late_work")
183                late_span.end()
184
185            # Result: Aggregations by user_id will miss "early_work" span
186        ```
187
188        Cross-service propagation with baggage (advanced):
189
190        ```python
191        # Service A - originating service
192        with langfuse.start_as_current_observation(name="api_request"):
193            with langfuse.propagate_attributes(
194                user_id="user_123",
195                session_id="session_abc",
196                as_baggage=True  # Propagate via HTTP headers
197            ):
198                # Make HTTP request to Service B
199                response = requests.get("https://service-b.example.com/api")
200                # user_id and session_id are now in HTTP headers
201
202        # Service B - downstream service
203        # OpenTelemetry will automatically extract baggage from HTTP headers
204        # and propagate to spans in Service B
205        ```
206
207    Note:
208        - **Validation**: All attribute values (user_id, session_id, metadata values)
209          must be strings ≤200 characters. Invalid values will be dropped with a
210          warning logged. Ensure values meet constraints before calling.
211        - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood,
212          making it compatible with other OTel-instrumented libraries.
213
214    Raises:
215        No exceptions are raised. Invalid values are logged as warnings and dropped.
216    """
217    return _propagate_attributes(
218        user_id=user_id,
219        session_id=session_id,
220        metadata=metadata,
221        version=version,
222        tags=tags,
223        trace_name=trace_name,
224        as_baggage=as_baggage,
225    )
Propagate trace-level attributes to all spans created within this context.
This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, and metadata dimensions that should be consistently applied across all observations in a trace.
IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.
Why this matters: Langfuse aggregation queries (e.g., total cost by user_id, filtering by session_id) only include observations that have the attribute set. If you call propagate_attributes late in your workflow, earlier spans won't be included in aggregations for that attribute.
Arguments:
- user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
- session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
- metadata: Additional key-value metadata to propagate to all spans.
  - Keys and values must be US-ASCII strings
  - All values must be ≤200 characters
  - Use for dimensions like internal correlating identifiers
  - AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
- version: Version identifier for parts of your application that are independently versioned, e.g. agents
- tags: List of tags to categorize the group of observations
- trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
- as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:
Context manager that propagates attributes to all child spans.
Example:
Basic usage with user and session tracking:
```python
from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_observation(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        metadata={"experiment": "variant_a", "environment": "production"}
    ):
        # All spans created here will have user_id, session_id, and metadata
        with langfuse.start_observation(name="llm_call") as llm_span:
            # This span inherits: user_id, session_id, experiment, environment
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...
```

Late propagation (anti-pattern):

```python
with langfuse.start_as_current_observation(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_observation(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_observation(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span
```

Cross-service propagation with baggage (advanced):

```python
# Service A - originating service
with langfuse.start_as_current_observation(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id and session_id are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate to spans in Service B
```
Note:
- Validation: All attribute values (user_id, session_id, metadata values) must be strings ≤200 characters. Invalid values will be dropped with a warning logged. Ensure values meet constraints before calling.
- OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
- No exceptions are raised. Invalid values are logged as warnings and dropped.
1247class LangfuseSpan(LangfuseObservationWrapper): 1248 """Standard span implementation for general operations in Langfuse. 1249 1250 This class represents a general-purpose span that can be used to trace 1251 any operation in your application. It extends the base LangfuseObservationWrapper 1252 with specific methods for creating child spans, generations, and updating 1253 span-specific attributes. If possible, use a more specific type for 1254 better observability and insights. 1255 """ 1256 1257 def __init__( 1258 self, 1259 *, 1260 otel_span: otel_trace_api.Span, 1261 langfuse_client: "Langfuse", 1262 input: Optional[Any] = None, 1263 output: Optional[Any] = None, 1264 metadata: Optional[Any] = None, 1265 environment: Optional[str] = None, 1266 release: Optional[str] = None, 1267 version: Optional[str] = None, 1268 level: Optional[SpanLevel] = None, 1269 status_message: Optional[str] = None, 1270 ): 1271 """Initialize a new LangfuseSpan. 1272 1273 Args: 1274 otel_span: The OpenTelemetry span to wrap 1275 langfuse_client: Reference to the parent Langfuse client 1276 input: Input data for the span (any JSON-serializable object) 1277 output: Output data from the span (any JSON-serializable object) 1278 metadata: Additional metadata to associate with the span 1279 environment: The tracing environment 1280 release: Release identifier for the application 1281 version: Version identifier for the code or component 1282 level: Importance level of the span (info, warning, error) 1283 status_message: Optional status message for the span 1284 """ 1285 super().__init__( 1286 otel_span=otel_span, 1287 as_type="span", 1288 langfuse_client=langfuse_client, 1289 input=input, 1290 output=output, 1291 metadata=metadata, 1292 environment=environment, 1293 release=release, 1294 version=version, 1295 level=level, 1296 status_message=status_message, 1297 )
Standard span implementation for general operations in Langfuse.
This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.
1257 def __init__( 1258 self, 1259 *, 1260 otel_span: otel_trace_api.Span, 1261 langfuse_client: "Langfuse", 1262 input: Optional[Any] = None, 1263 output: Optional[Any] = None, 1264 metadata: Optional[Any] = None, 1265 environment: Optional[str] = None, 1266 release: Optional[str] = None, 1267 version: Optional[str] = None, 1268 level: Optional[SpanLevel] = None, 1269 status_message: Optional[str] = None, 1270 ): 1271 """Initialize a new LangfuseSpan. 1272 1273 Args: 1274 otel_span: The OpenTelemetry span to wrap 1275 langfuse_client: Reference to the parent Langfuse client 1276 input: Input data for the span (any JSON-serializable object) 1277 output: Output data from the span (any JSON-serializable object) 1278 metadata: Additional metadata to associate with the span 1279 environment: The tracing environment 1280 release: Release identifier for the application 1281 version: Version identifier for the code or component 1282 level: Importance level of the span (info, warning, error) 1283 status_message: Optional status message for the span 1284 """ 1285 super().__init__( 1286 otel_span=otel_span, 1287 as_type="span", 1288 langfuse_client=langfuse_client, 1289 input=input, 1290 output=output, 1291 metadata=metadata, 1292 environment=environment, 1293 release=release, 1294 version=version, 1295 level=level, 1296 status_message=status_message, 1297 )
Initialize a new LangfuseSpan.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the span (any JSON-serializable object)
- output: Output data from the span (any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
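In practice these spans are created through the client factory methods rather than by calling this constructor directly; a minimal sketch using start_as_current_span, with the span name and output being illustrative:

```python
from langfuse import Langfuse

langfuse = Langfuse()

# The context manager yields a LangfuseSpan and ends it on exit
with langfuse.start_as_current_span(name="fetch-documents") as span:
    docs = ["doc-a", "doc-b"]  # placeholder work
    span.update(output={"count": len(docs)})
```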
1300class LangfuseGeneration(LangfuseObservationWrapper): 1301 """Specialized span implementation for AI model generations in Langfuse. 1302 1303 This class represents a generation span specifically designed for tracking 1304 AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized 1305 attributes for model details, token usage, and costs. 1306 """ 1307 1308 def __init__( 1309 self, 1310 *, 1311 otel_span: otel_trace_api.Span, 1312 langfuse_client: "Langfuse", 1313 input: Optional[Any] = None, 1314 output: Optional[Any] = None, 1315 metadata: Optional[Any] = None, 1316 environment: Optional[str] = None, 1317 release: Optional[str] = None, 1318 version: Optional[str] = None, 1319 level: Optional[SpanLevel] = None, 1320 status_message: Optional[str] = None, 1321 completion_start_time: Optional[datetime] = None, 1322 model: Optional[str] = None, 1323 model_parameters: Optional[Dict[str, MapValue]] = None, 1324 usage_details: Optional[Dict[str, int]] = None, 1325 cost_details: Optional[Dict[str, float]] = None, 1326 prompt: Optional[PromptClient] = None, 1327 ): 1328 """Initialize a new LangfuseGeneration span. 1329 1330 Args: 1331 otel_span: The OpenTelemetry span to wrap 1332 langfuse_client: Reference to the parent Langfuse client 1333 input: Input data for the generation (e.g., prompts) 1334 output: Output from the generation (e.g., completions) 1335 metadata: Additional metadata to associate with the generation 1336 environment: The tracing environment 1337 release: Release identifier for the application 1338 version: Version identifier for the model or component 1339 level: Importance level of the generation (info, warning, error) 1340 status_message: Optional status message for the generation 1341 completion_start_time: When the model started generating the response 1342 model: Name/identifier of the AI model used (e.g., "gpt-4") 1343 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1344 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1345 cost_details: Cost information for the model call 1346 prompt: Associated prompt template from Langfuse prompt management 1347 """ 1348 super().__init__( 1349 as_type="generation", 1350 otel_span=otel_span, 1351 langfuse_client=langfuse_client, 1352 input=input, 1353 output=output, 1354 metadata=metadata, 1355 environment=environment, 1356 release=release, 1357 version=version, 1358 level=level, 1359 status_message=status_message, 1360 completion_start_time=completion_start_time, 1361 model=model, 1362 model_parameters=model_parameters, 1363 usage_details=usage_details, 1364 cost_details=cost_details, 1365 prompt=prompt, 1366 )
Specialized span implementation for AI model generations in Langfuse.
This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.
1308 def __init__( 1309 self, 1310 *, 1311 otel_span: otel_trace_api.Span, 1312 langfuse_client: "Langfuse", 1313 input: Optional[Any] = None, 1314 output: Optional[Any] = None, 1315 metadata: Optional[Any] = None, 1316 environment: Optional[str] = None, 1317 release: Optional[str] = None, 1318 version: Optional[str] = None, 1319 level: Optional[SpanLevel] = None, 1320 status_message: Optional[str] = None, 1321 completion_start_time: Optional[datetime] = None, 1322 model: Optional[str] = None, 1323 model_parameters: Optional[Dict[str, MapValue]] = None, 1324 usage_details: Optional[Dict[str, int]] = None, 1325 cost_details: Optional[Dict[str, float]] = None, 1326 prompt: Optional[PromptClient] = None, 1327 ): 1328 """Initialize a new LangfuseGeneration span. 1329 1330 Args: 1331 otel_span: The OpenTelemetry span to wrap 1332 langfuse_client: Reference to the parent Langfuse client 1333 input: Input data for the generation (e.g., prompts) 1334 output: Output from the generation (e.g., completions) 1335 metadata: Additional metadata to associate with the generation 1336 environment: The tracing environment 1337 release: Release identifier for the application 1338 version: Version identifier for the model or component 1339 level: Importance level of the generation (info, warning, error) 1340 status_message: Optional status message for the generation 1341 completion_start_time: When the model started generating the response 1342 model: Name/identifier of the AI model used (e.g., "gpt-4") 1343 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1344 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1345 cost_details: Cost information for the model call 1346 prompt: Associated prompt template from Langfuse prompt management 1347 """ 1348 super().__init__( 1349 as_type="generation", 1350 otel_span=otel_span, 1351 langfuse_client=langfuse_client, 1352 input=input, 1353 output=output, 1354 metadata=metadata, 1355 environment=environment, 1356 release=release, 1357 version=version, 1358 level=level, 1359 status_message=status_message, 1360 completion_start_time=completion_start_time, 1361 model=model, 1362 model_parameters=model_parameters, 1363 usage_details=usage_details, 1364 cost_details=cost_details, 1365 prompt=prompt, 1366 )
Initialize a new LangfuseGeneration span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the generation (e.g., prompts)
- output: Output from the generation (e.g., completions)
- metadata: Additional metadata to associate with the generation
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
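Generations are likewise usually created via the client rather than this constructor; a sketch that records model metadata and token usage, with all values illustrative:

```python
from langfuse import Langfuse

langfuse = Langfuse()

with langfuse.start_as_current_observation(
    name="completion", as_type="generation"
) as generation:
    # ... call your model here ...
    generation.update(
        model="gpt-4o",
        model_parameters={"temperature": 0.2},
        usage_details={"input": 120, "output": 45},
        cost_details={"total": 0.0021},
        output="Rome",
    )
```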
1369class LangfuseEvent(LangfuseObservationWrapper): 1370 """Specialized span implementation for Langfuse Events.""" 1371 1372 def __init__( 1373 self, 1374 *, 1375 otel_span: otel_trace_api.Span, 1376 langfuse_client: "Langfuse", 1377 input: Optional[Any] = None, 1378 output: Optional[Any] = None, 1379 metadata: Optional[Any] = None, 1380 environment: Optional[str] = None, 1381 release: Optional[str] = None, 1382 version: Optional[str] = None, 1383 level: Optional[SpanLevel] = None, 1384 status_message: Optional[str] = None, 1385 ): 1386 """Initialize a new LangfuseEvent span. 1387 1388 Args: 1389 otel_span: The OpenTelemetry span to wrap 1390 langfuse_client: Reference to the parent Langfuse client 1391 input: Input data for the event 1392 output: Output from the event 1393 metadata: Additional metadata to associate with the generation 1394 environment: The tracing environment 1395 release: Release identifier for the application 1396 version: Version identifier for the model or component 1397 level: Importance level of the generation (info, warning, error) 1398 status_message: Optional status message for the generation 1399 """ 1400 super().__init__( 1401 otel_span=otel_span, 1402 as_type="event", 1403 langfuse_client=langfuse_client, 1404 input=input, 1405 output=output, 1406 metadata=metadata, 1407 environment=environment, 1408 release=release, 1409 version=version, 1410 level=level, 1411 status_message=status_message, 1412 ) 1413 1414 def update( 1415 self, 1416 *, 1417 name: Optional[str] = None, 1418 input: Optional[Any] = None, 1419 output: Optional[Any] = None, 1420 metadata: Optional[Any] = None, 1421 version: Optional[str] = None, 1422 level: Optional[SpanLevel] = None, 1423 status_message: Optional[str] = None, 1424 completion_start_time: Optional[datetime] = None, 1425 model: Optional[str] = None, 1426 model_parameters: Optional[Dict[str, MapValue]] = None, 1427 usage_details: Optional[Dict[str, int]] = None, 1428 cost_details: Optional[Dict[str, float]] = None, 1429 prompt: Optional[PromptClient] = None, 1430 **kwargs: Any, 1431 ) -> "LangfuseEvent": 1432 """Update is not allowed for LangfuseEvent because events cannot be updated. 1433 1434 This method logs a warning and returns self without making changes. 1435 1436 Returns: 1437 self: Returns the unchanged LangfuseEvent instance 1438 """ 1439 langfuse_logger.warning( 1440 "Attempted to update LangfuseEvent observation. Events cannot be updated after creation." 1441 ) 1442 return self
Specialized span implementation for Langfuse Events.
1372 def __init__( 1373 self, 1374 *, 1375 otel_span: otel_trace_api.Span, 1376 langfuse_client: "Langfuse", 1377 input: Optional[Any] = None, 1378 output: Optional[Any] = None, 1379 metadata: Optional[Any] = None, 1380 environment: Optional[str] = None, 1381 release: Optional[str] = None, 1382 version: Optional[str] = None, 1383 level: Optional[SpanLevel] = None, 1384 status_message: Optional[str] = None, 1385 ): 1386 """Initialize a new LangfuseEvent span. 1387 1388 Args: 1389 otel_span: The OpenTelemetry span to wrap 1390 langfuse_client: Reference to the parent Langfuse client 1391 input: Input data for the event 1392 output: Output from the event 1393 metadata: Additional metadata to associate with the generation 1394 environment: The tracing environment 1395 release: Release identifier for the application 1396 version: Version identifier for the model or component 1397 level: Importance level of the generation (info, warning, error) 1398 status_message: Optional status message for the generation 1399 """ 1400 super().__init__( 1401 otel_span=otel_span, 1402 as_type="event", 1403 langfuse_client=langfuse_client, 1404 input=input, 1405 output=output, 1406 metadata=metadata, 1407 environment=environment, 1408 release=release, 1409 version=version, 1410 level=level, 1411 status_message=status_message, 1412 )
Initialize a new LangfuseEvent span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the event
- output: Output from the event
- metadata: Additional metadata to associate with the event
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the code or component
- level: Importance level of the event (info, warning, error)
- status_message: Optional status message for the event
1414 def update( 1415 self, 1416 *, 1417 name: Optional[str] = None, 1418 input: Optional[Any] = None, 1419 output: Optional[Any] = None, 1420 metadata: Optional[Any] = None, 1421 version: Optional[str] = None, 1422 level: Optional[SpanLevel] = None, 1423 status_message: Optional[str] = None, 1424 completion_start_time: Optional[datetime] = None, 1425 model: Optional[str] = None, 1426 model_parameters: Optional[Dict[str, MapValue]] = None, 1427 usage_details: Optional[Dict[str, int]] = None, 1428 cost_details: Optional[Dict[str, float]] = None, 1429 prompt: Optional[PromptClient] = None, 1430 **kwargs: Any, 1431 ) -> "LangfuseEvent": 1432 """Update is not allowed for LangfuseEvent because events cannot be updated. 1433 1434 This method logs a warning and returns self without making changes. 1435 1436 Returns: 1437 self: Returns the unchanged LangfuseEvent instance 1438 """ 1439 langfuse_logger.warning( 1440 "Attempted to update LangfuseEvent observation. Events cannot be updated after creation." 1441 ) 1442 return self
Update is not allowed for LangfuseEvent because events cannot be updated.
This method logs a warning and returns self without making changes.
Returns:
self: Returns the unchanged LangfuseEvent instance
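For orientation, a short usage sketch. It assumes the v3 client exposes a `create_event` factory that returns this wrapper (verify against your SDK version) and that credentials are configured via environment variables:

```python
from langfuse import get_client

client = get_client()  # reads LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY from the env

with client.start_as_current_span(name="checkout") as span:
    # create_event (assumed factory name) attaches a point-in-time event to the
    # current trace; events are immutable, so calling update() later is a no-op.
    client.create_event(
        name="cache-miss",
        input={"key": "user-123"},
        level="WARNING",
        status_message="Falling back to database",
    )
```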
```python
class LangfuseOtelSpanAttributes:
    # Langfuse-Trace attributes
    TRACE_NAME = "langfuse.trace.name"
    TRACE_USER_ID = "user.id"
    TRACE_SESSION_ID = "session.id"
    TRACE_TAGS = "langfuse.trace.tags"
    TRACE_PUBLIC = "langfuse.trace.public"
    TRACE_METADATA = "langfuse.trace.metadata"
    TRACE_INPUT = "langfuse.trace.input"
    TRACE_OUTPUT = "langfuse.trace.output"

    # Langfuse-observation attributes
    OBSERVATION_TYPE = "langfuse.observation.type"
    OBSERVATION_METADATA = "langfuse.observation.metadata"
    OBSERVATION_LEVEL = "langfuse.observation.level"
    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
    OBSERVATION_INPUT = "langfuse.observation.input"
    OBSERVATION_OUTPUT = "langfuse.observation.output"

    # Langfuse-observation of type Generation attributes
    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
    OBSERVATION_MODEL = "langfuse.observation.model.name"
    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"

    # General
    ENVIRONMENT = "langfuse.environment"
    RELEASE = "langfuse.release"
    VERSION = "langfuse.version"

    # Internal
    AS_ROOT = "langfuse.internal.as_root"

    # Experiments
    EXPERIMENT_ID = "langfuse.experiment.id"
    EXPERIMENT_NAME = "langfuse.experiment.name"
    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
```
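For readers wiring Langfuse into hand-rolled OpenTelemetry instrumentation, a minimal sketch of how these constants serve as span-attribute keys. The tracer setup is generic OTel; whether raw attributes set this way are honored end-to-end depends on your exporter configuration, so treat this as an illustration under those assumptions:

```python
from opentelemetry import trace

from langfuse import LangfuseOtelSpanAttributes

tracer = trace.get_tracer("my-app")  # any configured OTel tracer

with tracer.start_as_current_span("checkout") as span:
    # Using the constants keeps attribute keys in sync with what the
    # Langfuse exporter expects, instead of hard-coding strings.
    span.set_attribute(LangfuseOtelSpanAttributes.TRACE_NAME, "checkout-flow")
    span.set_attribute(LangfuseOtelSpanAttributes.TRACE_USER_ID, "user-123")
    span.set_attribute(LangfuseOtelSpanAttributes.OBSERVATION_LEVEL, "WARNING")
```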
```python
class LangfuseAgent(LangfuseObservationWrapper):
    """Agent observation for reasoning blocks that act on tools using LLM guidance."""

    def __init__(self, **kwargs: Any) -> None:
        """Initialize a new LangfuseAgent span."""
        kwargs["as_type"] = "agent"
        super().__init__(**kwargs)
```
```python
class LangfuseTool(LangfuseObservationWrapper):
    """Tool observation representing external tool calls, e.g., calling a weather API."""

    def __init__(self, **kwargs: Any) -> None:
        """Initialize a new LangfuseTool span."""
        kwargs["as_type"] = "tool"
        super().__init__(**kwargs)
```
```python
class LangfuseChain(LangfuseObservationWrapper):
    """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM."""

    def __init__(self, **kwargs: Any) -> None:
        """Initialize a new LangfuseChain span."""
        kwargs["as_type"] = "chain"
        super().__init__(**kwargs)
```
```python
class LangfuseEmbedding(LangfuseObservationWrapper):
    """Embedding observation for LLM embedding calls, typically used before retrieval."""

    def __init__(self, **kwargs: Any) -> None:
        """Initialize a new LangfuseEmbedding span."""
        kwargs["as_type"] = "embedding"
        super().__init__(**kwargs)
```
```python
class LangfuseEvaluator(LangfuseObservationWrapper):
    """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs."""

    def __init__(self, **kwargs: Any) -> None:
        """Initialize a new LangfuseEvaluator span."""
        kwargs["as_type"] = "evaluator"
        super().__init__(**kwargs)
```
```python
class LangfuseRetriever(LangfuseObservationWrapper):
    """Retriever observation for data retrieval steps, e.g. vector store or database queries."""

    def __init__(self, **kwargs: Any) -> None:
        """Initialize a new LangfuseRetriever span."""
        kwargs["as_type"] = "retriever"
        super().__init__(**kwargs)
```
```python
class LangfuseGuardrail(LangfuseObservationWrapper):
    """Guardrail observation for protection e.g. against jailbreaks or offensive content."""

    def __init__(self, **kwargs: Any) -> None:
        """Initialize a new LangfuseGuardrail span."""
        kwargs["as_type"] = "guardrail"
        super().__init__(**kwargs)
```
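These typed wrappers are all thin subclasses that pin `as_type`; you normally obtain them from the client rather than constructing them directly. A sketch, assuming the v3 client's `start_as_current_observation(as_type=...)` factory returns the matching wrapper (verify the factory name against your SDK version):

```python
from langfuse import get_client

client = get_client()  # assumes LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY are set

# as_type selects the wrapper: "agent" -> LangfuseAgent, "tool" -> LangfuseTool, ...
with client.start_as_current_observation(name="plan-step", as_type="agent") as agent:
    with client.start_as_current_observation(name="get-weather", as_type="tool") as tool:
        tool.update(output={"temp_c": 21})
```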
````python
class Evaluation:
    """Represents an evaluation result for an experiment item or an entire experiment run.

    This class provides a strongly-typed way to create evaluation results in evaluator functions.
    Users must use keyword arguments when instantiating this class.

    Attributes:
        name: Unique identifier for the evaluation metric. Should be descriptive
            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
            Used for aggregation and comparison across experiment runs.
        value: The evaluation score or result. Can be:
            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
            - String: For categorical results like "positive", "negative", "neutral"
            - Boolean: For binary assessments like "passes_safety_check"
        comment: Optional human-readable explanation of the evaluation result.
            Useful for providing context, explaining scoring rationale, or noting
            special conditions. Displayed in the Langfuse UI for interpretability.
        metadata: Optional structured metadata about the evaluation process.
            Can include confidence scores, intermediate calculations, model versions,
            or any other relevant technical details.
        data_type: Optional score data type. Required if value is not NUMERIC.
            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
        config_id: Optional Langfuse score config ID.

    Examples:
        Basic accuracy evaluation:
        ```python
        from langfuse import Evaluation

        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
            if not expected_output:
                return Evaluation(name="accuracy", value=0, comment="No expected output")

            is_correct = output.strip().lower() == expected_output.strip().lower()
            return Evaluation(
                name="accuracy",
                value=1.0 if is_correct else 0.0,
                comment="Correct answer" if is_correct else "Incorrect answer"
            )
        ```

        Multi-metric evaluator:
        ```python
        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
            return [
                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
                Evaluation(
                    name="quality",
                    value=0.85,
                    comment="High quality response",
                    metadata={"confidence": 0.92, "model": "gpt-4"}
                )
            ]
        ```

        Categorical evaluation:
        ```python
        def sentiment_evaluator(*, input, output, **kwargs):
            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
            return Evaluation(
                name="sentiment",
                value=sentiment,
                comment=f"Response expresses {sentiment} sentiment",
                data_type="CATEGORICAL"
            )
        ```

        Failed evaluation with error handling:
        ```python
        def external_api_evaluator(*, input, output, **kwargs):
            try:
                score = external_api.evaluate(output)
                return Evaluation(name="external_score", value=score)
            except Exception as e:
                return Evaluation(
                    name="external_score",
                    value=0,
                    comment=f"API unavailable: {e}",
                    metadata={"error": str(e), "retry_count": 3}
                )
        ```

    Note:
        All arguments must be passed as keywords. Positional arguments are not allowed
        to ensure code clarity and prevent errors from argument reordering.
    """

    def __init__(
        self,
        *,
        name: str,
        value: Union[int, float, str, bool],
        comment: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        data_type: Optional[ExperimentScoreType] = None,
        config_id: Optional[str] = None,
    ):
        """Initialize an Evaluation with the provided data.

        Args:
            name: Unique identifier for the evaluation metric.
            value: The evaluation score or result.
            comment: Optional human-readable explanation of the result.
            metadata: Optional structured metadata about the evaluation process.
            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
            config_id: Optional Langfuse score config ID.

        Note:
            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
        """
        self.name = name
        self.value = value
        self.comment = comment
        self.metadata = metadata
        self.data_type = data_type
        self.config_id = config_id
````
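One pattern the examples above leave out is pinning an evaluation to a pre-defined score config so the server can validate values. A minimal sketch; the config ID is a hypothetical placeholder you would copy from your Langfuse project settings:

```python
from langfuse import Evaluation

def safety_evaluator(*, input, output, expected_output=None, **kwargs):
    flagged = "forbidden" in str(output).lower()  # stand-in for a real safety check
    return Evaluation(
        name="passes_safety_check",
        value=not flagged,
        data_type="BOOLEAN",   # required because the value is not numeric
        config_id="cfg-1234",  # hypothetical score config ID from your project
        comment="Flagged term found" if flagged else "No flagged terms",
    )
```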
````python
class EvaluatorInputs:
    """Input data structure for evaluators, returned by mapper functions.

    This class provides a strongly-typed container for transforming API response
    objects (traces, observations) into the standardized format expected
    by evaluator functions. It ensures consistent access to input, output, expected
    output, and metadata regardless of the source entity type.

    Attributes:
        input: The input data that was provided to generate the output being evaluated.
            For traces, this might be the initial prompt or request. For observations,
            this could be the span's input. The exact meaning depends on your use case.
        output: The actual output that was produced and needs to be evaluated.
            For traces, this is typically the final response. For observations,
            this might be the generation output or span result.
        expected_output: Optional ground truth or expected result for comparison.
            Used by evaluators to assess correctness. May be None if no ground truth
            is available for the entity being evaluated.
        metadata: Optional structured metadata providing additional context for evaluation.
            Can include information about the entity, execution context, user attributes,
            or any other relevant data that evaluators might use.

    Examples:
        Simple mapper for traces:
        ```python
        from langfuse import EvaluatorInputs

        def trace_mapper(trace):
            return EvaluatorInputs(
                input=trace.input,
                output=trace.output,
                expected_output=None,  # No ground truth available
                metadata={"user_id": trace.user_id, "tags": trace.tags}
            )
        ```

        Mapper for observations extracting specific fields:
        ```python
        def observation_mapper(observation):
            # Extract input/output from observation's data
            input_data = observation.input if hasattr(observation, 'input') else None
            output_data = observation.output if hasattr(observation, 'output') else None

            return EvaluatorInputs(
                input=input_data,
                output=output_data,
                expected_output=None,
                metadata={
                    "observation_type": observation.type,
                    "model": observation.model,
                    "latency_ms": observation.end_time - observation.start_time
                }
            )
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        input: Any,
        output: Any,
        expected_output: Any = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """Initialize EvaluatorInputs with the provided data.

        Args:
            input: The input data for evaluation.
            output: The output data to be evaluated.
            expected_output: Optional ground truth for comparison.
            metadata: Optional additional context for evaluation.

        Note:
            All arguments must be provided as keywords.
        """
        self.input = input
        self.output = output
        self.expected_output = expected_output
        self.metadata = metadata
````
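To make the mapper-to-evaluator contract concrete, a sketch wiring both pieces into run_batched_evaluation as used elsewhere in this module's examples. The keyword-only item parameter follows the MapperFunction protocol below; the filter value is illustrative:

```python
from langfuse import Evaluation, EvaluatorInputs, get_client

def trace_mapper(*, item, **kwargs):
    return EvaluatorInputs(
        input=item.input,
        output=item.output,
        expected_output=None,
        metadata={"trace_id": item.id},
    )

def non_empty_evaluator(*, input, output, expected_output=None, metadata=None, **kwargs):
    return Evaluation(name="non_empty", value=1.0 if output else 0.0)

client = get_client()
result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[non_empty_evaluator],
    filter='{"tags": ["production"]}',  # illustrative, as in the examples above
    max_items=100,
)
```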
````python
class MapperFunction(Protocol):
    """Protocol defining the interface for mapper functions in batch evaluation.

    Mapper functions transform API response objects (traces or observations)
    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
    allows you to define how to extract and structure evaluation data from different
    entity types.

    Mapper functions must:
    - Accept a single item parameter (trace, observation)
    - Return an EvaluatorInputs instance with input, output, expected_output, metadata
    - Can be either synchronous or asynchronous
    - Should handle missing or malformed data gracefully
    """

    def __call__(
        self,
        *,
        item: Union["TraceWithFullDetails", "ObservationsView"],
        **kwargs: Dict[str, Any],
    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
        """Transform an API response object into evaluator inputs.

        This method defines how to extract evaluation-relevant data from the raw
        API response object. The implementation should map entity-specific fields
        to the standardized input/output/expected_output/metadata structure.

        Args:
            item: The API response object to transform. The type depends on the scope:
                - TraceWithFullDetails: When evaluating traces
                - ObservationsView: When evaluating observations

        Returns:
            EvaluatorInputs: A structured container with:
                - input: The input data that generated the output
                - output: The output to be evaluated
                - expected_output: Optional ground truth for comparison
                - metadata: Optional additional context

            Can return either a direct EvaluatorInputs instance or an awaitable
            (for async mappers that need to fetch additional data).

        Examples:
            Basic trace mapper:
            ```python
            def map_trace(trace):
                return EvaluatorInputs(
                    input=trace.input,
                    output=trace.output,
                    expected_output=None,
                    metadata={"trace_id": trace.id, "user": trace.user_id}
                )
            ```

            Observation mapper with conditional logic:
            ```python
            def map_observation(observation):
                # Extract fields based on observation type
                if observation.type == "GENERATION":
                    input_data = observation.input
                    output_data = observation.output
                else:
                    # For other types, use different fields
                    input_data = observation.metadata.get("input")
                    output_data = observation.metadata.get("output")

                return EvaluatorInputs(
                    input=input_data,
                    output=output_data,
                    expected_output=None,
                    metadata={"obs_id": observation.id, "type": observation.type}
                )
            ```

            Async mapper (if additional processing needed):
            ```python
            async def map_trace_async(trace):
                # Could do async processing here if needed
                processed_output = await some_async_transformation(trace.output)

                return EvaluatorInputs(
                    input=trace.input,
                    output=processed_output,
                    expected_output=None,
                    metadata={"trace_id": trace.id}
                )
            ```
        """
        ...
````
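The protocol asks mappers to handle missing or malformed data gracefully; the docstring examples assume well-formed items, so here is a minimal defensive variant using getattr with defaults:

```python
from langfuse import EvaluatorInputs

def defensive_mapper(*, item, **kwargs):
    # getattr with defaults keeps one mapper usable across traces and
    # observations, and across items with partially missing fields.
    return EvaluatorInputs(
        input=getattr(item, "input", None),
        output=getattr(item, "output", None),
        expected_output=None,
        metadata={"id": getattr(item, "id", None)},
    )
```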
````python
class CompositeEvaluatorFunction(Protocol):
    """Protocol defining the interface for composite evaluator functions.

    Composite evaluators create aggregate scores from multiple item-level evaluations.
    This is commonly used to compute weighted averages, combined metrics, or other
    composite assessments based on individual evaluation results.

    Composite evaluators:
    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
      plus the list of evaluations
    - Return either a single Evaluation, a list of Evaluations, or a dict
    - Can be either synchronous or asynchronous
    - Have access to both raw item data and evaluation results
    """

    def __call__(
        self,
        *,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        expected_output: Optional[Any] = None,
        metadata: Optional[Dict[str, Any]] = None,
        evaluations: List[Evaluation],
        **kwargs: Dict[str, Any],
    ) -> Union[
        Evaluation,
        List[Evaluation],
        Dict[str, Any],
        Awaitable[Evaluation],
        Awaitable[List[Evaluation]],
        Awaitable[Dict[str, Any]],
    ]:
        r"""Create a composite evaluation from item-level evaluation results.

        This method combines multiple evaluation scores into a single composite metric.
        Common use cases include weighted averages, pass/fail decisions based on multiple
        criteria, or custom scoring logic that considers multiple dimensions.

        Args:
            input: The input data that was provided to the system being evaluated.
            output: The output generated by the system being evaluated.
            expected_output: The expected/reference output for comparison (if available).
            metadata: Additional metadata about the evaluation context.
            evaluations: List of evaluation results from item-level evaluators.
                Each evaluation contains name, value, comment, and metadata.

        Returns:
            Can return any of:
            - Evaluation: A single composite evaluation result
            - List[Evaluation]: Multiple composite evaluations
            - Dict: A dict that will be converted to an Evaluation
                - name: Identifier for the composite metric (e.g., "composite_score")
                - value: The computed composite value
                - comment: Optional explanation of how the score was computed
                - metadata: Optional details about the composition logic

            Can return either a direct Evaluation instance or an awaitable
            (for async composite evaluators).

        Examples:
            Simple weighted average:
            ```python
            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
                weights = {
                    "accuracy": 0.5,
                    "relevance": 0.3,
                    "safety": 0.2
                }

                total_score = 0.0
                total_weight = 0.0

                for eval in evaluations:
                    if eval.name in weights and isinstance(eval.value, (int, float)):
                        total_score += eval.value * weights[eval.name]
                        total_weight += weights[eval.name]

                final_score = total_score / total_weight if total_weight > 0 else 0.0

                return Evaluation(
                    name="composite_score",
                    value=final_score,
                    comment=f"Weighted average of {len(evaluations)} metrics"
                )
            ```

            Pass/fail composite based on thresholds:
            ```python
            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
                # Must pass all criteria
                thresholds = {
                    "accuracy": 0.7,
                    "safety": 0.9,
                    "relevance": 0.6
                }

                passes = True
                failing_metrics = []

                for metric, threshold in thresholds.items():
                    eval_result = next((e for e in evaluations if e.name == metric), None)
                    if eval_result and isinstance(eval_result.value, (int, float)):
                        if eval_result.value < threshold:
                            passes = False
                            failing_metrics.append(metric)

                return Evaluation(
                    name="passes_all_checks",
                    value=passes,
                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
                    data_type="BOOLEAN"
                )
            ```

            Async composite with external scoring:
            ```python
            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
                # Use an LLM to synthesize multiple evaluation results
                eval_summary = "\n".join(
                    f"- {e.name}: {e.value}" for e in evaluations
                )

                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
                prompt += f"For the output: {output}\n"
                prompt += "Provide an overall quality score from 0-1."

                client = openai.AsyncOpenAI()  # async client so the call can be awaited
                response = await client.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}]
                )

                score = float(response.choices[0].message.content.strip())

                return Evaluation(
                    name="llm_composite_score",
                    value=score,
                    comment="LLM-synthesized composite score"
                )
            ```

            Context-aware composite:
            ```python
            def context_composite(*, input, output, expected_output, metadata, evaluations):
                # Adjust weighting based on metadata
                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}

                # If metadata indicates high importance, prioritize accuracy
                if metadata and metadata.get('importance') == 'high':
                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
                else:
                    weights = base_weights

                total = sum(
                    e.value * weights.get(e.name, 0)
                    for e in evaluations
                    if isinstance(e.value, (int, float))
                )

                return Evaluation(
                    name="weighted_composite",
                    value=total,
                    comment="Context-aware weighted composite"
                )
            ```
        """
        ...
````
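None of the examples above uses the dict return form that the protocol allows; a minimal sketch of a composite that returns a plain dict for the batch runner to convert into an Evaluation:

```python
def dict_composite(*, input=None, output=None, expected_output=None, metadata=None, evaluations, **kwargs):
    numeric = [e.value for e in evaluations if isinstance(e.value, (int, float))]
    mean = sum(numeric) / len(numeric) if numeric else 0.0
    # Same fields as Evaluation; the batch runner performs the conversion.
    return {
        "name": "mean_score",
        "value": mean,
        "comment": f"Unweighted mean of {len(numeric)} numeric scores",
    }
```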
````python
class EvaluatorStats:
    """Statistics for a single evaluator's performance during batch evaluation.

    This class tracks detailed metrics about how a specific evaluator performed
    across all items in a batch evaluation run. It helps identify evaluator issues,
    understand reliability, and optimize evaluation pipelines.

    Attributes:
        name: The name of the evaluator function (extracted from __name__).
        total_runs: Total number of times the evaluator was invoked.
        successful_runs: Number of times the evaluator completed successfully.
        failed_runs: Number of times the evaluator raised an exception or failed.
        total_scores_created: Total number of evaluation scores created by this evaluator.
            Can be higher than successful_runs if the evaluator returns multiple scores.

    Examples:
        Accessing evaluator stats from batch evaluation result:
        ```python
        result = client.run_batched_evaluation(...)

        for stats in result.evaluator_stats:
            print(f"Evaluator: {stats.name}")
            print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
            print(f"  Scores created: {stats.total_scores_created}")

            if stats.failed_runs > 0:
                print(f"  ⚠️ Failed {stats.failed_runs} times")
        ```

        Identifying problematic evaluators:
        ```python
        result = client.run_batched_evaluation(...)

        # Find evaluators with high failure rates
        for stats in result.evaluator_stats:
            failure_rate = stats.failed_runs / stats.total_runs
            if failure_rate > 0.1:  # More than 10% failures
                print(f"⚠️ {stats.name} has {failure_rate:.1%} failure rate")
                print("  Consider debugging or removing this evaluator")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        name: str,
        total_runs: int = 0,
        successful_runs: int = 0,
        failed_runs: int = 0,
        total_scores_created: int = 0,
    ):
        """Initialize EvaluatorStats with the provided metrics.

        Args:
            name: The evaluator function name.
            total_runs: Total number of evaluator invocations.
            successful_runs: Number of successful completions.
            failed_runs: Number of failures.
            total_scores_created: Total scores created by this evaluator.

        Note:
            All arguments must be provided as keywords.
        """
        self.name = name
        self.total_runs = total_runs
        self.successful_runs = successful_runs
        self.failed_runs = failed_runs
        self.total_scores_created = total_scores_created
````
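A small follow-on pattern: rolling per-evaluator stats up into run-wide totals, e.g. for dashboards or alerting. This uses only the attributes documented above:

```python
from typing import Dict, List

def summarize(evaluator_stats: List["EvaluatorStats"]) -> Dict[str, float]:
    total_runs = sum(s.total_runs for s in evaluator_stats)
    total_failures = sum(s.failed_runs for s in evaluator_stats)
    return {
        "runs": total_runs,
        "failures": total_failures,
        "failure_rate": total_failures / total_runs if total_runs else 0.0,
    }
```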
````python
class BatchEvaluationResumeToken:
    """Token for resuming a failed batch evaluation run.

    This class encapsulates all the information needed to resume a batch evaluation
    that was interrupted or failed partway through. It uses timestamp-based filtering
    to avoid re-processing items that were already evaluated, even if the underlying
    dataset changed between runs.

    Attributes:
        scope: The type of items being evaluated ("traces", "observations").
        filter: The original JSON filter string used to query items.
        last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item.
            Used to construct a filter that only fetches items after this timestamp.
        last_processed_id: The ID of the last successfully processed item, for reference.
        items_processed: Count of items successfully processed before interruption.

    Examples:
        Resuming a failed batch evaluation:
        ```python
        # Initial run that stops partway through; the returned result carries
        # a resume token when the run did not complete.
        result = client.run_batched_evaluation(
            scope="traces",
            mapper=my_mapper,
            evaluators=[evaluator1, evaluator2],
            filter='{"tags": ["production"]}',
            max_items=10000
        )

        # Save the resume token if the run did not complete
        if not result.completed and result.resume_token:
            # Store resume token for later (e.g., in a file or database)
            import json
            with open("resume_token.json", "w") as f:
                json.dump({
                    "scope": result.resume_token.scope,
                    "filter": result.resume_token.filter,
                    "last_timestamp": result.resume_token.last_processed_timestamp,
                    "last_id": result.resume_token.last_processed_id,
                    "items_done": result.resume_token.items_processed
                }, f)

        # Later, resume from where it left off
        with open("resume_token.json") as f:
            token_data = json.load(f)

        resume_token = BatchEvaluationResumeToken(
            scope=token_data["scope"],
            filter=token_data["filter"],
            last_processed_timestamp=token_data["last_timestamp"],
            last_processed_id=token_data["last_id"],
            items_processed=token_data["items_done"]
        )

        # Resume the evaluation
        result = client.run_batched_evaluation(
            scope="traces",
            mapper=my_mapper,
            evaluators=[evaluator1, evaluator2],
            resume_from=resume_token
        )

        print(f"Processed {result.total_items_processed} additional items")
        ```

        Handling partial completion:
        ```python
        result = client.run_batched_evaluation(...)

        if not result.completed:
            print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
            print(f"Last item: {result.resume_token.last_processed_id}")
            print(f"Resume from: {result.resume_token.last_processed_timestamp}")

            # Optionally retry automatically
            if result.resume_token:
                print("Retrying...")
                result = client.run_batched_evaluation(
                    scope=result.resume_token.scope,
                    mapper=my_mapper,
                    evaluators=my_evaluators,
                    resume_from=result.resume_token
                )
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
        The timestamp-based approach means that items created after the initial run
        but with timestamps at or before the resume point will be skipped. This is
        intentional to avoid duplicates and ensure consistent evaluation.
    """

    def __init__(
        self,
        *,
        scope: str,
        filter: Optional[str],
        last_processed_timestamp: str,
        last_processed_id: str,
        items_processed: int,
    ):
        """Initialize BatchEvaluationResumeToken with the provided state.

        Args:
            scope: The scope type ("traces", "observations").
            filter: The original JSON filter string.
            last_processed_timestamp: ISO 8601 timestamp of last processed item.
            last_processed_id: ID of last processed item.
            items_processed: Count of items processed before interruption.

        Note:
            All arguments must be provided as keywords.
        """
        self.scope = scope
        self.filter = filter
        self.last_processed_timestamp = last_processed_timestamp
        self.last_processed_id = last_processed_id
        self.items_processed = items_processed
````
````python
class BatchEvaluationResult:
    r"""Complete result structure for batch evaluation execution.

    This class encapsulates comprehensive statistics and metadata about a batch
    evaluation run, including counts, evaluator-specific metrics, timing information,
    error details, and resume capability.

    Attributes:
        total_items_fetched: Total number of items fetched from the API.
        total_items_processed: Number of items successfully evaluated.
        total_items_failed: Number of items that failed during evaluation.
        total_scores_created: Total scores created by all item-level evaluators.
        total_composite_scores_created: Scores created by the composite evaluator.
        total_evaluations_failed: Number of individual evaluator failures across all items.
        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
        resume_token: Token for resuming if evaluation was interrupted (None if completed).
        completed: True if all items were processed, False if stopped early or failed.
        duration_seconds: Total time taken to execute the batch evaluation.
        failed_item_ids: List of IDs for items that failed evaluation.
        error_summary: Dictionary mapping error types to occurrence counts.
        has_more_items: True if max_items limit was reached but more items exist.
        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).

    Examples:
        Basic result inspection:
        ```python
        result = client.run_batched_evaluation(...)

        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
        print(f"Scores created: {result.total_scores_created}")
        print(f"Duration: {result.duration_seconds:.2f}s")
        print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
        ```

        Detailed analysis with evaluator stats:
        ```python
        result = client.run_batched_evaluation(...)

        print("\nBatch Evaluation Results")
        print(f"{'='*50}")
        print(f"Items processed: {result.total_items_processed}")
        print(f"Items failed: {result.total_items_failed}")
        print(f"Scores created: {result.total_scores_created}")

        if result.total_composite_scores_created > 0:
            print(f"Composite scores: {result.total_composite_scores_created}")

        print("\nEvaluator Performance:")
        for stats in result.evaluator_stats:
            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
            print(f"\n  {stats.name}:")
            print(f"    Success rate: {success_rate:.1%}")
            print(f"    Scores created: {stats.total_scores_created}")
            if stats.failed_runs > 0:
                print(f"    ⚠️ Failures: {stats.failed_runs}")

        if result.error_summary:
            print("\n⚠️ Errors encountered:")
            for error_type, count in result.error_summary.items():
                print(f"  {error_type}: {count}")
        ```

        Handling incomplete runs:
        ```python
        result = client.run_batched_evaluation(...)

        if not result.completed:
            print("⚠️ Evaluation incomplete!")

            if result.resume_token:
                print(f"Processed {result.resume_token.items_processed} items before failure")
                print("Use resume_from parameter to continue from:")
                print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
                print(f"  Last ID: {result.resume_token.last_processed_id}")

        if result.has_more_items:
            print("ℹ️ More items available beyond max_items limit")
        ```

        Performance monitoring:
        ```python
        result = client.run_batched_evaluation(...)

        items_per_second = result.total_items_processed / result.duration_seconds
        avg_scores_per_item = result.total_scores_created / result.total_items_processed

        print("Performance metrics:")
        print(f"  Throughput: {items_per_second:.2f} items/second")
        print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
        print(f"  Total duration: {result.duration_seconds:.2f}s")

        if result.total_evaluations_failed > 0:
            failure_rate = result.total_evaluations_failed / (
                result.total_items_processed * len(result.evaluator_stats)
            )
            print(f"  Evaluation failure rate: {failure_rate:.1%}")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        total_items_fetched: int,
        total_items_processed: int,
        total_items_failed: int,
        total_scores_created: int,
        total_composite_scores_created: int,
        total_evaluations_failed: int,
        evaluator_stats: List[EvaluatorStats],
        resume_token: Optional[BatchEvaluationResumeToken],
        completed: bool,
        duration_seconds: float,
        failed_item_ids: List[str],
        error_summary: Dict[str, int],
        has_more_items: bool,
        item_evaluations: Dict[str, List["Evaluation"]],
    ):
        """Initialize BatchEvaluationResult with comprehensive statistics.

        Args:
            total_items_fetched: Total items fetched from API.
            total_items_processed: Items successfully evaluated.
            total_items_failed: Items that failed evaluation.
            total_scores_created: Scores from item-level evaluators.
            total_composite_scores_created: Scores from composite evaluator.
            total_evaluations_failed: Individual evaluator failures.
            evaluator_stats: Per-evaluator statistics.
            resume_token: Token for resuming (None if completed).
            completed: Whether all items were processed.
            duration_seconds: Total execution time.
            failed_item_ids: IDs of failed items.
            error_summary: Error types and counts.
            has_more_items: Whether more items exist beyond max_items.
            item_evaluations: Dictionary mapping item IDs to their evaluation results.

        Note:
            All arguments must be provided as keywords.
        """
        self.total_items_fetched = total_items_fetched
        self.total_items_processed = total_items_processed
        self.total_items_failed = total_items_failed
        self.total_scores_created = total_scores_created
        self.total_composite_scores_created = total_composite_scores_created
        self.total_evaluations_failed = total_evaluations_failed
        self.evaluator_stats = evaluator_stats
        self.resume_token = resume_token
        self.completed = completed
        self.duration_seconds = duration_seconds
        self.failed_item_ids = failed_item_ids
        self.error_summary = error_summary
        self.has_more_items = has_more_items
        self.item_evaluations = item_evaluations

    def __str__(self) -> str:
        """Return a formatted string representation of the batch evaluation results.

        Returns:
            A multi-line string with a summary of the evaluation results.
        """
        lines = []
        lines.append("=" * 60)
        lines.append("Batch Evaluation Results")
        lines.append("=" * 60)

        # Summary statistics
        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
        lines.append(f"Duration: {self.duration_seconds:.2f}s")
        lines.append(f"\nItems fetched: {self.total_items_fetched}")
        lines.append(f"Items processed: {self.total_items_processed}")

        if self.total_items_failed > 0:
            lines.append(f"Items failed: {self.total_items_failed}")

        # Success rate
        if self.total_items_fetched > 0:
            success_rate = self.total_items_processed / self.total_items_fetched * 100
            lines.append(f"Success rate: {success_rate:.1f}%")

        # Scores created
        lines.append(f"\nScores created: {self.total_scores_created}")
        if self.total_composite_scores_created > 0:
            lines.append(f"Composite scores: {self.total_composite_scores_created}")

        total_scores = self.total_scores_created + self.total_composite_scores_created
        lines.append(f"Total scores: {total_scores}")

        # Evaluator statistics
        if self.evaluator_stats:
            lines.append("\nEvaluator Performance:")
            for stats in self.evaluator_stats:
                lines.append(f"  {stats.name}:")
                if stats.total_runs > 0:
                    success_rate = (
                        stats.successful_runs / stats.total_runs * 100
                        if stats.total_runs > 0
                        else 0
                    )
                    lines.append(
                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
                        f"({success_rate:.1f}% success)"
                    )
                lines.append(f"    Scores created: {stats.total_scores_created}")
                if stats.failed_runs > 0:
                    lines.append(f"    Failed runs: {stats.failed_runs}")

        # Performance metrics
        if self.total_items_processed > 0 and self.duration_seconds > 0:
            items_per_sec = self.total_items_processed / self.duration_seconds
            lines.append("\nPerformance:")
            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
            if self.total_scores_created > 0:
                avg_scores = self.total_scores_created / self.total_items_processed
                lines.append(f"  Avg scores per item: {avg_scores:.2f}")

        # Errors and warnings
        if self.error_summary:
            lines.append("\nErrors encountered:")
            for error_type, count in self.error_summary.items():
                lines.append(f"  {error_type}: {count}")

        # Incomplete run information
        if not self.completed:
            lines.append("\nWarning: Evaluation incomplete")
            if self.resume_token:
                lines.append(
                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
                )
                lines.append(f"  Items processed: {self.resume_token.items_processed}")
                lines.append("  Use resume_from parameter to continue")

        if self.has_more_items:
            lines.append("\nNote: More items available beyond max_items limit")

        lines.append("=" * 60)
        return "\n".join(lines)
````
Complete result structure for batch evaluation execution.
This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.
Attributes:
- total_items_fetched: Total number of items fetched from the API.
- total_items_processed: Number of items successfully evaluated.
- total_items_failed: Number of items that failed during evaluation.
- total_scores_created: Total scores created by all item-level evaluators.
- total_composite_scores_created: Scores created by the composite evaluator.
- total_evaluations_failed: Number of individual evaluator failures across all items.
- evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
- resume_token: Token for resuming if evaluation was interrupted (None if completed).
- completed: True if all items were processed, False if stopped early or failed.
- duration_seconds: Total time taken to execute the batch evaluation.
- failed_item_ids: List of IDs for items that failed evaluation.
- error_summary: Dictionary mapping error types to occurrence counts.
- has_more_items: True if max_items limit was reached but more items exist.
- item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:
Basic result inspection:

    result = client.run_batched_evaluation(...)
    print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
    print(f"Scores created: {result.total_scores_created}")
    print(f"Duration: {result.duration_seconds:.2f}s")
    print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")

Detailed analysis with evaluator stats:

    result = client.run_batched_evaluation(...)
    print("\nBatch Evaluation Results")
    print(f"{'=' * 50}")
    print(f"Items processed: {result.total_items_processed}")
    print(f"Items failed: {result.total_items_failed}")
    print(f"Scores created: {result.total_scores_created}")
    if result.total_composite_scores_created > 0:
        print(f"Composite scores: {result.total_composite_scores_created}")
    print("\nEvaluator Performance:")
    for stats in result.evaluator_stats:
        success_rate = (
            stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
        )
        print(f"\n  {stats.name}:")
        print(f"    Success rate: {success_rate:.1%}")
        print(f"    Scores created: {stats.total_scores_created}")
        if stats.failed_runs > 0:
            print(f"    Failures: {stats.failed_runs}")
    if result.error_summary:
        print("\nErrors encountered:")
        for error_type, count in result.error_summary.items():
            print(f"  {error_type}: {count}")

Handling incomplete runs:

    result = client.run_batched_evaluation(...)
    if not result.completed:
        print("Warning: evaluation incomplete!")
        if result.resume_token:
            print(f"Processed {result.resume_token.items_processed} items before failure")
            print("Use resume_from parameter to continue from:")
            print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
            print(f"  Last ID: {result.resume_token.last_processed_id}")
    if result.has_more_items:
        print("More items available beyond max_items limit")

Performance monitoring:

    result = client.run_batched_evaluation(...)
    items_per_second = result.total_items_processed / result.duration_seconds
    avg_scores_per_item = result.total_scores_created / result.total_items_processed
    print("Performance metrics:")
    print(f"  Throughput: {items_per_second:.2f} items/second")
    print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
    print(f"  Total duration: {result.duration_seconds:.2f}s")
    if result.total_evaluations_failed > 0:
        failure_rate = result.total_evaluations_failed / (
            result.total_items_processed * len(result.evaluator_stats)
        )
        print(f"  Evaluation failure rate: {failure_rate:.1%}")
Note:
All arguments must be passed as keywords when instantiating this class.
def __init__(
    self,
    *,
    total_items_fetched: int,
    total_items_processed: int,
    total_items_failed: int,
    total_scores_created: int,
    total_composite_scores_created: int,
    total_evaluations_failed: int,
    evaluator_stats: List[EvaluatorStats],
    resume_token: Optional[BatchEvaluationResumeToken],
    completed: bool,
    duration_seconds: float,
    failed_item_ids: List[str],
    error_summary: Dict[str, int],
    has_more_items: bool,
    item_evaluations: Dict[str, List["Evaluation"]],
):
    """Initialize BatchEvaluationResult with comprehensive statistics.

    Args:
        total_items_fetched: Total items fetched from API.
        total_items_processed: Items successfully evaluated.
        total_items_failed: Items that failed evaluation.
        total_scores_created: Scores from item-level evaluators.
        total_composite_scores_created: Scores from composite evaluator.
        total_evaluations_failed: Individual evaluator failures.
        evaluator_stats: Per-evaluator statistics.
        resume_token: Token for resuming (None if completed).
        completed: Whether all items were processed.
        duration_seconds: Total execution time.
        failed_item_ids: IDs of failed items.
        error_summary: Error types and counts.
        has_more_items: Whether more items exist beyond max_items.
        item_evaluations: Dictionary mapping item IDs to their evaluation results.

    Note:
        All arguments must be provided as keywords.
    """
    self.total_items_fetched = total_items_fetched
    self.total_items_processed = total_items_processed
    self.total_items_failed = total_items_failed
    self.total_scores_created = total_scores_created
    self.total_composite_scores_created = total_composite_scores_created
    self.total_evaluations_failed = total_evaluations_failed
    self.evaluator_stats = evaluator_stats
    self.resume_token = resume_token
    self.completed = completed
    self.duration_seconds = duration_seconds
    self.failed_item_ids = failed_item_ids
    self.error_summary = error_summary
    self.has_more_items = has_more_items
    self.item_evaluations = item_evaluations
Initialize BatchEvaluationResult with comprehensive statistics.
Arguments:
- total_items_fetched: Total items fetched from API.
- total_items_processed: Items successfully evaluated.
- total_items_failed: Items that failed evaluation.
- total_scores_created: Scores from item-level evaluators.
- total_composite_scores_created: Scores from composite evaluator.
- total_evaluations_failed: Individual evaluator failures.
- evaluator_stats: Per-evaluator statistics.
- resume_token: Token for resuming (None if completed).
- completed: Whether all items were processed.
- duration_seconds: Total execution time.
- failed_item_ids: IDs of failed items.
- error_summary: Error types and counts.
- has_more_items: Whether more items exist beyond max_items.
- item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:
All arguments must be provided as keywords.
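
To make the keyword-only contract concrete, here is a minimal construction sketch. All field values below are invented for illustration; in practice this object is returned by client.run_batched_evaluation rather than built by hand:

    from langfuse.batch_evaluation import BatchEvaluationResult

    # Illustrative values only; normally produced by run_batched_evaluation.
    result = BatchEvaluationResult(
        total_items_fetched=10,
        total_items_processed=9,
        total_items_failed=1,
        total_scores_created=18,
        total_composite_scores_created=9,
        total_evaluations_failed=0,
        evaluator_stats=[],
        resume_token=None,
        completed=True,
        duration_seconds=4.2,
        failed_item_ids=["item-7"],
        error_summary={"TimeoutError": 1},
        has_more_items=False,
        item_evaluations={},
    )
    print(result)  # __str__ renders the formatted multi-line summary

    # Positional arguments are rejected by the bare ``*`` in the signature:
    # BatchEvaluationResult(10, 9, ...)  -> TypeError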
def is_default_export_span(span: ReadableSpan) -> bool:
    """Return whether a span should be exported by default."""
    return (
        is_langfuse_span(span) or is_genai_span(span) or is_known_llm_instrumentor(span)
    )
Return whether a span should be exported by default.
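
Because is_default_export_span is just the disjunction of the three predicates below, it composes naturally with a custom export filter. A minimal sketch, assuming the Langfuse client's should_export_span parameter (the documented replacement for blocked_instrumentation_scopes) accepts a predicate over ReadableSpan; the "internal.noise" scope name is made up:

    from opentelemetry.sdk.trace import ReadableSpan

    from langfuse import Langfuse, is_default_export_span


    def my_export_filter(span: ReadableSpan) -> bool:
        # Keep the SDK's default export behavior, but additionally drop
        # spans from a hypothetical in-house instrumentation scope.
        scope = span.instrumentation_scope
        if scope is not None and scope.name == "internal.noise":
            return False
        return is_default_export_span(span)


    client = Langfuse(should_export_span=my_export_filter)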
def is_langfuse_span(span: ReadableSpan) -> bool:
    """Return whether the span was created by the Langfuse SDK tracer."""
    return (
        span.instrumentation_scope is not None
        and span.instrumentation_scope.name == LANGFUSE_TRACER_NAME
    )
Return whether the span was created by the Langfuse SDK tracer.
def is_genai_span(span: ReadableSpan) -> bool:
    """Return whether the span has any ``gen_ai.*`` semantic convention attribute."""
    if span.attributes is None:
        return False

    return any(
        isinstance(key, str) and key.startswith("gen_ai")
        for key in span.attributes.keys()
    )
Return whether the span has any gen_ai.* semantic convention attribute.
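
A small sketch of the attribute check using the OpenTelemetry SDK's in-memory exporter; the tracer name and attribute value are arbitrary. It also shows that is_langfuse_span returns False here, since the scope is not the Langfuse tracer:

    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import SimpleSpanProcessor
    from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter

    from langfuse import is_genai_span, is_langfuse_span

    provider = TracerProvider()
    exporter = InMemorySpanExporter()
    provider.add_span_processor(SimpleSpanProcessor(exporter))
    tracer = provider.get_tracer("demo-instrumentation")

    # Any attribute key starting with "gen_ai" marks the span as a GenAI span.
    with tracer.start_as_current_span("llm-call") as span:
        span.set_attribute("gen_ai.system", "openai")

    finished = exporter.get_finished_spans()[0]
    print(is_genai_span(finished))     # True: has a gen_ai.* attribute
    print(is_langfuse_span(finished))  # False: scope is not the Langfuse tracer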
def is_known_llm_instrumentor(span: ReadableSpan) -> bool:
    """Return whether the span comes from a known LLM instrumentation scope."""
    if span.instrumentation_scope is None:
        return False

    scope_name = span.instrumentation_scope.name

    return any(
        _matches_scope_prefix(scope_name, prefix)
        for prefix in KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES
    )
Return whether the span comes from a known LLM instrumentation scope.
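
The prefix list itself is exported, so you can inspect which scopes are recognized. A quick sketch; the source above iterates this constant, so it is a plain iterable of scope-name prefix strings:

    from langfuse import KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES

    # Print the recognized instrumentation-scope prefixes.
    for prefix in sorted(KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES):
        print(prefix)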