langfuse

Langfuse Python SDK
Installation
The SDK was rewritten for v3, released in June 2025. If you are upgrading from an earlier version, refer to the v3 migration guide for instructions on updating your code.
pip install langfuse
Docs
Please see our docs for detailed information on this SDK.
1""".. include:: ../README.md""" 2 3from langfuse.batch_evaluation import ( 4 BatchEvaluationResult, 5 BatchEvaluationResumeToken, 6 CompositeEvaluatorFunction, 7 EvaluatorInputs, 8 EvaluatorStats, 9 MapperFunction, 10) 11from langfuse.experiment import Evaluation 12 13from ._client import client as _client_module 14from ._client.attributes import LangfuseOtelSpanAttributes 15from ._client.constants import ObservationTypeLiteral 16from ._client.get_client import get_client 17from ._client.observe import observe 18from ._client.propagation import propagate_attributes 19from ._client.span import ( 20 LangfuseAgent, 21 LangfuseChain, 22 LangfuseEmbedding, 23 LangfuseEvaluator, 24 LangfuseEvent, 25 LangfuseGeneration, 26 LangfuseGuardrail, 27 LangfuseRetriever, 28 LangfuseSpan, 29 LangfuseTool, 30) 31from .span_filter import ( 32 KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES, 33 is_default_export_span, 34 is_genai_span, 35 is_known_llm_instrumentor, 36 is_langfuse_span, 37) 38 39Langfuse = _client_module.Langfuse 40 41__all__ = [ 42 "Langfuse", 43 "get_client", 44 "observe", 45 "propagate_attributes", 46 "ObservationTypeLiteral", 47 "LangfuseSpan", 48 "LangfuseGeneration", 49 "LangfuseEvent", 50 "LangfuseOtelSpanAttributes", 51 "LangfuseAgent", 52 "LangfuseTool", 53 "LangfuseChain", 54 "LangfuseEmbedding", 55 "LangfuseEvaluator", 56 "LangfuseRetriever", 57 "LangfuseGuardrail", 58 "Evaluation", 59 "EvaluatorInputs", 60 "MapperFunction", 61 "CompositeEvaluatorFunction", 62 "EvaluatorStats", 63 "BatchEvaluationResumeToken", 64 "BatchEvaluationResult", 65 "is_default_export_span", 66 "is_langfuse_span", 67 "is_genai_span", 68 "is_known_llm_instrumentor", 69 "KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES", 70 "experiment", 71 "api", 72]
134class Langfuse: 135 """Main client for Langfuse tracing and platform features. 136 137 This class provides an interface for creating and managing traces, spans, 138 and generations in Langfuse as well as interacting with the Langfuse API. 139 140 The client features a thread-safe singleton pattern for each unique public API key, 141 ensuring consistent trace context propagation across your application. It implements 142 efficient batching of spans with configurable flush settings and includes background 143 thread management for media uploads and score ingestion. 144 145 Configuration is flexible through either direct parameters or environment variables, 146 with graceful fallbacks and runtime configuration updates. 147 148 Attributes: 149 api: Synchronous API client for Langfuse backend communication 150 async_api: Asynchronous API client for Langfuse backend communication 151 _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components 152 153 Parameters: 154 public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable. 155 secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable. 156 base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable. 157 host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com". 158 timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds. 159 httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created. 160 debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable. 161 tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. 
Can also be set via LANGFUSE_TRACING_ENABLED environment variable. 162 flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable. 163 flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable. 164 environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'. 165 release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release. 166 media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable. 167 sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable. 168 mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API. 169 blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior: 170 ```python 171 from langfuse.span_filter import is_default_export_span 172 blocked = {"sqlite", "requests"} 173 174 should_export_span = lambda span: ( 175 is_default_export_span(span) 176 and ( 177 span.instrumentation_scope is None 178 or span.instrumentation_scope.name not in blocked 179 ) 180 ) 181 ``` 182 should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes). 
183 additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. 184 tracer_provider(Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees. 185 186 Example: 187 ```python 188 from langfuse.otel import Langfuse 189 190 # Initialize the client (reads from env vars if not provided) 191 langfuse = Langfuse( 192 public_key="your-public-key", 193 secret_key="your-secret-key", 194 host="https://cloud.langfuse.com", # Optional, default shown 195 ) 196 197 # Create a trace span 198 with langfuse.start_as_current_observation(name="process-query") as span: 199 # Your application code here 200 201 # Create a nested generation span for an LLM call 202 with span.start_as_current_generation( 203 name="generate-response", 204 model="gpt-4", 205 input={"query": "Tell me about AI"}, 206 model_parameters={"temperature": 0.7, "max_tokens": 500} 207 ) as generation: 208 # Generate response here 209 response = "AI is a field of computer science..." 
210 211 generation.update( 212 output=response, 213 usage_details={"prompt_tokens": 10, "completion_tokens": 50}, 214 cost_details={"total_cost": 0.0023} 215 ) 216 217 # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL) 218 generation.score(name="relevance", value=0.95, data_type="NUMERIC") 219 ``` 220 """ 221 222 _resources: Optional[LangfuseResourceManager] = None 223 _mask: Optional[MaskFunction] = None 224 _otel_tracer: otel_trace_api.Tracer 225 226 def __init__( 227 self, 228 *, 229 public_key: Optional[str] = None, 230 secret_key: Optional[str] = None, 231 base_url: Optional[str] = None, 232 host: Optional[str] = None, 233 timeout: Optional[int] = None, 234 httpx_client: Optional[httpx.Client] = None, 235 debug: bool = False, 236 tracing_enabled: Optional[bool] = True, 237 flush_at: Optional[int] = None, 238 flush_interval: Optional[float] = None, 239 environment: Optional[str] = None, 240 release: Optional[str] = None, 241 media_upload_thread_count: Optional[int] = None, 242 sample_rate: Optional[float] = None, 243 mask: Optional[MaskFunction] = None, 244 blocked_instrumentation_scopes: Optional[List[str]] = None, 245 should_export_span: Optional[Callable[[ReadableSpan], bool]] = None, 246 additional_headers: Optional[Dict[str, str]] = None, 247 tracer_provider: Optional[TracerProvider] = None, 248 ): 249 self._base_url = ( 250 base_url 251 or os.environ.get(LANGFUSE_BASE_URL) 252 or host 253 or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com") 254 ) 255 self._environment = environment or cast( 256 str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT) 257 ) 258 self._release = ( 259 release 260 or os.environ.get(LANGFUSE_RELEASE, None) 261 or get_common_release_envs() 262 ) 263 self._project_id: Optional[str] = None 264 sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0)) 265 if not 0.0 <= sample_rate <= 1.0: 266 raise ValueError( 267 f"Sample rate must be between 0.0 and 1.0, got {sample_rate}" 268 ) 269 270 
timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5)) 271 272 self._tracing_enabled = ( 273 tracing_enabled 274 and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false" 275 ) 276 if not self._tracing_enabled: 277 langfuse_logger.info( 278 "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API." 279 ) 280 281 debug = ( 282 debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true") 283 ) 284 if debug: 285 logging.basicConfig( 286 format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 287 ) 288 langfuse_logger.setLevel(logging.DEBUG) 289 290 public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY) 291 if public_key is None: 292 langfuse_logger.warning( 293 "Authentication error: Langfuse client initialized without public_key. Client will be disabled. " 294 "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. " 295 ) 296 self._otel_tracer = otel_trace_api.NoOpTracer() 297 return 298 299 secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY) 300 if secret_key is None: 301 langfuse_logger.warning( 302 "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. " 303 "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. " 304 ) 305 self._otel_tracer = otel_trace_api.NoOpTracer() 306 return 307 308 if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true": 309 langfuse_logger.warning( 310 "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI." 311 ) 312 313 if blocked_instrumentation_scopes is not None: 314 warnings.warn( 315 "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. " 316 "Use `should_export_span` instead. 
Example: " 317 "from langfuse.span_filter import is_default_export_span; " 318 'blocked={"scope"}; should_export_span=lambda span: ' 319 "is_default_export_span(span) and (span.instrumentation_scope is None or " 320 "span.instrumentation_scope.name not in blocked).", 321 DeprecationWarning, 322 stacklevel=2, 323 ) 324 325 # Initialize api and tracer if requirements are met 326 self._resources = LangfuseResourceManager( 327 public_key=public_key, 328 secret_key=secret_key, 329 base_url=self._base_url, 330 timeout=timeout, 331 environment=self._environment, 332 release=release, 333 flush_at=flush_at, 334 flush_interval=flush_interval, 335 httpx_client=httpx_client, 336 media_upload_thread_count=media_upload_thread_count, 337 sample_rate=sample_rate, 338 mask=mask, 339 tracing_enabled=self._tracing_enabled, 340 blocked_instrumentation_scopes=blocked_instrumentation_scopes, 341 should_export_span=should_export_span, 342 additional_headers=additional_headers, 343 tracer_provider=tracer_provider, 344 ) 345 self._mask = self._resources.mask 346 347 self._otel_tracer = ( 348 self._resources.tracer 349 if self._tracing_enabled and self._resources.tracer is not None 350 else otel_trace_api.NoOpTracer() 351 ) 352 self.api = self._resources.api 353 self.async_api = self._resources.async_api 354 355 @overload 356 def start_observation( 357 self, 358 *, 359 trace_context: Optional[TraceContext] = None, 360 name: str, 361 as_type: Literal["generation"], 362 input: Optional[Any] = None, 363 output: Optional[Any] = None, 364 metadata: Optional[Any] = None, 365 version: Optional[str] = None, 366 level: Optional[SpanLevel] = None, 367 status_message: Optional[str] = None, 368 completion_start_time: Optional[datetime] = None, 369 model: Optional[str] = None, 370 model_parameters: Optional[Dict[str, MapValue]] = None, 371 usage_details: Optional[Dict[str, int]] = None, 372 cost_details: Optional[Dict[str, float]] = None, 373 prompt: Optional[PromptClient] = None, 374 ) -> 
LangfuseGeneration: ... 375 376 @overload 377 def start_observation( 378 self, 379 *, 380 trace_context: Optional[TraceContext] = None, 381 name: str, 382 as_type: Literal["span"] = "span", 383 input: Optional[Any] = None, 384 output: Optional[Any] = None, 385 metadata: Optional[Any] = None, 386 version: Optional[str] = None, 387 level: Optional[SpanLevel] = None, 388 status_message: Optional[str] = None, 389 ) -> LangfuseSpan: ... 390 391 @overload 392 def start_observation( 393 self, 394 *, 395 trace_context: Optional[TraceContext] = None, 396 name: str, 397 as_type: Literal["agent"], 398 input: Optional[Any] = None, 399 output: Optional[Any] = None, 400 metadata: Optional[Any] = None, 401 version: Optional[str] = None, 402 level: Optional[SpanLevel] = None, 403 status_message: Optional[str] = None, 404 ) -> LangfuseAgent: ... 405 406 @overload 407 def start_observation( 408 self, 409 *, 410 trace_context: Optional[TraceContext] = None, 411 name: str, 412 as_type: Literal["tool"], 413 input: Optional[Any] = None, 414 output: Optional[Any] = None, 415 metadata: Optional[Any] = None, 416 version: Optional[str] = None, 417 level: Optional[SpanLevel] = None, 418 status_message: Optional[str] = None, 419 ) -> LangfuseTool: ... 420 421 @overload 422 def start_observation( 423 self, 424 *, 425 trace_context: Optional[TraceContext] = None, 426 name: str, 427 as_type: Literal["chain"], 428 input: Optional[Any] = None, 429 output: Optional[Any] = None, 430 metadata: Optional[Any] = None, 431 version: Optional[str] = None, 432 level: Optional[SpanLevel] = None, 433 status_message: Optional[str] = None, 434 ) -> LangfuseChain: ... 
435 436 @overload 437 def start_observation( 438 self, 439 *, 440 trace_context: Optional[TraceContext] = None, 441 name: str, 442 as_type: Literal["retriever"], 443 input: Optional[Any] = None, 444 output: Optional[Any] = None, 445 metadata: Optional[Any] = None, 446 version: Optional[str] = None, 447 level: Optional[SpanLevel] = None, 448 status_message: Optional[str] = None, 449 ) -> LangfuseRetriever: ... 450 451 @overload 452 def start_observation( 453 self, 454 *, 455 trace_context: Optional[TraceContext] = None, 456 name: str, 457 as_type: Literal["evaluator"], 458 input: Optional[Any] = None, 459 output: Optional[Any] = None, 460 metadata: Optional[Any] = None, 461 version: Optional[str] = None, 462 level: Optional[SpanLevel] = None, 463 status_message: Optional[str] = None, 464 ) -> LangfuseEvaluator: ... 465 466 @overload 467 def start_observation( 468 self, 469 *, 470 trace_context: Optional[TraceContext] = None, 471 name: str, 472 as_type: Literal["embedding"], 473 input: Optional[Any] = None, 474 output: Optional[Any] = None, 475 metadata: Optional[Any] = None, 476 version: Optional[str] = None, 477 level: Optional[SpanLevel] = None, 478 status_message: Optional[str] = None, 479 completion_start_time: Optional[datetime] = None, 480 model: Optional[str] = None, 481 model_parameters: Optional[Dict[str, MapValue]] = None, 482 usage_details: Optional[Dict[str, int]] = None, 483 cost_details: Optional[Dict[str, float]] = None, 484 prompt: Optional[PromptClient] = None, 485 ) -> LangfuseEmbedding: ... 486 487 @overload 488 def start_observation( 489 self, 490 *, 491 trace_context: Optional[TraceContext] = None, 492 name: str, 493 as_type: Literal["guardrail"], 494 input: Optional[Any] = None, 495 output: Optional[Any] = None, 496 metadata: Optional[Any] = None, 497 version: Optional[str] = None, 498 level: Optional[SpanLevel] = None, 499 status_message: Optional[str] = None, 500 ) -> LangfuseGuardrail: ... 
501 502 def start_observation( 503 self, 504 *, 505 trace_context: Optional[TraceContext] = None, 506 name: str, 507 as_type: ObservationTypeLiteralNoEvent = "span", 508 input: Optional[Any] = None, 509 output: Optional[Any] = None, 510 metadata: Optional[Any] = None, 511 version: Optional[str] = None, 512 level: Optional[SpanLevel] = None, 513 status_message: Optional[str] = None, 514 completion_start_time: Optional[datetime] = None, 515 model: Optional[str] = None, 516 model_parameters: Optional[Dict[str, MapValue]] = None, 517 usage_details: Optional[Dict[str, int]] = None, 518 cost_details: Optional[Dict[str, float]] = None, 519 prompt: Optional[PromptClient] = None, 520 ) -> Union[ 521 LangfuseSpan, 522 LangfuseGeneration, 523 LangfuseAgent, 524 LangfuseTool, 525 LangfuseChain, 526 LangfuseRetriever, 527 LangfuseEvaluator, 528 LangfuseEmbedding, 529 LangfuseGuardrail, 530 ]: 531 """Create a new observation of the specified type. 532 533 This method creates a new observation but does not set it as the current span in the 534 context. To create and use an observation within a context, use start_as_current_observation(). 
535 536 Args: 537 trace_context: Optional context for connecting to an existing trace 538 name: Name of the observation 539 as_type: Type of observation to create (defaults to "span") 540 input: Input data for the operation 541 output: Output data from the operation 542 metadata: Additional metadata to associate with the observation 543 version: Version identifier for the code or component 544 level: Importance level of the observation 545 status_message: Optional status message for the observation 546 completion_start_time: When the model started generating (for generation types) 547 model: Name/identifier of the AI model used (for generation types) 548 model_parameters: Parameters used for the model (for generation types) 549 usage_details: Token usage information (for generation types) 550 cost_details: Cost information (for generation types) 551 prompt: Associated prompt template (for generation types) 552 553 Returns: 554 An observation object of the appropriate type that must be ended with .end() 555 """ 556 if trace_context: 557 trace_id = trace_context.get("trace_id", None) 558 parent_span_id = trace_context.get("parent_span_id", None) 559 560 if trace_id: 561 remote_parent_span = self._create_remote_parent_span( 562 trace_id=trace_id, parent_span_id=parent_span_id 563 ) 564 565 with otel_trace_api.use_span( 566 cast(otel_trace_api.Span, remote_parent_span) 567 ): 568 otel_span = self._otel_tracer.start_span(name=name) 569 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 570 571 return self._create_observation_from_otel_span( 572 otel_span=otel_span, 573 as_type=as_type, 574 input=input, 575 output=output, 576 metadata=metadata, 577 version=version, 578 level=level, 579 status_message=status_message, 580 completion_start_time=completion_start_time, 581 model=model, 582 model_parameters=model_parameters, 583 usage_details=usage_details, 584 cost_details=cost_details, 585 prompt=prompt, 586 ) 587 588 otel_span = 
self._otel_tracer.start_span(name=name) 589 590 return self._create_observation_from_otel_span( 591 otel_span=otel_span, 592 as_type=as_type, 593 input=input, 594 output=output, 595 metadata=metadata, 596 version=version, 597 level=level, 598 status_message=status_message, 599 completion_start_time=completion_start_time, 600 model=model, 601 model_parameters=model_parameters, 602 usage_details=usage_details, 603 cost_details=cost_details, 604 prompt=prompt, 605 ) 606 607 def _create_observation_from_otel_span( 608 self, 609 *, 610 otel_span: otel_trace_api.Span, 611 as_type: ObservationTypeLiteralNoEvent, 612 input: Optional[Any] = None, 613 output: Optional[Any] = None, 614 metadata: Optional[Any] = None, 615 version: Optional[str] = None, 616 level: Optional[SpanLevel] = None, 617 status_message: Optional[str] = None, 618 completion_start_time: Optional[datetime] = None, 619 model: Optional[str] = None, 620 model_parameters: Optional[Dict[str, MapValue]] = None, 621 usage_details: Optional[Dict[str, int]] = None, 622 cost_details: Optional[Dict[str, float]] = None, 623 prompt: Optional[PromptClient] = None, 624 ) -> Union[ 625 LangfuseSpan, 626 LangfuseGeneration, 627 LangfuseAgent, 628 LangfuseTool, 629 LangfuseChain, 630 LangfuseRetriever, 631 LangfuseEvaluator, 632 LangfuseEmbedding, 633 LangfuseGuardrail, 634 ]: 635 """Create the appropriate observation type from an OTEL span.""" 636 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 637 observation_class = self._get_span_class(as_type) 638 # Type ignore to prevent overloads of internal _get_span_class function, 639 # issue is that LangfuseEvent could be returned and that classes have diff. 
args 640 return observation_class( # type: ignore[return-value,call-arg] 641 otel_span=otel_span, 642 langfuse_client=self, 643 environment=self._environment, 644 release=self._release, 645 input=input, 646 output=output, 647 metadata=metadata, 648 version=version, 649 level=level, 650 status_message=status_message, 651 completion_start_time=completion_start_time, 652 model=model, 653 model_parameters=model_parameters, 654 usage_details=usage_details, 655 cost_details=cost_details, 656 prompt=prompt, 657 ) 658 else: 659 # For other types (e.g. span, guardrail), create appropriate class without generation properties 660 observation_class = self._get_span_class(as_type) 661 # Type ignore to prevent overloads of internal _get_span_class function, 662 # issue is that LangfuseEvent could be returned and that classes have diff. args 663 return observation_class( # type: ignore[return-value,call-arg] 664 otel_span=otel_span, 665 langfuse_client=self, 666 environment=self._environment, 667 release=self._release, 668 input=input, 669 output=output, 670 metadata=metadata, 671 version=version, 672 level=level, 673 status_message=status_message, 674 ) 675 # span._observation_type = as_type 676 # span._otel_span.set_attribute("langfuse.observation.type", as_type) 677 # return span 678 679 @overload 680 def start_as_current_observation( 681 self, 682 *, 683 trace_context: Optional[TraceContext] = None, 684 name: str, 685 as_type: Literal["generation"], 686 input: Optional[Any] = None, 687 output: Optional[Any] = None, 688 metadata: Optional[Any] = None, 689 version: Optional[str] = None, 690 level: Optional[SpanLevel] = None, 691 status_message: Optional[str] = None, 692 completion_start_time: Optional[datetime] = None, 693 model: Optional[str] = None, 694 model_parameters: Optional[Dict[str, MapValue]] = None, 695 usage_details: Optional[Dict[str, int]] = None, 696 cost_details: Optional[Dict[str, float]] = None, 697 prompt: Optional[PromptClient] = None, 698 end_on_exit: 
Optional[bool] = None, 699 ) -> _AgnosticContextManager[LangfuseGeneration]: ... 700 701 @overload 702 def start_as_current_observation( 703 self, 704 *, 705 trace_context: Optional[TraceContext] = None, 706 name: str, 707 as_type: Literal["span"] = "span", 708 input: Optional[Any] = None, 709 output: Optional[Any] = None, 710 metadata: Optional[Any] = None, 711 version: Optional[str] = None, 712 level: Optional[SpanLevel] = None, 713 status_message: Optional[str] = None, 714 end_on_exit: Optional[bool] = None, 715 ) -> _AgnosticContextManager[LangfuseSpan]: ... 716 717 @overload 718 def start_as_current_observation( 719 self, 720 *, 721 trace_context: Optional[TraceContext] = None, 722 name: str, 723 as_type: Literal["agent"], 724 input: Optional[Any] = None, 725 output: Optional[Any] = None, 726 metadata: Optional[Any] = None, 727 version: Optional[str] = None, 728 level: Optional[SpanLevel] = None, 729 status_message: Optional[str] = None, 730 end_on_exit: Optional[bool] = None, 731 ) -> _AgnosticContextManager[LangfuseAgent]: ... 732 733 @overload 734 def start_as_current_observation( 735 self, 736 *, 737 trace_context: Optional[TraceContext] = None, 738 name: str, 739 as_type: Literal["tool"], 740 input: Optional[Any] = None, 741 output: Optional[Any] = None, 742 metadata: Optional[Any] = None, 743 version: Optional[str] = None, 744 level: Optional[SpanLevel] = None, 745 status_message: Optional[str] = None, 746 end_on_exit: Optional[bool] = None, 747 ) -> _AgnosticContextManager[LangfuseTool]: ... 
748 749 @overload 750 def start_as_current_observation( 751 self, 752 *, 753 trace_context: Optional[TraceContext] = None, 754 name: str, 755 as_type: Literal["chain"], 756 input: Optional[Any] = None, 757 output: Optional[Any] = None, 758 metadata: Optional[Any] = None, 759 version: Optional[str] = None, 760 level: Optional[SpanLevel] = None, 761 status_message: Optional[str] = None, 762 end_on_exit: Optional[bool] = None, 763 ) -> _AgnosticContextManager[LangfuseChain]: ... 764 765 @overload 766 def start_as_current_observation( 767 self, 768 *, 769 trace_context: Optional[TraceContext] = None, 770 name: str, 771 as_type: Literal["retriever"], 772 input: Optional[Any] = None, 773 output: Optional[Any] = None, 774 metadata: Optional[Any] = None, 775 version: Optional[str] = None, 776 level: Optional[SpanLevel] = None, 777 status_message: Optional[str] = None, 778 end_on_exit: Optional[bool] = None, 779 ) -> _AgnosticContextManager[LangfuseRetriever]: ... 780 781 @overload 782 def start_as_current_observation( 783 self, 784 *, 785 trace_context: Optional[TraceContext] = None, 786 name: str, 787 as_type: Literal["evaluator"], 788 input: Optional[Any] = None, 789 output: Optional[Any] = None, 790 metadata: Optional[Any] = None, 791 version: Optional[str] = None, 792 level: Optional[SpanLevel] = None, 793 status_message: Optional[str] = None, 794 end_on_exit: Optional[bool] = None, 795 ) -> _AgnosticContextManager[LangfuseEvaluator]: ... 
796 797 @overload 798 def start_as_current_observation( 799 self, 800 *, 801 trace_context: Optional[TraceContext] = None, 802 name: str, 803 as_type: Literal["embedding"], 804 input: Optional[Any] = None, 805 output: Optional[Any] = None, 806 metadata: Optional[Any] = None, 807 version: Optional[str] = None, 808 level: Optional[SpanLevel] = None, 809 status_message: Optional[str] = None, 810 completion_start_time: Optional[datetime] = None, 811 model: Optional[str] = None, 812 model_parameters: Optional[Dict[str, MapValue]] = None, 813 usage_details: Optional[Dict[str, int]] = None, 814 cost_details: Optional[Dict[str, float]] = None, 815 prompt: Optional[PromptClient] = None, 816 end_on_exit: Optional[bool] = None, 817 ) -> _AgnosticContextManager[LangfuseEmbedding]: ... 818 819 @overload 820 def start_as_current_observation( 821 self, 822 *, 823 trace_context: Optional[TraceContext] = None, 824 name: str, 825 as_type: Literal["guardrail"], 826 input: Optional[Any] = None, 827 output: Optional[Any] = None, 828 metadata: Optional[Any] = None, 829 version: Optional[str] = None, 830 level: Optional[SpanLevel] = None, 831 status_message: Optional[str] = None, 832 end_on_exit: Optional[bool] = None, 833 ) -> _AgnosticContextManager[LangfuseGuardrail]: ... 
834 835 def start_as_current_observation( 836 self, 837 *, 838 trace_context: Optional[TraceContext] = None, 839 name: str, 840 as_type: ObservationTypeLiteralNoEvent = "span", 841 input: Optional[Any] = None, 842 output: Optional[Any] = None, 843 metadata: Optional[Any] = None, 844 version: Optional[str] = None, 845 level: Optional[SpanLevel] = None, 846 status_message: Optional[str] = None, 847 completion_start_time: Optional[datetime] = None, 848 model: Optional[str] = None, 849 model_parameters: Optional[Dict[str, MapValue]] = None, 850 usage_details: Optional[Dict[str, int]] = None, 851 cost_details: Optional[Dict[str, float]] = None, 852 prompt: Optional[PromptClient] = None, 853 end_on_exit: Optional[bool] = None, 854 ) -> Union[ 855 _AgnosticContextManager[LangfuseGeneration], 856 _AgnosticContextManager[LangfuseSpan], 857 _AgnosticContextManager[LangfuseAgent], 858 _AgnosticContextManager[LangfuseTool], 859 _AgnosticContextManager[LangfuseChain], 860 _AgnosticContextManager[LangfuseRetriever], 861 _AgnosticContextManager[LangfuseEvaluator], 862 _AgnosticContextManager[LangfuseEmbedding], 863 _AgnosticContextManager[LangfuseGuardrail], 864 ]: 865 """Create a new observation and set it as the current span in a context manager. 866 867 This method creates a new observation of the specified type and sets it as the 868 current span within a context manager. Use this method with a 'with' statement to 869 automatically handle the observation lifecycle within a code block. 870 871 The created observation will be the child of the current span in the context. 
872 873 Args: 874 trace_context: Optional context for connecting to an existing trace 875 name: Name of the observation (e.g., function or operation name) 876 as_type: Type of observation to create (defaults to "span") 877 input: Input data for the operation (can be any JSON-serializable object) 878 output: Output data from the operation (can be any JSON-serializable object) 879 metadata: Additional metadata to associate with the observation 880 version: Version identifier for the code or component 881 level: Importance level of the observation (info, warning, error) 882 status_message: Optional status message for the observation 883 end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks. 884 885 The following parameters are available when as_type is: "generation" or "embedding". 886 completion_start_time: When the model started generating the response 887 model: Name/identifier of the AI model used (e.g., "gpt-4") 888 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 889 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 890 cost_details: Cost information for the model call 891 prompt: Associated prompt template from Langfuse prompt management 892 893 Returns: 894 A context manager that yields the appropriate observation type based on as_type 895 896 Example: 897 ```python 898 # Create a span 899 with langfuse.start_as_current_observation(name="process-query", as_type="span") as span: 900 # Do work 901 result = process_data() 902 span.update(output=result) 903 904 # Create a child span automatically 905 with span.start_as_current_observation(name="sub-operation") as child_span: 906 # Do sub-operation work 907 child_span.update(output="sub-result") 908 909 # Create a tool observation 910 with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool: 911 # Do tool work 912 
results = search_web(query) 913 tool.update(output=results) 914 915 # Create a generation observation 916 with langfuse.start_as_current_observation( 917 name="answer-generation", 918 as_type="generation", 919 model="gpt-4" 920 ) as generation: 921 # Generate answer 922 response = llm.generate(...) 923 generation.update(output=response) 924 ``` 925 """ 926 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 927 if trace_context: 928 trace_id = trace_context.get("trace_id", None) 929 parent_span_id = trace_context.get("parent_span_id", None) 930 931 if trace_id: 932 remote_parent_span = self._create_remote_parent_span( 933 trace_id=trace_id, parent_span_id=parent_span_id 934 ) 935 936 return cast( 937 Union[ 938 _AgnosticContextManager[LangfuseGeneration], 939 _AgnosticContextManager[LangfuseEmbedding], 940 ], 941 self._create_span_with_parent_context( 942 as_type=as_type, 943 name=name, 944 remote_parent_span=remote_parent_span, 945 parent=None, 946 end_on_exit=end_on_exit, 947 input=input, 948 output=output, 949 metadata=metadata, 950 version=version, 951 level=level, 952 status_message=status_message, 953 completion_start_time=completion_start_time, 954 model=model, 955 model_parameters=model_parameters, 956 usage_details=usage_details, 957 cost_details=cost_details, 958 prompt=prompt, 959 ), 960 ) 961 962 return cast( 963 Union[ 964 _AgnosticContextManager[LangfuseGeneration], 965 _AgnosticContextManager[LangfuseEmbedding], 966 ], 967 self._start_as_current_otel_span_with_processed_media( 968 as_type=as_type, 969 name=name, 970 end_on_exit=end_on_exit, 971 input=input, 972 output=output, 973 metadata=metadata, 974 version=version, 975 level=level, 976 status_message=status_message, 977 completion_start_time=completion_start_time, 978 model=model, 979 model_parameters=model_parameters, 980 usage_details=usage_details, 981 cost_details=cost_details, 982 prompt=prompt, 983 ), 984 ) 985 986 if as_type in 
get_observation_types_list(ObservationTypeSpanLike): 987 if trace_context: 988 trace_id = trace_context.get("trace_id", None) 989 parent_span_id = trace_context.get("parent_span_id", None) 990 991 if trace_id: 992 remote_parent_span = self._create_remote_parent_span( 993 trace_id=trace_id, parent_span_id=parent_span_id 994 ) 995 996 return cast( 997 Union[ 998 _AgnosticContextManager[LangfuseSpan], 999 _AgnosticContextManager[LangfuseAgent], 1000 _AgnosticContextManager[LangfuseTool], 1001 _AgnosticContextManager[LangfuseChain], 1002 _AgnosticContextManager[LangfuseRetriever], 1003 _AgnosticContextManager[LangfuseEvaluator], 1004 _AgnosticContextManager[LangfuseGuardrail], 1005 ], 1006 self._create_span_with_parent_context( 1007 as_type=as_type, 1008 name=name, 1009 remote_parent_span=remote_parent_span, 1010 parent=None, 1011 end_on_exit=end_on_exit, 1012 input=input, 1013 output=output, 1014 metadata=metadata, 1015 version=version, 1016 level=level, 1017 status_message=status_message, 1018 ), 1019 ) 1020 1021 return cast( 1022 Union[ 1023 _AgnosticContextManager[LangfuseSpan], 1024 _AgnosticContextManager[LangfuseAgent], 1025 _AgnosticContextManager[LangfuseTool], 1026 _AgnosticContextManager[LangfuseChain], 1027 _AgnosticContextManager[LangfuseRetriever], 1028 _AgnosticContextManager[LangfuseEvaluator], 1029 _AgnosticContextManager[LangfuseGuardrail], 1030 ], 1031 self._start_as_current_otel_span_with_processed_media( 1032 as_type=as_type, 1033 name=name, 1034 end_on_exit=end_on_exit, 1035 input=input, 1036 output=output, 1037 metadata=metadata, 1038 version=version, 1039 level=level, 1040 status_message=status_message, 1041 ), 1042 ) 1043 1044 # This should never be reached since all valid types are handled above 1045 langfuse_logger.warning( 1046 f"Unknown observation type: {as_type}, falling back to span" 1047 ) 1048 return self._start_as_current_otel_span_with_processed_media( 1049 as_type="span", 1050 name=name, 1051 end_on_exit=end_on_exit, 1052 
input=input, 1053 output=output, 1054 metadata=metadata, 1055 version=version, 1056 level=level, 1057 status_message=status_message, 1058 ) 1059 1060 def _get_span_class( 1061 self, 1062 as_type: ObservationTypeLiteral, 1063 ) -> Union[ 1064 Type[LangfuseAgent], 1065 Type[LangfuseTool], 1066 Type[LangfuseChain], 1067 Type[LangfuseRetriever], 1068 Type[LangfuseEvaluator], 1069 Type[LangfuseEmbedding], 1070 Type[LangfuseGuardrail], 1071 Type[LangfuseGeneration], 1072 Type[LangfuseEvent], 1073 Type[LangfuseSpan], 1074 ]: 1075 """Get the appropriate span class based on as_type.""" 1076 normalized_type = as_type.lower() 1077 1078 if normalized_type == "agent": 1079 return LangfuseAgent 1080 elif normalized_type == "tool": 1081 return LangfuseTool 1082 elif normalized_type == "chain": 1083 return LangfuseChain 1084 elif normalized_type == "retriever": 1085 return LangfuseRetriever 1086 elif normalized_type == "evaluator": 1087 return LangfuseEvaluator 1088 elif normalized_type == "embedding": 1089 return LangfuseEmbedding 1090 elif normalized_type == "guardrail": 1091 return LangfuseGuardrail 1092 elif normalized_type == "generation": 1093 return LangfuseGeneration 1094 elif normalized_type == "event": 1095 return LangfuseEvent 1096 elif normalized_type == "span": 1097 return LangfuseSpan 1098 else: 1099 return LangfuseSpan 1100 1101 @_agnosticcontextmanager 1102 def _create_span_with_parent_context( 1103 self, 1104 *, 1105 name: str, 1106 parent: Optional[otel_trace_api.Span] = None, 1107 remote_parent_span: Optional[otel_trace_api.Span] = None, 1108 as_type: ObservationTypeLiteralNoEvent, 1109 end_on_exit: Optional[bool] = None, 1110 input: Optional[Any] = None, 1111 output: Optional[Any] = None, 1112 metadata: Optional[Any] = None, 1113 version: Optional[str] = None, 1114 level: Optional[SpanLevel] = None, 1115 status_message: Optional[str] = None, 1116 completion_start_time: Optional[datetime] = None, 1117 model: Optional[str] = None, 1118 model_parameters: 
Optional[Dict[str, MapValue]] = None, 1119 usage_details: Optional[Dict[str, int]] = None, 1120 cost_details: Optional[Dict[str, float]] = None, 1121 prompt: Optional[PromptClient] = None, 1122 ) -> Any: 1123 parent_span = parent or cast(otel_trace_api.Span, remote_parent_span) 1124 1125 with otel_trace_api.use_span(parent_span): 1126 with self._start_as_current_otel_span_with_processed_media( 1127 name=name, 1128 as_type=as_type, 1129 end_on_exit=end_on_exit, 1130 input=input, 1131 output=output, 1132 metadata=metadata, 1133 version=version, 1134 level=level, 1135 status_message=status_message, 1136 completion_start_time=completion_start_time, 1137 model=model, 1138 model_parameters=model_parameters, 1139 usage_details=usage_details, 1140 cost_details=cost_details, 1141 prompt=prompt, 1142 ) as langfuse_span: 1143 if remote_parent_span is not None: 1144 langfuse_span._otel_span.set_attribute( 1145 LangfuseOtelSpanAttributes.AS_ROOT, True 1146 ) 1147 1148 yield langfuse_span 1149 1150 @_agnosticcontextmanager 1151 def _start_as_current_otel_span_with_processed_media( 1152 self, 1153 *, 1154 name: str, 1155 as_type: Optional[ObservationTypeLiteralNoEvent] = None, 1156 end_on_exit: Optional[bool] = None, 1157 input: Optional[Any] = None, 1158 output: Optional[Any] = None, 1159 metadata: Optional[Any] = None, 1160 version: Optional[str] = None, 1161 level: Optional[SpanLevel] = None, 1162 status_message: Optional[str] = None, 1163 completion_start_time: Optional[datetime] = None, 1164 model: Optional[str] = None, 1165 model_parameters: Optional[Dict[str, MapValue]] = None, 1166 usage_details: Optional[Dict[str, int]] = None, 1167 cost_details: Optional[Dict[str, float]] = None, 1168 prompt: Optional[PromptClient] = None, 1169 ) -> Any: 1170 with self._otel_tracer.start_as_current_span( 1171 name=name, 1172 end_on_exit=end_on_exit if end_on_exit is not None else True, 1173 ) as otel_span: 1174 span_class = self._get_span_class( 1175 as_type or "generation" 1176 ) # 
default was "generation" 1177 common_args = { 1178 "otel_span": otel_span, 1179 "langfuse_client": self, 1180 "environment": self._environment, 1181 "release": self._release, 1182 "input": input, 1183 "output": output, 1184 "metadata": metadata, 1185 "version": version, 1186 "level": level, 1187 "status_message": status_message, 1188 } 1189 1190 if span_class in [ 1191 LangfuseGeneration, 1192 LangfuseEmbedding, 1193 ]: 1194 common_args.update( 1195 { 1196 "completion_start_time": completion_start_time, 1197 "model": model, 1198 "model_parameters": model_parameters, 1199 "usage_details": usage_details, 1200 "cost_details": cost_details, 1201 "prompt": prompt, 1202 } 1203 ) 1204 # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed 1205 1206 yield span_class(**common_args) # type: ignore[arg-type] 1207 1208 def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]: 1209 current_span = otel_trace_api.get_current_span() 1210 1211 if current_span is otel_trace_api.INVALID_SPAN: 1212 langfuse_logger.warning( 1213 "Context error: No active span in current context. Operations that depend on an active span will be skipped. " 1214 "Ensure spans are created with start_as_current_observation() or that you're operating within an active span context." 
1215 ) 1216 return None 1217 1218 return current_span 1219 1220 def update_current_generation( 1221 self, 1222 *, 1223 name: Optional[str] = None, 1224 input: Optional[Any] = None, 1225 output: Optional[Any] = None, 1226 metadata: Optional[Any] = None, 1227 version: Optional[str] = None, 1228 level: Optional[SpanLevel] = None, 1229 status_message: Optional[str] = None, 1230 completion_start_time: Optional[datetime] = None, 1231 model: Optional[str] = None, 1232 model_parameters: Optional[Dict[str, MapValue]] = None, 1233 usage_details: Optional[Dict[str, int]] = None, 1234 cost_details: Optional[Dict[str, float]] = None, 1235 prompt: Optional[PromptClient] = None, 1236 ) -> None: 1237 """Update the current active generation span with new information. 1238 1239 This method updates the current generation span in the active context with 1240 additional information. It's useful for adding output, usage stats, or other 1241 details that become available during or after model generation. 1242 1243 Args: 1244 name: The generation name 1245 input: Updated input data for the model 1246 output: Output from the model (e.g., completions) 1247 metadata: Additional metadata to associate with the generation 1248 version: Version identifier for the model or component 1249 level: Importance level of the generation (info, warning, error) 1250 status_message: Optional status message for the generation 1251 completion_start_time: When the model started generating the response 1252 model: Name/identifier of the AI model used (e.g., "gpt-4") 1253 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1254 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1255 cost_details: Cost information for the model call 1256 prompt: Associated prompt template from Langfuse prompt management 1257 1258 Example: 1259 ```python 1260 with langfuse.start_as_current_generation(name="answer-query") as generation: 1261 # Initial setup and API call 1262 
response = llm.generate(...) 1263 1264 # Update with results that weren't available at creation time 1265 langfuse.update_current_generation( 1266 output=response.text, 1267 usage_details={ 1268 "prompt_tokens": response.usage.prompt_tokens, 1269 "completion_tokens": response.usage.completion_tokens 1270 } 1271 ) 1272 ``` 1273 """ 1274 if not self._tracing_enabled: 1275 langfuse_logger.debug( 1276 "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode." 1277 ) 1278 return 1279 1280 current_otel_span = self._get_current_otel_span() 1281 1282 if current_otel_span is not None: 1283 generation = LangfuseGeneration( 1284 otel_span=current_otel_span, langfuse_client=self 1285 ) 1286 1287 if name: 1288 current_otel_span.update_name(name) 1289 1290 generation.update( 1291 input=input, 1292 output=output, 1293 metadata=metadata, 1294 version=version, 1295 level=level, 1296 status_message=status_message, 1297 completion_start_time=completion_start_time, 1298 model=model, 1299 model_parameters=model_parameters, 1300 usage_details=usage_details, 1301 cost_details=cost_details, 1302 prompt=prompt, 1303 ) 1304 1305 def update_current_span( 1306 self, 1307 *, 1308 name: Optional[str] = None, 1309 input: Optional[Any] = None, 1310 output: Optional[Any] = None, 1311 metadata: Optional[Any] = None, 1312 version: Optional[str] = None, 1313 level: Optional[SpanLevel] = None, 1314 status_message: Optional[str] = None, 1315 ) -> None: 1316 """Update the current active span with new information. 1317 1318 This method updates the current span in the active context with 1319 additional information. It's useful for adding outputs or metadata 1320 that become available during execution. 
1321 1322 Args: 1323 name: The span name 1324 input: Updated input data for the operation 1325 output: Output data from the operation 1326 metadata: Additional metadata to associate with the span 1327 version: Version identifier for the code or component 1328 level: Importance level of the span (info, warning, error) 1329 status_message: Optional status message for the span 1330 1331 Example: 1332 ```python 1333 with langfuse.start_as_current_observation(name="process-data") as span: 1334 # Initial processing 1335 result = process_first_part() 1336 1337 # Update with intermediate results 1338 langfuse.update_current_span(metadata={"intermediate_result": result}) 1339 1340 # Continue processing 1341 final_result = process_second_part(result) 1342 1343 # Final update 1344 langfuse.update_current_span(output=final_result) 1345 ``` 1346 """ 1347 if not self._tracing_enabled: 1348 langfuse_logger.debug( 1349 "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode." 1350 ) 1351 return 1352 1353 current_otel_span = self._get_current_otel_span() 1354 1355 if current_otel_span is not None: 1356 span = LangfuseSpan( 1357 otel_span=current_otel_span, 1358 langfuse_client=self, 1359 environment=self._environment, 1360 release=self._release, 1361 ) 1362 1363 if name: 1364 current_otel_span.update_name(name) 1365 1366 span.update( 1367 input=input, 1368 output=output, 1369 metadata=metadata, 1370 version=version, 1371 level=level, 1372 status_message=status_message, 1373 ) 1374 1375 @deprecated( 1376 "Trace-level input/output is deprecated. " 1377 "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. " 1378 "This method will be removed in a future major version." 1379 ) 1380 def set_current_trace_io( 1381 self, 1382 *, 1383 input: Optional[Any] = None, 1384 output: Optional[Any] = None, 1385 ) -> None: 1386 """Set trace-level input and output for the current span's trace. 1387 1388 .. 
deprecated:: 1389 This is a legacy method for backward compatibility with Langfuse platform 1390 features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge 1391 evaluators). It will be removed in a future major version. 1392 1393 For setting other trace attributes (user_id, session_id, metadata, tags, version), 1394 use :meth:`propagate_attributes` instead. 1395 1396 Args: 1397 input: Input data to associate with the trace. 1398 output: Output data to associate with the trace. 1399 """ 1400 if not self._tracing_enabled: 1401 langfuse_logger.debug( 1402 "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode." 1403 ) 1404 return 1405 1406 current_otel_span = self._get_current_otel_span() 1407 1408 if current_otel_span is not None and current_otel_span.is_recording(): 1409 existing_observation_type = current_otel_span.attributes.get( # type: ignore[attr-defined] 1410 LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span" 1411 ) 1412 # We need to preserve the class to keep the correct observation type 1413 span_class = self._get_span_class(existing_observation_type) 1414 span = span_class( 1415 otel_span=current_otel_span, 1416 langfuse_client=self, 1417 environment=self._environment, 1418 release=self._release, 1419 ) 1420 1421 span.set_trace_io( 1422 input=input, 1423 output=output, 1424 ) 1425 1426 def set_current_trace_as_public(self) -> None: 1427 """Make the current trace publicly accessible via its URL. 1428 1429 When a trace is published, anyone with the trace link can view the full trace 1430 without needing to be logged in to Langfuse. This action cannot be undone 1431 programmatically - once published, the entire trace becomes public. 1432 1433 This is a convenience method that publishes the trace from the currently 1434 active span context. Use this when you want to make a trace public from 1435 within a traced function without needing direct access to the span object. 
1436 """ 1437 if not self._tracing_enabled: 1438 langfuse_logger.debug( 1439 "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode." 1440 ) 1441 return 1442 1443 current_otel_span = self._get_current_otel_span() 1444 1445 if current_otel_span is not None and current_otel_span.is_recording(): 1446 existing_observation_type = current_otel_span.attributes.get( # type: ignore[attr-defined] 1447 LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span" 1448 ) 1449 # We need to preserve the class to keep the correct observation type 1450 span_class = self._get_span_class(existing_observation_type) 1451 span = span_class( 1452 otel_span=current_otel_span, 1453 langfuse_client=self, 1454 environment=self._environment, 1455 ) 1456 1457 span.set_trace_as_public() 1458 1459 def create_event( 1460 self, 1461 *, 1462 trace_context: Optional[TraceContext] = None, 1463 name: str, 1464 input: Optional[Any] = None, 1465 output: Optional[Any] = None, 1466 metadata: Optional[Any] = None, 1467 version: Optional[str] = None, 1468 level: Optional[SpanLevel] = None, 1469 status_message: Optional[str] = None, 1470 ) -> LangfuseEvent: 1471 """Create a new Langfuse observation of type 'EVENT'. 1472 1473 The created Langfuse Event observation will be the child of the current span in the context. 
1474 1475 Args: 1476 trace_context: Optional context for connecting to an existing trace 1477 name: Name of the span (e.g., function or operation name) 1478 input: Input data for the operation (can be any JSON-serializable object) 1479 output: Output data from the operation (can be any JSON-serializable object) 1480 metadata: Additional metadata to associate with the span 1481 version: Version identifier for the code or component 1482 level: Importance level of the span (info, warning, error) 1483 status_message: Optional status message for the span 1484 1485 Returns: 1486 The Langfuse Event object 1487 1488 Example: 1489 ```python 1490 event = langfuse.create_event(name="process-event") 1491 ``` 1492 """ 1493 timestamp = time_ns() 1494 1495 if trace_context: 1496 trace_id = trace_context.get("trace_id", None) 1497 parent_span_id = trace_context.get("parent_span_id", None) 1498 1499 if trace_id: 1500 remote_parent_span = self._create_remote_parent_span( 1501 trace_id=trace_id, parent_span_id=parent_span_id 1502 ) 1503 1504 with otel_trace_api.use_span( 1505 cast(otel_trace_api.Span, remote_parent_span) 1506 ): 1507 otel_span = self._otel_tracer.start_span( 1508 name=name, start_time=timestamp 1509 ) 1510 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 1511 1512 return cast( 1513 LangfuseEvent, 1514 LangfuseEvent( 1515 otel_span=otel_span, 1516 langfuse_client=self, 1517 environment=self._environment, 1518 release=self._release, 1519 input=input, 1520 output=output, 1521 metadata=metadata, 1522 version=version, 1523 level=level, 1524 status_message=status_message, 1525 ).end(end_time=timestamp), 1526 ) 1527 1528 otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp) 1529 1530 return cast( 1531 LangfuseEvent, 1532 LangfuseEvent( 1533 otel_span=otel_span, 1534 langfuse_client=self, 1535 environment=self._environment, 1536 release=self._release, 1537 input=input, 1538 output=output, 1539 metadata=metadata, 1540 version=version, 1541 
level=level, 1542 status_message=status_message, 1543 ).end(end_time=timestamp), 1544 ) 1545 1546 def _create_remote_parent_span( 1547 self, *, trace_id: str, parent_span_id: Optional[str] 1548 ) -> Any: 1549 if not self._is_valid_trace_id(trace_id): 1550 langfuse_logger.warning( 1551 f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID." 1552 ) 1553 1554 if parent_span_id and not self._is_valid_span_id(parent_span_id): 1555 langfuse_logger.warning( 1556 f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID." 1557 ) 1558 1559 int_trace_id = int(trace_id, 16) 1560 int_parent_span_id = ( 1561 int(parent_span_id, 16) 1562 if parent_span_id 1563 else RandomIdGenerator().generate_span_id() 1564 ) 1565 1566 span_context = otel_trace_api.SpanContext( 1567 trace_id=int_trace_id, 1568 span_id=int_parent_span_id, 1569 trace_flags=otel_trace_api.TraceFlags(0x01), # mark span as sampled 1570 is_remote=False, 1571 ) 1572 1573 return otel_trace_api.NonRecordingSpan(span_context) 1574 1575 def _is_valid_trace_id(self, trace_id: str) -> bool: 1576 pattern = r"^[0-9a-f]{32}$" 1577 1578 return bool(re.match(pattern, trace_id)) 1579 1580 def _is_valid_span_id(self, span_id: str) -> bool: 1581 pattern = r"^[0-9a-f]{16}$" 1582 1583 return bool(re.match(pattern, span_id)) 1584 1585 def _create_observation_id(self, *, seed: Optional[str] = None) -> str: 1586 """Create a unique observation ID for use with Langfuse. 1587 1588 This method generates a unique observation ID (span ID in OpenTelemetry terms) 1589 for use with various Langfuse APIs. It can either generate a random ID or 1590 create a deterministic ID based on a seed string. 1591 1592 Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes. 1593 This method ensures the generated ID meets this requirement. 
If you need to 1594 correlate an external ID with a Langfuse observation ID, use the external ID as 1595 the seed to get a valid, deterministic observation ID. 1596 1597 Args: 1598 seed: Optional string to use as a seed for deterministic ID generation. 1599 If provided, the same seed will always produce the same ID. 1600 If not provided, a random ID will be generated. 1601 1602 Returns: 1603 A 16-character lowercase hexadecimal string representing the observation ID. 1604 1605 Example: 1606 ```python 1607 # Generate a random observation ID 1608 obs_id = langfuse.create_observation_id() 1609 1610 # Generate a deterministic ID based on a seed 1611 user_obs_id = langfuse.create_observation_id(seed="user-123-feedback") 1612 1613 # Correlate an external item ID with a Langfuse observation ID 1614 item_id = "item-789012" 1615 correlated_obs_id = langfuse.create_observation_id(seed=item_id) 1616 1617 # Use the ID with Langfuse APIs 1618 langfuse.create_score( 1619 name="relevance", 1620 value=0.95, 1621 trace_id=trace_id, 1622 observation_id=obs_id 1623 ) 1624 ``` 1625 """ 1626 if not seed: 1627 span_id_int = RandomIdGenerator().generate_span_id() 1628 1629 return self._format_otel_span_id(span_id_int) 1630 1631 return sha256(seed.encode("utf-8")).digest()[:8].hex() 1632 1633 @staticmethod 1634 def create_trace_id(*, seed: Optional[str] = None) -> str: 1635 """Create a unique trace ID for use with Langfuse. 1636 1637 This method generates a unique trace ID for use with various Langfuse APIs. 1638 It can either generate a random ID or create a deterministic ID based on 1639 a seed string. 1640 1641 Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. 1642 This method ensures the generated ID meets this requirement. If you need to 1643 correlate an external ID with a Langfuse trace ID, use the external ID as the 1644 seed to get a valid, deterministic Langfuse trace ID. 
1645 1646 Args: 1647 seed: Optional string to use as a seed for deterministic ID generation. 1648 If provided, the same seed will always produce the same ID. 1649 If not provided, a random ID will be generated. 1650 1651 Returns: 1652 A 32-character lowercase hexadecimal string representing the Langfuse trace ID. 1653 1654 Example: 1655 ```python 1656 # Generate a random trace ID 1657 trace_id = langfuse.create_trace_id() 1658 1659 # Generate a deterministic ID based on a seed 1660 session_trace_id = langfuse.create_trace_id(seed="session-456") 1661 1662 # Correlate an external ID with a Langfuse trace ID 1663 external_id = "external-system-123456" 1664 correlated_trace_id = langfuse.create_trace_id(seed=external_id) 1665 1666 # Use the ID with trace context 1667 with langfuse.start_as_current_observation( 1668 name="process-request", 1669 trace_context={"trace_id": trace_id} 1670 ) as span: 1671 # Operation will be part of the specific trace 1672 pass 1673 ``` 1674 """ 1675 if not seed: 1676 trace_id_int = RandomIdGenerator().generate_trace_id() 1677 1678 return Langfuse._format_otel_trace_id(trace_id_int) 1679 1680 return sha256(seed.encode("utf-8")).digest()[:16].hex() 1681 1682 def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str: 1683 span_context = otel_span.get_span_context() 1684 1685 return self._format_otel_trace_id(span_context.trace_id) 1686 1687 def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str: 1688 span_context = otel_span.get_span_context() 1689 1690 return self._format_otel_span_id(span_context.span_id) 1691 1692 @staticmethod 1693 def _format_otel_span_id(span_id_int: int) -> str: 1694 """Format an integer span ID to a 16-character lowercase hex string. 1695 1696 Internal method to convert an OpenTelemetry integer span ID to the standard 1697 W3C Trace Context format (16-character lowercase hex string). 
1698 1699 Args: 1700 span_id_int: 64-bit integer representing a span ID 1701 1702 Returns: 1703 A 16-character lowercase hexadecimal string 1704 """ 1705 return format(span_id_int, "016x") 1706 1707 @staticmethod 1708 def _format_otel_trace_id(trace_id_int: int) -> str: 1709 """Format an integer trace ID to a 32-character lowercase hex string. 1710 1711 Internal method to convert an OpenTelemetry integer trace ID to the standard 1712 W3C Trace Context format (32-character lowercase hex string). 1713 1714 Args: 1715 trace_id_int: 128-bit integer representing a trace ID 1716 1717 Returns: 1718 A 32-character lowercase hexadecimal string 1719 """ 1720 return format(trace_id_int, "032x") 1721 1722 @overload 1723 def create_score( 1724 self, 1725 *, 1726 name: str, 1727 value: float, 1728 session_id: Optional[str] = None, 1729 dataset_run_id: Optional[str] = None, 1730 trace_id: Optional[str] = None, 1731 observation_id: Optional[str] = None, 1732 score_id: Optional[str] = None, 1733 data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None, 1734 comment: Optional[str] = None, 1735 config_id: Optional[str] = None, 1736 metadata: Optional[Any] = None, 1737 timestamp: Optional[datetime] = None, 1738 ) -> None: ... 1739 1740 @overload 1741 def create_score( 1742 self, 1743 *, 1744 name: str, 1745 value: str, 1746 session_id: Optional[str] = None, 1747 dataset_run_id: Optional[str] = None, 1748 trace_id: Optional[str] = None, 1749 score_id: Optional[str] = None, 1750 observation_id: Optional[str] = None, 1751 data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL", 1752 comment: Optional[str] = None, 1753 config_id: Optional[str] = None, 1754 metadata: Optional[Any] = None, 1755 timestamp: Optional[datetime] = None, 1756 ) -> None: ... 
1757 1758 def create_score( 1759 self, 1760 *, 1761 name: str, 1762 value: Union[float, str], 1763 session_id: Optional[str] = None, 1764 dataset_run_id: Optional[str] = None, 1765 trace_id: Optional[str] = None, 1766 observation_id: Optional[str] = None, 1767 score_id: Optional[str] = None, 1768 data_type: Optional[ScoreDataType] = None, 1769 comment: Optional[str] = None, 1770 config_id: Optional[str] = None, 1771 metadata: Optional[Any] = None, 1772 timestamp: Optional[datetime] = None, 1773 ) -> None: 1774 """Create a score for a specific trace or observation. 1775 1776 This method creates a score for evaluating a Langfuse trace or observation. Scores can be 1777 used to track quality metrics, user feedback, or automated evaluations. 1778 1779 Args: 1780 name: Name of the score (e.g., "relevance", "accuracy") 1781 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL) 1782 session_id: ID of the Langfuse session to associate the score with 1783 dataset_run_id: ID of the Langfuse dataset run to associate the score with 1784 trace_id: ID of the Langfuse trace to associate the score with 1785 observation_id: Optional ID of the specific observation to score. Trace ID must be provided too. 
1786 score_id: Optional custom ID for the score (auto-generated if not provided) 1787 data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL) 1788 comment: Optional comment or explanation for the score 1789 config_id: Optional ID of a score config defined in Langfuse 1790 metadata: Optional metadata to be attached to the score 1791 timestamp: Optional timestamp for the score (defaults to current UTC time) 1792 1793 Example: 1794 ```python 1795 # Create a numeric score for accuracy 1796 langfuse.create_score( 1797 name="accuracy", 1798 value=0.92, 1799 trace_id="abcdef1234567890abcdef1234567890", 1800 data_type="NUMERIC", 1801 comment="High accuracy with minor irrelevant details" 1802 ) 1803 1804 # Create a categorical score for sentiment 1805 langfuse.create_score( 1806 name="sentiment", 1807 value="positive", 1808 trace_id="abcdef1234567890abcdef1234567890", 1809 observation_id="abcdef1234567890", 1810 data_type="CATEGORICAL" 1811 ) 1812 ``` 1813 """ 1814 if not self._tracing_enabled: 1815 return 1816 1817 score_id = score_id or self._create_observation_id() 1818 1819 try: 1820 new_body = ScoreBody( 1821 id=score_id, 1822 session_id=session_id, 1823 datasetRunId=dataset_run_id, 1824 traceId=trace_id, 1825 observationId=observation_id, 1826 name=name, 1827 value=value, 1828 dataType=data_type, # type: ignore 1829 comment=comment, 1830 configId=config_id, 1831 environment=self._environment, 1832 metadata=metadata, 1833 ) 1834 1835 event = { 1836 "id": self.create_trace_id(), 1837 "type": "score-create", 1838 "timestamp": timestamp or _get_timestamp(), 1839 "body": new_body, 1840 } 1841 1842 if self._resources is not None: 1843 # Force the score to be in sample if it was for a legacy trace ID, i.e. 
non-32 hexchar 1844 force_sample = ( 1845 not self._is_valid_trace_id(trace_id) if trace_id else True 1846 ) 1847 1848 self._resources.add_score_task( 1849 event, 1850 force_sample=force_sample, 1851 ) 1852 1853 except Exception as e: 1854 langfuse_logger.exception( 1855 f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}" 1856 ) 1857 1858 def _create_trace_tags_via_ingestion( 1859 self, 1860 *, 1861 trace_id: str, 1862 tags: List[str], 1863 ) -> None: 1864 """Private helper to enqueue trace tag updates via ingestion API events.""" 1865 if not self._tracing_enabled: 1866 return 1867 1868 if len(tags) == 0: 1869 return 1870 1871 try: 1872 new_body = TraceBody( 1873 id=trace_id, 1874 tags=tags, 1875 ) 1876 1877 event = { 1878 "id": self.create_trace_id(), 1879 "type": "trace-create", 1880 "timestamp": _get_timestamp(), 1881 "body": new_body, 1882 } 1883 1884 if self._resources is not None: 1885 self._resources.add_trace_task(event) 1886 except Exception as e: 1887 langfuse_logger.exception( 1888 f"Error updating trace tags: Failed to process trace update event for trace_id={trace_id}. Error: {e}" 1889 ) 1890 1891 @overload 1892 def score_current_span( 1893 self, 1894 *, 1895 name: str, 1896 value: float, 1897 score_id: Optional[str] = None, 1898 data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None, 1899 comment: Optional[str] = None, 1900 config_id: Optional[str] = None, 1901 metadata: Optional[Any] = None, 1902 ) -> None: ... 1903 1904 @overload 1905 def score_current_span( 1906 self, 1907 *, 1908 name: str, 1909 value: str, 1910 score_id: Optional[str] = None, 1911 data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL", 1912 comment: Optional[str] = None, 1913 config_id: Optional[str] = None, 1914 metadata: Optional[Any] = None, 1915 ) -> None: ... 
1916 1917 def score_current_span( 1918 self, 1919 *, 1920 name: str, 1921 value: Union[float, str], 1922 score_id: Optional[str] = None, 1923 data_type: Optional[ScoreDataType] = None, 1924 comment: Optional[str] = None, 1925 config_id: Optional[str] = None, 1926 metadata: Optional[Any] = None, 1927 ) -> None: 1928 """Create a score for the current active span. 1929 1930 This method scores the currently active span in the context. It's a convenient 1931 way to score the current operation without needing to know its trace and span IDs. 1932 1933 Args: 1934 name: Name of the score (e.g., "relevance", "accuracy") 1935 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL) 1936 score_id: Optional custom ID for the score (auto-generated if not provided) 1937 data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL) 1938 comment: Optional comment or explanation for the score 1939 config_id: Optional ID of a score config defined in Langfuse 1940 metadata: Optional metadata to be attached to the score 1941 1942 Example: 1943 ```python 1944 with langfuse.start_as_current_generation(name="answer-query") as generation: 1945 # Generate answer 1946 response = generate_answer(...) 
1947 generation.update(output=response) 1948 1949 # Score the generation 1950 langfuse.score_current_span( 1951 name="relevance", 1952 value=0.85, 1953 data_type="NUMERIC", 1954 comment="Mostly relevant but contains some tangential information", 1955 metadata={"model": "gpt-4", "prompt_version": "v2"} 1956 ) 1957 ``` 1958 """ 1959 current_span = self._get_current_otel_span() 1960 1961 if current_span is not None: 1962 trace_id = self._get_otel_trace_id(current_span) 1963 observation_id = self._get_otel_span_id(current_span) 1964 1965 langfuse_logger.info( 1966 f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}" 1967 ) 1968 1969 self.create_score( 1970 trace_id=trace_id, 1971 observation_id=observation_id, 1972 name=name, 1973 value=cast(str, value), 1974 score_id=score_id, 1975 data_type=cast(Literal["CATEGORICAL"], data_type), 1976 comment=comment, 1977 config_id=config_id, 1978 metadata=metadata, 1979 ) 1980 1981 @overload 1982 def score_current_trace( 1983 self, 1984 *, 1985 name: str, 1986 value: float, 1987 score_id: Optional[str] = None, 1988 data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None, 1989 comment: Optional[str] = None, 1990 config_id: Optional[str] = None, 1991 metadata: Optional[Any] = None, 1992 ) -> None: ... 1993 1994 @overload 1995 def score_current_trace( 1996 self, 1997 *, 1998 name: str, 1999 value: str, 2000 score_id: Optional[str] = None, 2001 data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL", 2002 comment: Optional[str] = None, 2003 config_id: Optional[str] = None, 2004 metadata: Optional[Any] = None, 2005 ) -> None: ... 

    def score_current_trace(
        self,
        *,
        name: str,
        value: Union[float, str],
        score_id: Optional[str] = None,
        data_type: Optional[ScoreDataType] = None,
        comment: Optional[str] = None,
        config_id: Optional[str] = None,
        metadata: Optional[Any] = None,
    ) -> None:
        """Create a score for the current trace.

        This method scores the trace of the currently active span. Unlike score_current_span,
        this method associates the score with the entire trace rather than a specific span.
        It's useful for scoring overall performance or quality of the entire operation.

        Args:
            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
            score_id: Optional custom ID for the score (auto-generated if not provided)
            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
            comment: Optional comment or explanation for the score
            config_id: Optional ID of a score config defined in Langfuse
            metadata: Optional metadata to be attached to the score

        Example:
            ```python
            with langfuse.start_as_current_observation(name="process-user-request") as span:
                # Process request
                result = process_complete_request()
                span.update(output=result)

                # Score the overall trace
                langfuse.score_current_trace(
                    name="overall_quality",
                    value=0.95,
                    data_type="NUMERIC",
                    comment="High quality end-to-end response",
                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
                )
            ```
        """
        current_span = self._get_current_otel_span()

        # If there is no active span, the score is silently dropped.
        if current_span is not None:
            trace_id = self._get_otel_trace_id(current_span)

            langfuse_logger.info(
                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
            )

            # No observation_id: the score attaches to the trace as a whole.
            # The casts only satisfy the typed create_score overloads at
            # type-check time; the runtime values pass through unchanged.
            self.create_score(
                trace_id=trace_id,
                name=name,
                value=cast(str, value),
                score_id=score_id,
                data_type=cast(Literal["CATEGORICAL"], data_type),
                comment=comment,
                config_id=config_id,
                metadata=metadata,
            )

    def flush(self) -> None:
        """Force flush all pending spans and events to the Langfuse API.

        This method manually flushes any pending spans, scores, and other events to the
        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
        before proceeding, without waiting for the automatic flush interval.

        Example:
            ```python
            # Record some spans and scores
            with langfuse.start_as_current_observation(name="operation") as span:
                # Do work...
                pass

            # Ensure all data is sent to Langfuse before proceeding
            langfuse.flush()

            # Continue with other work
            ```
        """
        # No-op when the client has no tracing resources attached.
        if self._resources is not None:
            self._resources.flush()

    def shutdown(self) -> None:
        """Shut down the Langfuse client and flush all pending data.

        This method cleanly shuts down the Langfuse client, ensuring all pending data
        is flushed to the API and all background threads are properly terminated.

        It's important to call this method when your application is shutting down to
        prevent data loss and resource leaks. For most applications, using the client
        as a context manager or relying on the automatic shutdown via atexit is sufficient.

        Example:
            ```python
            # Initialize Langfuse
            langfuse = Langfuse(public_key="...", secret_key="...")

            # Use Langfuse throughout your application
            # ...

            # When application is shutting down
            langfuse.shutdown()
            ```
        """
        # No-op when the client has no tracing resources attached.
        if self._resources is not None:
            self._resources.shutdown()

    def get_current_trace_id(self) -> Optional[str]:
        """Get the trace ID of the current active span.

        This method retrieves the trace ID from the currently active span in the context.
        It can be used to get the trace ID for referencing in logs, external systems,
        or for creating related operations.

        Returns:
            The current trace ID as a 32-character lowercase hexadecimal string,
            or None if there is no active span.

        Example:
            ```python
            with langfuse.start_as_current_observation(name="process-request") as span:
                # Get the current trace ID for reference
                trace_id = langfuse.get_current_trace_id()

                # Use it for external correlation
                log.info(f"Processing request with trace_id: {trace_id}")

                # Or pass to another system
                external_system.process(data, trace_id=trace_id)
            ```
        """
        if not self._tracing_enabled:
            langfuse_logger.debug(
                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
            )
            return None

        current_otel_span = self._get_current_otel_span()

        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None

    def get_current_observation_id(self) -> Optional[str]:
        """Get the observation ID (span ID) of the current active span.

        This method retrieves the observation ID from the currently active span in the context.
        It can be used to get the observation ID for referencing in logs, external systems,
        or for creating scores or other related operations.

        Returns:
            The current observation ID as a 16-character lowercase hexadecimal string,
            or None if there is no active span.

        Example:
            ```python
            with langfuse.start_as_current_observation(name="process-user-query") as span:
                # Get the current observation ID
                observation_id = langfuse.get_current_observation_id()

                # Store it for later reference
                cache.set(f"query_{query_id}_observation", observation_id)

                # Process the query...
            ```
        """
        if not self._tracing_enabled:
            langfuse_logger.debug(
                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
            )
            return None

        current_otel_span = self._get_current_otel_span()

        return self._get_otel_span_id(current_otel_span) if current_otel_span else None

    def _get_project_id(self) -> Optional[str]:
        """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for api keys."""
        # Lazily resolved via the projects API on first use, then cached on
        # the client instance so subsequent calls avoid a network round-trip.
        if not self._project_id:
            proj = self.api.projects.get()
            if not proj.data or not proj.data[0].id:
                return None

            self._project_id = proj.data[0].id

        return self._project_id

    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
        """Get the URL to view a trace in the Langfuse UI.

        This method generates a URL that links directly to a trace in the Langfuse UI.
        It's useful for providing links in logs, notifications, or debugging tools.

        Args:
            trace_id: Optional trace ID to generate a URL for. If not provided,
                the trace ID of the current active span will be used.

        Returns:
            A URL string pointing to the trace in the Langfuse UI,
            or None if the project ID couldn't be retrieved or no trace ID is available.

        Example:
            ```python
            # Get URL for the current trace
            with langfuse.start_as_current_observation(name="process-request") as span:
                trace_url = langfuse.get_trace_url()
                log.info(f"Processing trace: {trace_url}")

            # Get URL for a specific trace
            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
            send_notification(f"Review needed for trace: {specific_trace_url}")
            ```
        """
        final_trace_id = trace_id or self.get_current_trace_id()
        if not final_trace_id:
            return None

        # May trigger a one-time API call; the project id is cached afterwards.
        project_id = self._get_project_id()

        return (
            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
            if project_id and final_trace_id
            else None
        )

    def get_dataset(
        self,
        name: str,
        *,
        fetch_items_page_size: Optional[int] = 50,
        version: Optional[datetime] = None,
    ) -> "DatasetClient":
        """Fetch a dataset by its name.

        Args:
            name (str): The name of the dataset to fetch.
            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
            version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC).
                If provided, returns the state of items at the specified UTC timestamp.
                If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.

        Returns:
            DatasetClient: The dataset with the given name.
        """
        try:
            langfuse_logger.debug(f"Getting datasets {name}")
            # NOTE(review): this call encodes without is_url_param=True while the
            # item listing below passes it — confirm both are intentional.
            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))

            dataset_items = []
            page = 1

            # Page through all dataset items until the API reports no more pages.
            while True:
                new_items = self.api.dataset_items.list(
                    dataset_name=self._url_encode(name, is_url_param=True),
                    page=page,
                    limit=fetch_items_page_size,
                    version=version,
                )
                dataset_items.extend(new_items.data)

                if new_items.meta.total_pages <= page:
                    break

                page += 1

            return DatasetClient(
                dataset=dataset,
                items=dataset_items,
                version=version,
                langfuse_client=self,
            )

        except Error as e:
            handle_fern_exception(e)
            raise e

    def get_dataset_run(
        self, *, dataset_name: str, run_name: str
    ) -> DatasetRunWithItems:
        """Fetch a dataset run by dataset name and run name.

        Args:
            dataset_name (str): The name of the dataset.
            run_name (str): The name of the run.

        Returns:
            DatasetRunWithItems: The dataset run with its items.
        """
        try:
            return cast(
                DatasetRunWithItems,
                self.api.datasets.get_run(
                    dataset_name=self._url_encode(dataset_name),
                    run_name=self._url_encode(run_name),
                    request_options=None,
                ),
            )
        except Error as e:
            handle_fern_exception(e)
            raise e

    def get_dataset_runs(
        self,
        *,
        dataset_name: str,
        page: Optional[int] = None,
        limit: Optional[int] = None,
    ) -> PaginatedDatasetRuns:
        """Fetch all runs for a dataset.

        Args:
            dataset_name (str): The name of the dataset.
            page (Optional[int]): Page number, starts at 1.
            limit (Optional[int]): Limit of items per page.

        Returns:
            PaginatedDatasetRuns: Paginated list of dataset runs.
        """
        try:
            return cast(
                PaginatedDatasetRuns,
                self.api.datasets.get_runs(
                    dataset_name=self._url_encode(dataset_name),
                    page=page,
                    limit=limit,
                    request_options=None,
                ),
            )
        except Error as e:
            handle_fern_exception(e)
            raise e

    def delete_dataset_run(
        self, *, dataset_name: str, run_name: str
    ) -> DeleteDatasetRunResponse:
        """Delete a dataset run and all its run items. This action is irreversible.

        Args:
            dataset_name (str): The name of the dataset.
            run_name (str): The name of the run.

        Returns:
            DeleteDatasetRunResponse: Confirmation of deletion.
        """
        try:
            return cast(
                DeleteDatasetRunResponse,
                self.api.datasets.delete_run(
                    dataset_name=self._url_encode(dataset_name),
                    run_name=self._url_encode(run_name),
                    request_options=None,
                ),
            )
        except Error as e:
            handle_fern_exception(e)
            raise e

    def run_experiment(
        self,
        *,
        name: str,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
        data: ExperimentData,
        task: TaskFunction,
        # NOTE(review): mutable default [] is shared across calls; safe only
        # because the implementation re-defaults via `evaluators or []`.
        evaluators: List[EvaluatorFunction] = [],
        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
        # NOTE(review): same mutable-default caveat as `evaluators` above.
        run_evaluators: List[RunEvaluatorFunction] = [],
        max_concurrency: int = 50,
        metadata: Optional[Dict[str, str]] = None,
        _dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Run an experiment on a dataset with automatic tracing and evaluation.

        This method executes a task function on each item in the provided dataset,
        automatically traces all executions with Langfuse for observability, runs
        item-level and run-level evaluators on the outputs, and returns comprehensive
        results with evaluation metrics.

        The experiment system provides:
        - Automatic tracing of all task executions
        - Concurrent processing with configurable limits
        - Comprehensive error handling that isolates failures
        - Integration with Langfuse datasets for experiment tracking
        - Flexible evaluation framework supporting both sync and async evaluators

        Args:
            name: Human-readable name for the experiment. Used for identification
                in the Langfuse UI.
            run_name: Optional exact name for the experiment run. If provided, this will be
                used as the exact dataset run name if the `data` contains Langfuse dataset items.
                If not provided, this will default to the experiment name appended with an ISO timestamp.
            description: Optional description explaining the experiment's purpose,
                methodology, or expected outcomes.
            data: Array of data items to process. Can be either:
                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
                - List of Langfuse DatasetItem objects from dataset.items
            task: Function that processes each data item and returns output.
                Must accept 'item' as keyword argument and can return sync or async results.
                The task function signature should be: task(*, item, **kwargs) -> Any
            evaluators: List of functions to evaluate each item's output individually.
                Each evaluator receives input, output, expected_output, and metadata.
                Can return single Evaluation dict or list of Evaluation dicts.
            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
            run_evaluators: List of functions to evaluate the entire experiment run.
                Each run evaluator receives all item_results and can compute aggregate metrics.
                Useful for calculating averages, distributions, or cross-item comparisons.
            max_concurrency: Maximum number of concurrent task executions (default: 50).
                Controls the number of items processed simultaneously. Adjust based on
                API rate limits and system resources.
            metadata: Optional metadata dictionary to attach to all experiment traces.
                This metadata will be included in every trace created during the experiment.
                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.

        Returns:
            ExperimentResult containing:
            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
            - item_results: List of results for each processed item with outputs and evaluations
            - run_evaluations: List of aggregate evaluation results for the entire run
            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)

        Raises:
            ValueError: If required parameters are missing or invalid
            Exception: If experiment setup fails (individual item failures are handled gracefully)

        Examples:
            Basic experiment with local data:
            ```python
            def summarize_text(*, item, **kwargs):
                return f"Summary: {item['input'][:50]}..."

            def length_evaluator(*, input, output, expected_output=None, **kwargs):
                return {
                    "name": "output_length",
                    "value": len(output),
                    "comment": f"Output contains {len(output)} characters"
                }

            result = langfuse.run_experiment(
                name="Text Summarization Test",
                description="Evaluate summarization quality and length",
                data=[
                    {"input": "Long article text...", "expected_output": "Expected summary"},
                    {"input": "Another article...", "expected_output": "Another summary"}
                ],
                task=summarize_text,
                evaluators=[length_evaluator]
            )

            print(f"Processed {len(result.item_results)} items")
            for item_result in result.item_results:
                print(f"Input: {item_result.item['input']}")
                print(f"Output: {item_result.output}")
                print(f"Evaluations: {item_result.evaluations}")
            ```

            Advanced experiment with async task and multiple evaluators:
            ```python
            async def llm_task(*, item, **kwargs):
                # Simulate async LLM call
                response = await openai_client.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": item["input"]}]
                )
                return response.choices[0].message.content

            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
                if expected_output and expected_output.lower() in output.lower():
                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
                # Simulate toxicity check
                toxicity_score = check_toxicity(output)  # Your toxicity checker
                return {
                    "name": "toxicity",
                    "value": toxicity_score,
                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
                }

            def average_accuracy(*, item_results, **kwargs):
                accuracies = [
                    eval.value for result in item_results
                    for eval in result.evaluations
                    if eval.name == "accuracy"
                ]
                return {
                    "name": "average_accuracy",
                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
                    "comment": f"Average accuracy across {len(accuracies)} items"
                }

            result = langfuse.run_experiment(
                name="LLM Safety and Accuracy Test",
                description="Evaluate model accuracy and safety across diverse prompts",
                data=test_dataset,  # Your dataset items
                task=llm_task,
                evaluators=[accuracy_evaluator, toxicity_evaluator],
                run_evaluators=[average_accuracy],
                max_concurrency=5,  # Limit concurrent API calls
                metadata={"model": "gpt-4", "temperature": 0.7}
            )
            ```

            Using with Langfuse datasets:
            ```python
            # Get dataset from Langfuse
            dataset = langfuse.get_dataset("my-eval-dataset")

            result = dataset.run_experiment(
                name="Production Model Evaluation",
                description="Monthly evaluation of production model performance",
                task=my_production_task,
                evaluators=[accuracy_evaluator, latency_evaluator]
            )

            # Results automatically linked to dataset in Langfuse UI
            print(f"View results: {result['dataset_run_url']}")
            ```

        Note:
            - Task and evaluator functions can be either synchronous or asynchronous
            - Individual item failures are logged but don't stop the experiment
            - All executions are automatically traced and visible in Langfuse UI
            - When using Langfuse datasets, results are automatically linked for easy comparison
            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
            - Async execution is handled automatically with smart event loop detection
        """
        # Bridge into the async implementation; run_async_safely handles the
        # case of an already-running event loop (e.g. Jupyter).
        return cast(
            ExperimentResult,
            run_async_safely(
                self._run_experiment_async(
                    name=name,
                    run_name=self._create_experiment_run_name(
                        name=name, run_name=run_name
                    ),
                    description=description,
                    data=data,
                    task=task,
                    evaluators=evaluators or [],
                    composite_evaluator=composite_evaluator,
                    run_evaluators=run_evaluators or [],
                    max_concurrency=max_concurrency,
                    metadata=metadata,
                    dataset_version=_dataset_version,
                ),
            ),
        )

    async def _run_experiment_async(
        self,
        *,
        name: str,
        run_name: str,
        description: Optional[str],
        data: ExperimentData,
        task: TaskFunction,
        evaluators: List[EvaluatorFunction],
        composite_evaluator: Optional[CompositeEvaluatorFunction],
        run_evaluators: List[RunEvaluatorFunction],
        max_concurrency: int,
        metadata: Optional[Dict[str, Any]] = None,
        dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Async implementation backing `run_experiment`.

        Processes all items concurrently (bounded by `max_concurrency`), runs
        run-level evaluators over the successful results, stores run-level
        evaluations as scores, and flushes before returning.
        """
        langfuse_logger.debug(
            f"Starting experiment '{name}' run '{run_name}' with {len(data)} items"
        )

        # Set up concurrency control
        semaphore = asyncio.Semaphore(max_concurrency)

        # Process all items
        async def process_item(item: ExperimentItem) -> ExperimentItemResult:
            async with semaphore:
                return await self._process_experiment_item(
                    item,
                    task,
                    evaluators,
                    composite_evaluator,
                    name,
                    run_name,
                    description,
                    metadata,
                    dataset_version,
                )

        # Run all items concurrently; return_exceptions=True keeps one failed
        # item from cancelling the rest.
        tasks = [process_item(item) for item in data]
        item_results = await asyncio.gather(*tasks, return_exceptions=True)

        # Filter out any exceptions and log errors
        valid_results: List[ExperimentItemResult] = []
        for i, result in enumerate(item_results):
            if isinstance(result, Exception):
                langfuse_logger.error(f"Item {i} failed: {result}")
            elif isinstance(result, ExperimentItemResult):
                valid_results.append(result)  # type: ignore

        # Run experiment-level evaluators; failures are logged, not raised.
        run_evaluations: List[Evaluation] = []
        for run_evaluator in run_evaluators:
            try:
                evaluations = await _run_evaluator(
                    run_evaluator, item_results=valid_results
                )
                run_evaluations.extend(evaluations)
            except Exception as e:
                langfuse_logger.error(f"Run evaluator failed: {e}")

        # Generate dataset run URL if applicable
        dataset_run_id = valid_results[0].dataset_run_id if valid_results else None
        dataset_run_url = None
        if dataset_run_id and data:
            try:
                # Check if the first item has dataset_id (for DatasetItem objects)
                first_item = data[0]
                dataset_id = None

                if hasattr(first_item, "dataset_id"):
                    dataset_id = getattr(first_item, "dataset_id", None)

                if dataset_id:
                    project_id = self._get_project_id()

                    if project_id:
                        dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}"

            except Exception:
                pass  # URL generation is optional

        # Store run-level evaluations as scores (only possible when the run is
        # linked to a dataset run).
        for evaluation in run_evaluations:
            try:
                if dataset_run_id:
                    self.create_score(
                        dataset_run_id=dataset_run_id,
                        name=evaluation.name or "<unknown>",
                        value=evaluation.value,  # type: ignore
                        comment=evaluation.comment,
                        metadata=evaluation.metadata,
                        data_type=evaluation.data_type,  # type: ignore
                        config_id=evaluation.config_id,
                    )

            except Exception as e:
                langfuse_logger.error(f"Failed to store run evaluation: {e}")

        # Flush scores and traces
        self.flush()

        return ExperimentResult(
            name=name,
            run_name=run_name,
            description=description,
            item_results=valid_results,
            run_evaluations=run_evaluations,
            dataset_run_id=dataset_run_id,
            dataset_run_url=dataset_run_url,
        )

    async def _process_experiment_item(
        self,
        item: ExperimentItem,
        task: Callable,
        evaluators: List[Callable],
        composite_evaluator: Optional[CompositeEvaluatorFunction],
        experiment_name: str,
        experiment_run_name: str,
        experiment_description: Optional[str],
        experiment_metadata: Optional[Dict[str, Any]] = None,
        dataset_version: Optional[datetime] = None,
    ) -> ExperimentItemResult:
        """Run the task and evaluators for one experiment item inside a root span.

        Internal helper for `_run_experiment_async`. Task failures re-raise so
        the caller can collect them via `asyncio.gather(return_exceptions=True)`.
        """
        span_name = "experiment-item-run"

        with self.start_as_current_observation(name=span_name) as span:
            try:
                # Items may be plain dicts or DatasetItem-like objects; read
                # fields accordingly.
                input_data = (
                    item.get("input")
                    if isinstance(item, dict)
                    else getattr(item, "input", None)
                )

                if input_data is None:
                    raise ValueError("Experiment Item is missing input. Skipping item.")

                expected_output = (
                    item.get("expected_output")
                    if isinstance(item, dict)
                    else getattr(item, "expected_output", None)
                )

                item_metadata = (
                    item.get("metadata")
                    if isinstance(item, dict)
                    else getattr(item, "metadata", None)
                )

                final_observation_metadata = {
                    "experiment_name": experiment_name,
                    "experiment_run_name": experiment_run_name,
                    **(experiment_metadata or {}),
                }

                trace_id = span.trace_id
                dataset_id = None
                dataset_item_id = None
                dataset_run_id = None

                # Link to dataset run if this is a dataset item
                if hasattr(item, "id") and hasattr(item, "dataset_id"):
                    try:
                        # Use sync API to avoid event loop issues when run_async_safely
                        # creates multiple event loops across different threads
                        dataset_run_item = await asyncio.to_thread(
                            self.api.dataset_run_items.create,
                            run_name=experiment_run_name,
                            run_description=experiment_description,
                            metadata=experiment_metadata,
                            dataset_item_id=item.id,  # type: ignore
                            trace_id=trace_id,
                            observation_id=span.id,
                            dataset_version=dataset_version,
                        )

                        dataset_run_id = dataset_run_item.dataset_run_id

                    except Exception as e:
                        langfuse_logger.error(f"Failed to create dataset run item: {e}")

                if (
                    not isinstance(item, dict)
                    and hasattr(item, "dataset_id")
                    and hasattr(item, "id")
                ):
                    dataset_id = item.dataset_id
                    dataset_item_id = item.id

                    final_observation_metadata.update(
                        {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id}
                    )

                if isinstance(item_metadata, dict):
                    final_observation_metadata.update(item_metadata)

                experiment_id = dataset_run_id or self._create_observation_id()
                # Fall back to a stable 16-char hash of the serialized input
                # when the item has no dataset item id.
                experiment_item_id = (
                    dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16]
                )
                # Only non-None values are written as span attributes.
                span._otel_span.set_attributes(
                    {
                        k: v
                        for k, v in {
                            LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT,
                            LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description,
                            LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize(
                                expected_output
                            ),
                        }.items()
                        if v is not None
                    }
                )

                propagated_experiment_attributes = PropagatedExperimentAttributes(
                    experiment_id=experiment_id,
                    experiment_name=experiment_run_name,
                    experiment_metadata=_serialize(experiment_metadata),
                    experiment_dataset_id=dataset_id,
                    experiment_item_id=experiment_item_id,
                    experiment_item_metadata=_serialize(item_metadata),
                    experiment_item_root_observation_id=span.id,
                )

                # Run the task with experiment attributes propagated to any
                # spans it creates.
                with _propagate_attributes(experiment=propagated_experiment_attributes):
                    output = await _run_task(task, item)

                span.update(
                    input=input_data,
                    output=output,
                    metadata=final_observation_metadata,
                )

            except Exception as e:
                span.update(
                    output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
                )
                raise e

            # Run evaluators
            evaluations = []

            for evaluator in evaluators:
                try:
                    eval_metadata: Optional[Dict[str, Any]] = None

                    if isinstance(item, dict):
                        eval_metadata = item.get("metadata")
                    elif hasattr(item, "metadata"):
                        eval_metadata = item.metadata

                    with _propagate_attributes(
                        experiment=propagated_experiment_attributes
                    ):
                        eval_results = await _run_evaluator(
                            evaluator,
                            input=input_data,
                            output=output,
                            expected_output=expected_output,
                            metadata=eval_metadata,
                        )
                        evaluations.extend(eval_results)

                        # Store evaluations as scores
                        for evaluation in eval_results:
                            self.create_score(
                                trace_id=trace_id,
                                observation_id=span.id,
                                name=evaluation.name,
                                value=evaluation.value,  # type: ignore
                                comment=evaluation.comment,
                                metadata=evaluation.metadata,
                                config_id=evaluation.config_id,
                                data_type=evaluation.data_type,  # type: ignore
                            )

                except Exception as e:
                    # Evaluator failures are isolated; the item still succeeds.
                    langfuse_logger.error(f"Evaluator failed: {e}")

            # Run composite evaluator if provided and we have evaluations
            if composite_evaluator and evaluations:
                try:
                    composite_eval_metadata: Optional[Dict[str, Any]] = None
                    if isinstance(item, dict):
                        composite_eval_metadata = item.get("metadata")
                    elif hasattr(item, "metadata"):
                        composite_eval_metadata = item.metadata

                    with _propagate_attributes(
                        experiment=propagated_experiment_attributes
                    ):
                        result = composite_evaluator(
                            input=input_data,
                            output=output,
                            expected_output=expected_output,
                            metadata=composite_eval_metadata,
                            evaluations=evaluations,
                        )

                        # Handle async composite evaluators
                        if asyncio.iscoroutine(result):
                            result = await result

                        # Normalize to list
                        composite_evals: List[Evaluation] = []
                        if isinstance(result, (dict, Evaluation)):
                            composite_evals = [result]  # type: ignore
                        elif isinstance(result, list):
                            composite_evals = result  # type: ignore

                        # Store composite evaluations as scores and add to evaluations list
                        for composite_evaluation in composite_evals:
                            self.create_score(
                                trace_id=trace_id,
                                observation_id=span.id,
                                name=composite_evaluation.name,
                                value=composite_evaluation.value,  # type: ignore
                                comment=composite_evaluation.comment,
                                metadata=composite_evaluation.metadata,
                                config_id=composite_evaluation.config_id,
                                data_type=composite_evaluation.data_type,  # type: ignore
                            )
                            evaluations.append(composite_evaluation)

                except Exception as e:
                    langfuse_logger.error(f"Composite evaluator failed: {e}")

        return ExperimentItemResult(
            item=item,
            output=output,
            evaluations=evaluations,
            trace_id=trace_id,
            dataset_run_id=dataset_run_id,
        )

    def _create_experiment_run_name(
        self, *, name: Optional[str] = None, run_name: Optional[str] = None
    ) -> str:
        """Return `run_name` if given, else `name` plus an ISO timestamp ('Z'-suffixed)."""
        if run_name:
            return run_name

        iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z")

        return f"{name} - {iso_timestamp}"

    def run_batched_evaluation(
        self,
        *,
        scope: Literal["traces", "observations"],
        mapper: MapperFunction,
        filter: Optional[str] = None,
        fetch_batch_size: int = 50,
        fetch_trace_fields: Optional[str] = None,
        max_items: Optional[int] = None,
        max_retries: int = 3,
        evaluators: List[EvaluatorFunction],
        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
        max_concurrency: int = 5,
        metadata: Optional[Dict[str, Any]] = None,
        _add_observation_scores_to_trace: bool = False,
        _additional_trace_tags: Optional[List[str]] = None,
        resume_from: Optional[BatchEvaluationResumeToken] = None,
        verbose: bool = False,
    ) -> BatchEvaluationResult:
        """Fetch traces or observations and run evaluations on each
item. 2926 2927 This method provides a powerful way to evaluate existing data in Langfuse at scale. 2928 It fetches items based on filters, transforms them using a mapper function, runs 2929 evaluators on each item, and creates scores that are linked back to the original 2930 entities. This is ideal for: 2931 2932 - Running evaluations on production traces after deployment 2933 - Backtesting new evaluation metrics on historical data 2934 - Batch scoring of observations for quality monitoring 2935 - Periodic evaluation runs on recent data 2936 2937 The method uses a streaming/pipeline approach to process items in batches, making 2938 it memory-efficient for large datasets. It includes comprehensive error handling, 2939 retry logic, and resume capability for long-running evaluations. 2940 2941 Args: 2942 scope: The type of items to evaluate. Must be one of: 2943 - "traces": Evaluate complete traces with all their observations 2944 - "observations": Evaluate individual observations (spans, generations, events) 2945 mapper: Function that transforms API response objects into evaluator inputs. 2946 Receives a trace/observation object and returns an EvaluatorInputs 2947 instance with input, output, expected_output, and metadata fields. 2948 Can be sync or async. 2949 evaluators: List of evaluation functions to run on each item. Each evaluator 2950 receives the mapped inputs and returns Evaluation object(s). Evaluator 2951 failures are logged but don't stop the batch evaluation. 2952 filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples: 2953 - '{"tags": ["production"]}' 2954 - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' 2955 Default: None (fetches all items). 2956 fetch_batch_size: Number of items to fetch per API call and hold in memory. 2957 Larger values may be faster but use more memory. Default: 50. 2958 fetch_trace_fields: Comma-separated list of fields to include when fetching traces. 
Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'. 2959 max_items: Maximum total number of items to process. If None, processes all 2960 items matching the filter. Useful for testing or limiting evaluation runs. 2961 Default: None (process all). 2962 max_concurrency: Maximum number of items to evaluate concurrently. Controls 2963 parallelism and resource usage. Default: 5. 2964 composite_evaluator: Optional function that creates a composite score from 2965 item-level evaluations. Receives the original item and its evaluations, 2966 returns a single Evaluation. Useful for weighted averages or combined metrics. 2967 Default: None. 2968 metadata: Optional metadata dict to add to all created scores. Useful for 2969 tracking evaluation runs, versions, or other context. Default: None. 2970 max_retries: Maximum number of retry attempts for failed batch fetches. 2971 Uses exponential backoff (1s, 2s, 4s). Default: 3. 2972 verbose: If True, logs progress information to console. Useful for monitoring 2973 long-running evaluations. Default: False. 2974 resume_from: Optional resume token from a previous incomplete run. Allows 2975 continuing evaluation after interruption or failure. Default: None. 
2976 2977 2978 Returns: 2979 BatchEvaluationResult containing: 2980 - total_items_fetched: Number of items fetched from API 2981 - total_items_processed: Number of items successfully evaluated 2982 - total_items_failed: Number of items that failed evaluation 2983 - total_scores_created: Scores created by item-level evaluators 2984 - total_composite_scores_created: Scores created by composite evaluator 2985 - total_evaluations_failed: Individual evaluator failures 2986 - evaluator_stats: Per-evaluator statistics (success rate, scores created) 2987 - resume_token: Token for resuming if incomplete (None if completed) 2988 - completed: True if all items processed 2989 - duration_seconds: Total execution time 2990 - failed_item_ids: IDs of items that failed 2991 - error_summary: Error types and counts 2992 - has_more_items: True if max_items reached but more exist 2993 2994 Raises: 2995 ValueError: If invalid scope is provided. 2996 2997 Examples: 2998 Basic trace evaluation: 2999 ```python 3000 from langfuse import Langfuse, EvaluatorInputs, Evaluation 3001 3002 client = Langfuse() 3003 3004 # Define mapper to extract fields from traces 3005 def trace_mapper(trace): 3006 return EvaluatorInputs( 3007 input=trace.input, 3008 output=trace.output, 3009 expected_output=None, 3010 metadata={"trace_id": trace.id} 3011 ) 3012 3013 # Define evaluator 3014 def length_evaluator(*, input, output, expected_output, metadata): 3015 return Evaluation( 3016 name="output_length", 3017 value=len(output) if output else 0 3018 ) 3019 3020 # Run batch evaluation 3021 result = client.run_batched_evaluation( 3022 scope="traces", 3023 mapper=trace_mapper, 3024 evaluators=[length_evaluator], 3025 filter='{"tags": ["production"]}', 3026 max_items=1000, 3027 verbose=True 3028 ) 3029 3030 print(f"Processed {result.total_items_processed} traces") 3031 print(f"Created {result.total_scores_created} scores") 3032 ``` 3033 3034 Evaluation with composite scorer: 3035 ```python 3036 def 
accuracy_evaluator(*, input, output, expected_output, metadata): 3037 # ... evaluation logic 3038 return Evaluation(name="accuracy", value=0.85) 3039 3040 def relevance_evaluator(*, input, output, expected_output, metadata): 3041 # ... evaluation logic 3042 return Evaluation(name="relevance", value=0.92) 3043 3044 def composite_evaluator(*, item, evaluations): 3045 # Weighted average of evaluations 3046 weights = {"accuracy": 0.6, "relevance": 0.4} 3047 total = sum( 3048 e.value * weights.get(e.name, 0) 3049 for e in evaluations 3050 if isinstance(e.value, (int, float)) 3051 ) 3052 return Evaluation( 3053 name="composite_score", 3054 value=total, 3055 comment=f"Weighted average of {len(evaluations)} metrics" 3056 ) 3057 3058 result = client.run_batched_evaluation( 3059 scope="traces", 3060 mapper=trace_mapper, 3061 evaluators=[accuracy_evaluator, relevance_evaluator], 3062 composite_evaluator=composite_evaluator, 3063 filter='{"user_id": "important_user"}', 3064 verbose=True 3065 ) 3066 ``` 3067 3068 Handling incomplete runs with resume: 3069 ```python 3070 # Initial run that may fail or timeout 3071 result = client.run_batched_evaluation( 3072 scope="observations", 3073 mapper=obs_mapper, 3074 evaluators=[my_evaluator], 3075 max_items=10000, 3076 verbose=True 3077 ) 3078 3079 # Check if incomplete 3080 if not result.completed and result.resume_token: 3081 print(f"Processed {result.resume_token.items_processed} items before interruption") 3082 3083 # Resume from where it left off 3084 result = client.run_batched_evaluation( 3085 scope="observations", 3086 mapper=obs_mapper, 3087 evaluators=[my_evaluator], 3088 resume_from=result.resume_token, 3089 verbose=True 3090 ) 3091 3092 print(f"Total items processed: {result.total_items_processed}") 3093 ``` 3094 3095 Monitoring evaluator performance: 3096 ```python 3097 result = client.run_batched_evaluation(...) 
3098 3099 for stats in result.evaluator_stats: 3100 success_rate = stats.successful_runs / stats.total_runs 3101 print(f"{stats.name}:") 3102 print(f" Success rate: {success_rate:.1%}") 3103 print(f" Scores created: {stats.total_scores_created}") 3104 3105 if stats.failed_runs > 0: 3106 print(f" â ī¸ Failed {stats.failed_runs} times") 3107 ``` 3108 3109 Note: 3110 - Evaluator failures are logged but don't stop the batch evaluation 3111 - Individual item failures are tracked but don't stop processing 3112 - Fetch failures are retried with exponential backoff 3113 - All scores are automatically flushed to Langfuse at the end 3114 - The resume mechanism uses timestamp-based filtering to avoid duplicates 3115 """ 3116 runner = BatchEvaluationRunner(self) 3117 3118 return cast( 3119 BatchEvaluationResult, 3120 run_async_safely( 3121 runner.run_async( 3122 scope=scope, 3123 mapper=mapper, 3124 evaluators=evaluators, 3125 filter=filter, 3126 fetch_batch_size=fetch_batch_size, 3127 fetch_trace_fields=fetch_trace_fields, 3128 max_items=max_items, 3129 max_concurrency=max_concurrency, 3130 composite_evaluator=composite_evaluator, 3131 metadata=metadata, 3132 _add_observation_scores_to_trace=_add_observation_scores_to_trace, 3133 _additional_trace_tags=_additional_trace_tags, 3134 max_retries=max_retries, 3135 verbose=verbose, 3136 resume_from=resume_from, 3137 ) 3138 ), 3139 ) 3140 3141 def auth_check(self) -> bool: 3142 """Check if the provided credentials (public and secret key) are valid. 3143 3144 Raises: 3145 Exception: If no projects were found for the provided credentials. 3146 3147 Note: 3148 This method is blocking. It is discouraged to use it in production code. 3149 """ 3150 try: 3151 projects = self.api.projects.get() 3152 langfuse_logger.debug( 3153 f"Auth check successful, found {len(projects.data)} projects" 3154 ) 3155 if len(projects.data) == 0: 3156 raise Exception( 3157 "Auth check failed, no project found for the keys provided." 
3158 ) 3159 return True 3160 3161 except AttributeError as e: 3162 langfuse_logger.warning( 3163 f"Auth check failed: Client not properly initialized. Error: {e}" 3164 ) 3165 return False 3166 3167 except Error as e: 3168 handle_fern_exception(e) 3169 raise e 3170 3171 def create_dataset( 3172 self, 3173 *, 3174 name: str, 3175 description: Optional[str] = None, 3176 metadata: Optional[Any] = None, 3177 input_schema: Optional[Any] = None, 3178 expected_output_schema: Optional[Any] = None, 3179 ) -> Dataset: 3180 """Create a dataset with the given name on Langfuse. 3181 3182 Args: 3183 name: Name of the dataset to create. 3184 description: Description of the dataset. Defaults to None. 3185 metadata: Additional metadata. Defaults to None. 3186 input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema. 3187 expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema. 3188 3189 Returns: 3190 Dataset: The created dataset as returned by the Langfuse API. 3191 """ 3192 try: 3193 langfuse_logger.debug(f"Creating datasets {name}") 3194 3195 result = self.api.datasets.create( 3196 name=name, 3197 description=description, 3198 metadata=metadata, 3199 input_schema=input_schema, 3200 expected_output_schema=expected_output_schema, 3201 ) 3202 3203 return cast(Dataset, result) 3204 3205 except Error as e: 3206 handle_fern_exception(e) 3207 raise e 3208 3209 def create_dataset_item( 3210 self, 3211 *, 3212 dataset_name: str, 3213 input: Optional[Any] = None, 3214 expected_output: Optional[Any] = None, 3215 metadata: Optional[Any] = None, 3216 source_trace_id: Optional[str] = None, 3217 source_observation_id: Optional[str] = None, 3218 status: Optional[DatasetStatus] = None, 3219 id: Optional[str] = None, 3220 ) -> DatasetItem: 3221 """Create a dataset item. 3222 3223 Upserts if an item with id already exists. 
3224 3225 Args: 3226 dataset_name: Name of the dataset in which the dataset item should be created. 3227 input: Input data. Defaults to None. Can contain any dict, list or scalar. 3228 expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar. 3229 metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar. 3230 source_trace_id: Id of the source trace. Defaults to None. 3231 source_observation_id: Id of the source observation. Defaults to None. 3232 status: Status of the dataset item. Defaults to ACTIVE for newly created items. 3233 id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets. 3234 3235 Returns: 3236 DatasetItem: The created dataset item as returned by the Langfuse API. 3237 3238 Example: 3239 ```python 3240 from langfuse import Langfuse 3241 3242 langfuse = Langfuse() 3243 3244 # Uploading items to the Langfuse dataset named "capital_cities" 3245 langfuse.create_dataset_item( 3246 dataset_name="capital_cities", 3247 input={"input": {"country": "Italy"}}, 3248 expected_output={"expected_output": "Rome"}, 3249 metadata={"foo": "bar"} 3250 ) 3251 ``` 3252 """ 3253 try: 3254 langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}") 3255 3256 result = self.api.dataset_items.create( 3257 dataset_name=dataset_name, 3258 input=input, 3259 expected_output=expected_output, 3260 metadata=metadata, 3261 source_trace_id=source_trace_id, 3262 source_observation_id=source_observation_id, 3263 status=status, 3264 id=id, 3265 ) 3266 3267 return cast(DatasetItem, result) 3268 except Error as e: 3269 handle_fern_exception(e) 3270 raise e 3271 3272 def resolve_media_references( 3273 self, 3274 *, 3275 obj: Any, 3276 resolve_with: Literal["base64_data_uri"], 3277 max_depth: int = 10, 3278 content_fetch_timeout_seconds: int = 5, 3279 ) -> Any: 3280 """Replace media reference strings 
in an object with base64 data URIs. 3281 3282 This method recursively traverses an object (up to max_depth) looking for media reference strings 3283 in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using 3284 the provided Langfuse client and replaces the reference string with a base64 data URI. 3285 3286 If fetching media content fails for a reference string, a warning is logged and the reference 3287 string is left unchanged. 3288 3289 Args: 3290 obj: The object to process. Can be a primitive value, array, or nested object. 3291 If the object has a __dict__ attribute, a dict will be returned instead of the original object type. 3292 resolve_with: The representation of the media content to replace the media reference string with. 3293 Currently only "base64_data_uri" is supported. 3294 max_depth: int: The maximum depth to traverse the object. Default is 10. 3295 content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5. 3296 3297 Returns: 3298 A deep copy of the input object with all media references replaced with base64 data URIs where possible. 3299 If the input object has a __dict__ attribute, a dict will be returned instead of the original object type. 3300 3301 Example: 3302 obj = { 3303 "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", 3304 "nested": { 3305 "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" 3306 } 3307 } 3308 3309 result = await LangfuseMedia.resolve_media_references(obj, langfuse_client) 3310 3311 # Result: 3312 # { 3313 # "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...", 3314 # "nested": { 3315 # "pdf": "data:application/pdf;base64,JVBERi0xLjcK..." 
3316 # } 3317 # } 3318 """ 3319 return LangfuseMedia.resolve_media_references( 3320 langfuse_client=self, 3321 obj=obj, 3322 resolve_with=resolve_with, 3323 max_depth=max_depth, 3324 content_fetch_timeout_seconds=content_fetch_timeout_seconds, 3325 ) 3326 3327 @overload 3328 def get_prompt( 3329 self, 3330 name: str, 3331 *, 3332 version: Optional[int] = None, 3333 label: Optional[str] = None, 3334 type: Literal["chat"], 3335 cache_ttl_seconds: Optional[int] = None, 3336 fallback: Optional[List[ChatMessageDict]] = None, 3337 max_retries: Optional[int] = None, 3338 fetch_timeout_seconds: Optional[int] = None, 3339 ) -> ChatPromptClient: ... 3340 3341 @overload 3342 def get_prompt( 3343 self, 3344 name: str, 3345 *, 3346 version: Optional[int] = None, 3347 label: Optional[str] = None, 3348 type: Literal["text"] = "text", 3349 cache_ttl_seconds: Optional[int] = None, 3350 fallback: Optional[str] = None, 3351 max_retries: Optional[int] = None, 3352 fetch_timeout_seconds: Optional[int] = None, 3353 ) -> TextPromptClient: ... 3354 3355 def get_prompt( 3356 self, 3357 name: str, 3358 *, 3359 version: Optional[int] = None, 3360 label: Optional[str] = None, 3361 type: Literal["chat", "text"] = "text", 3362 cache_ttl_seconds: Optional[int] = None, 3363 fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None, 3364 max_retries: Optional[int] = None, 3365 fetch_timeout_seconds: Optional[int] = None, 3366 ) -> PromptClient: 3367 """Get a prompt. 3368 3369 This method attempts to fetch the requested prompt from the local cache. If the prompt is not found 3370 in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again 3371 and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will 3372 return the expired prompt as a fallback. 3373 3374 Args: 3375 name (str): The name of the prompt to retrieve. 
3376 3377 Keyword Args: 3378 version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both. 3379 label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both. 3380 cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a 3381 keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0. 3382 type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text". 3383 fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None. 3384 max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds. 3385 fetch_timeout_seconds: Optional[int]: The timeout in milliseconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default. 3386 3387 Returns: 3388 The prompt object retrieved from the cache or directly fetched if not cached or expired of type 3389 - TextPromptClient, if type argument is 'text'. 3390 - ChatPromptClient, if type argument is 'chat'. 3391 3392 Raises: 3393 Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an 3394 expired prompt in the cache, in which case it logs a warning and returns the expired prompt. 3395 """ 3396 if self._resources is None: 3397 raise Error( 3398 "SDK is not correctly initialized. Check the init logs for more details." 
3399 ) 3400 if version is not None and label is not None: 3401 raise ValueError("Cannot specify both version and label at the same time.") 3402 3403 if not name: 3404 raise ValueError("Prompt name cannot be empty.") 3405 3406 cache_key = PromptCache.generate_cache_key(name, version=version, label=label) 3407 bounded_max_retries = self._get_bounded_max_retries( 3408 max_retries, default_max_retries=2, max_retries_upper_bound=4 3409 ) 3410 3411 langfuse_logger.debug(f"Getting prompt '{cache_key}'") 3412 cached_prompt = self._resources.prompt_cache.get(cache_key) 3413 3414 if cached_prompt is None or cache_ttl_seconds == 0: 3415 langfuse_logger.debug( 3416 f"Prompt '{cache_key}' not found in cache or caching disabled." 3417 ) 3418 try: 3419 return self._fetch_prompt_and_update_cache( 3420 name, 3421 version=version, 3422 label=label, 3423 ttl_seconds=cache_ttl_seconds, 3424 max_retries=bounded_max_retries, 3425 fetch_timeout_seconds=fetch_timeout_seconds, 3426 ) 3427 except Exception as e: 3428 if fallback: 3429 langfuse_logger.warning( 3430 f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}" 3431 ) 3432 3433 fallback_client_args: Dict[str, Any] = { 3434 "name": name, 3435 "prompt": fallback, 3436 "type": type, 3437 "version": version or 0, 3438 "config": {}, 3439 "labels": [label] if label else [], 3440 "tags": [], 3441 } 3442 3443 if type == "text": 3444 return TextPromptClient( 3445 prompt=Prompt_Text(**fallback_client_args), 3446 is_fallback=True, 3447 ) 3448 3449 if type == "chat": 3450 return ChatPromptClient( 3451 prompt=Prompt_Chat(**fallback_client_args), 3452 is_fallback=True, 3453 ) 3454 3455 raise e 3456 3457 if cached_prompt.is_expired(): 3458 langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.") 3459 try: 3460 # refresh prompt in background thread, refresh_prompt deduplicates tasks 3461 langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.") 3462 3463 def refresh_task() -> None: 3464 
self._fetch_prompt_and_update_cache( 3465 name, 3466 version=version, 3467 label=label, 3468 ttl_seconds=cache_ttl_seconds, 3469 max_retries=bounded_max_retries, 3470 fetch_timeout_seconds=fetch_timeout_seconds, 3471 ) 3472 3473 self._resources.prompt_cache.add_refresh_prompt_task( 3474 cache_key, 3475 refresh_task, 3476 ) 3477 langfuse_logger.debug( 3478 f"Returning stale prompt '{cache_key}' from cache." 3479 ) 3480 # return stale prompt 3481 return cached_prompt.value 3482 3483 except Exception as e: 3484 langfuse_logger.warning( 3485 f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}" 3486 ) 3487 # creation of refresh prompt task failed, return stale prompt 3488 return cached_prompt.value 3489 3490 return cached_prompt.value 3491 3492 def _fetch_prompt_and_update_cache( 3493 self, 3494 name: str, 3495 *, 3496 version: Optional[int] = None, 3497 label: Optional[str] = None, 3498 ttl_seconds: Optional[int] = None, 3499 max_retries: int, 3500 fetch_timeout_seconds: Optional[int], 3501 ) -> PromptClient: 3502 cache_key = PromptCache.generate_cache_key(name, version=version, label=label) 3503 langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...") 3504 3505 try: 3506 3507 @backoff.on_exception( 3508 backoff.constant, Exception, max_tries=max_retries + 1, logger=None 3509 ) 3510 def fetch_prompts() -> Any: 3511 return self.api.prompts.get( 3512 self._url_encode(name), 3513 version=version, 3514 label=label, 3515 request_options={ 3516 "timeout_in_seconds": fetch_timeout_seconds, 3517 } 3518 if fetch_timeout_seconds is not None 3519 else None, 3520 ) 3521 3522 prompt_response = fetch_prompts() 3523 3524 prompt: PromptClient 3525 if prompt_response.type == "chat": 3526 prompt = ChatPromptClient(prompt_response) 3527 else: 3528 prompt = TextPromptClient(prompt_response) 3529 3530 if self._resources is not None: 3531 self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds) 3532 3533 return prompt 3534 3535 
except NotFoundError as not_found_error: 3536 langfuse_logger.warning( 3537 f"Prompt '{cache_key}' not found during refresh, evicting from cache." 3538 ) 3539 if self._resources is not None: 3540 self._resources.prompt_cache.delete(cache_key) 3541 raise not_found_error 3542 3543 except Exception as e: 3544 langfuse_logger.error( 3545 f"Error while fetching prompt '{cache_key}': {str(e)}" 3546 ) 3547 raise e 3548 3549 def _get_bounded_max_retries( 3550 self, 3551 max_retries: Optional[int], 3552 *, 3553 default_max_retries: int = 2, 3554 max_retries_upper_bound: int = 4, 3555 ) -> int: 3556 if max_retries is None: 3557 return default_max_retries 3558 3559 bounded_max_retries = min( 3560 max(max_retries, 0), 3561 max_retries_upper_bound, 3562 ) 3563 3564 return bounded_max_retries 3565 3566 @overload 3567 def create_prompt( 3568 self, 3569 *, 3570 name: str, 3571 prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]], 3572 labels: List[str] = [], 3573 tags: Optional[List[str]] = None, 3574 type: Optional[Literal["chat"]], 3575 config: Optional[Any] = None, 3576 commit_message: Optional[str] = None, 3577 ) -> ChatPromptClient: ... 3578 3579 @overload 3580 def create_prompt( 3581 self, 3582 *, 3583 name: str, 3584 prompt: str, 3585 labels: List[str] = [], 3586 tags: Optional[List[str]] = None, 3587 type: Optional[Literal["text"]] = "text", 3588 config: Optional[Any] = None, 3589 commit_message: Optional[str] = None, 3590 ) -> TextPromptClient: ... 3591 3592 def create_prompt( 3593 self, 3594 *, 3595 name: str, 3596 prompt: Union[ 3597 str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]] 3598 ], 3599 labels: List[str] = [], 3600 tags: Optional[List[str]] = None, 3601 type: Optional[Literal["chat", "text"]] = "text", 3602 config: Optional[Any] = None, 3603 commit_message: Optional[str] = None, 3604 ) -> PromptClient: 3605 """Create a new prompt in Langfuse. 3606 3607 Keyword Args: 3608 name : The name of the prompt to be created. 
3609 prompt : The content of the prompt to be created. 3610 is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead. 3611 labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label. 3612 tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt. 3613 config: Additional structured data to be saved with the prompt. Defaults to None. 3614 type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text". 3615 commit_message: Optional string describing the change. 3616 3617 Returns: 3618 TextPromptClient: The prompt if type argument is 'text'. 3619 ChatPromptClient: The prompt if type argument is 'chat'. 3620 """ 3621 try: 3622 langfuse_logger.debug(f"Creating prompt {name=}, {labels=}") 3623 3624 if type == "chat": 3625 if not isinstance(prompt, list): 3626 raise ValueError( 3627 "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes." 
3628 ) 3629 request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = ( 3630 CreateChatPromptRequest( 3631 name=name, 3632 prompt=cast(Any, prompt), 3633 labels=labels, 3634 tags=tags, 3635 config=config or {}, 3636 commit_message=commit_message, 3637 type=CreateChatPromptType.CHAT, 3638 ) 3639 ) 3640 server_prompt = self.api.prompts.create(request=request) 3641 3642 if self._resources is not None: 3643 self._resources.prompt_cache.invalidate(name) 3644 3645 return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt)) 3646 3647 if not isinstance(prompt, str): 3648 raise ValueError("For 'text' type, 'prompt' must be a string.") 3649 3650 request = CreateTextPromptRequest( 3651 name=name, 3652 prompt=prompt, 3653 labels=labels, 3654 tags=tags, 3655 config=config or {}, 3656 commit_message=commit_message, 3657 ) 3658 3659 server_prompt = self.api.prompts.create(request=request) 3660 3661 if self._resources is not None: 3662 self._resources.prompt_cache.invalidate(name) 3663 3664 return TextPromptClient(prompt=cast(Prompt_Text, server_prompt)) 3665 3666 except Error as e: 3667 handle_fern_exception(e) 3668 raise e 3669 3670 def update_prompt( 3671 self, 3672 *, 3673 name: str, 3674 version: int, 3675 new_labels: List[str] = [], 3676 ) -> Any: 3677 """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name. 3678 3679 Args: 3680 name (str): The name of the prompt to update. 3681 version (int): The version number of the prompt to update. 3682 new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to []. 3683 3684 Returns: 3685 Prompt: The updated prompt from the Langfuse API. 
3686 3687 """ 3688 updated_prompt = self.api.prompt_version.update( 3689 name=self._url_encode(name), 3690 version=version, 3691 new_labels=new_labels, 3692 ) 3693 3694 if self._resources is not None: 3695 self._resources.prompt_cache.invalidate(name) 3696 3697 return updated_prompt 3698 3699 def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str: 3700 # httpx âĨ 0.28 does its own WHATWG-compliant quoting (eg. encodes bare 3701 # â%â, â?â, â#â, â|â, âĻ in query/path parts). Re-quoting here would 3702 # double-encode, so we skip when the value is about to be sent straight 3703 # to httpx (`is_url_param=True`) and the installed version is âĨ 0.28. 3704 if is_url_param and Version(httpx.__version__) >= Version("0.28.0"): 3705 return url 3706 3707 # urllib.parse.quote does not escape slashes "/" by default; we need to add safe="" to force escaping 3708 # we need add safe="" to force escaping of slashes 3709 # This is necessary for prompts in prompt folders 3710 return urllib.parse.quote(url, safe="") 3711 3712 def clear_prompt_cache(self) -> None: 3713 """Clear the entire prompt cache, removing all cached prompts. 3714 3715 This method is useful when you want to force a complete refresh of all 3716 cached prompts, for example after major updates or when you need to 3717 ensure the latest versions are fetched from the server. 3718 """ 3719 if self._resources is not None: 3720 self._resources.prompt_cache.clear()
Main client for Langfuse tracing and platform features.
This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.
The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.
Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.
Attributes:
- api: Synchronous API client for Langfuse backend communication
- async_api: Asynchronous API client for Langfuse backend communication
- _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
- public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
- secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
- base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
- host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
- timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
- httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
- debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
- tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
- flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
- flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
- environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
- release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
- media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
- sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
- mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
- blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior: `from langfuse.span_filter import is_default_export_span; blocked = {"sqlite", "requests"}; should_export_span = lambda span: (is_default_export_span(span) and (span.instrumentation_scope is None or span.instrumentation_scope.name not in blocked))`
- should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
- additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well.
- tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to have disconnected tracing between Langfuse and other OpenTelemetry-span-emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
Example:
from langfuse import Langfuse # Initialize the client (reads from env vars if not provided) langfuse = Langfuse( public_key="your-public-key", secret_key="your-secret-key", host="https://cloud.langfuse.com", # Optional, default shown ) # Create a trace span with langfuse.start_as_current_observation(name="process-query") as span: # Your application code here # Create a nested generation span for an LLM call with span.start_as_current_generation( name="generate-response", model="gpt-4", input={"query": "Tell me about AI"}, model_parameters={"temperature": 0.7, "max_tokens": 500} ) as generation: # Generate response here response = "AI is a field of computer science..." generation.update( output=response, usage_details={"prompt_tokens": 10, "completion_tokens": 50}, cost_details={"total_cost": 0.0023} ) # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL) generation.score(name="relevance", value=0.95, data_type="NUMERIC")
226 def __init__( 227 self, 228 *, 229 public_key: Optional[str] = None, 230 secret_key: Optional[str] = None, 231 base_url: Optional[str] = None, 232 host: Optional[str] = None, 233 timeout: Optional[int] = None, 234 httpx_client: Optional[httpx.Client] = None, 235 debug: bool = False, 236 tracing_enabled: Optional[bool] = True, 237 flush_at: Optional[int] = None, 238 flush_interval: Optional[float] = None, 239 environment: Optional[str] = None, 240 release: Optional[str] = None, 241 media_upload_thread_count: Optional[int] = None, 242 sample_rate: Optional[float] = None, 243 mask: Optional[MaskFunction] = None, 244 blocked_instrumentation_scopes: Optional[List[str]] = None, 245 should_export_span: Optional[Callable[[ReadableSpan], bool]] = None, 246 additional_headers: Optional[Dict[str, str]] = None, 247 tracer_provider: Optional[TracerProvider] = None, 248 ): 249 self._base_url = ( 250 base_url 251 or os.environ.get(LANGFUSE_BASE_URL) 252 or host 253 or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com") 254 ) 255 self._environment = environment or cast( 256 str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT) 257 ) 258 self._release = ( 259 release 260 or os.environ.get(LANGFUSE_RELEASE, None) 261 or get_common_release_envs() 262 ) 263 self._project_id: Optional[str] = None 264 sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0)) 265 if not 0.0 <= sample_rate <= 1.0: 266 raise ValueError( 267 f"Sample rate must be between 0.0 and 1.0, got {sample_rate}" 268 ) 269 270 timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5)) 271 272 self._tracing_enabled = ( 273 tracing_enabled 274 and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false" 275 ) 276 if not self._tracing_enabled: 277 langfuse_logger.info( 278 "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API." 
279 ) 280 281 debug = ( 282 debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true") 283 ) 284 if debug: 285 logging.basicConfig( 286 format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 287 ) 288 langfuse_logger.setLevel(logging.DEBUG) 289 290 public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY) 291 if public_key is None: 292 langfuse_logger.warning( 293 "Authentication error: Langfuse client initialized without public_key. Client will be disabled. " 294 "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. " 295 ) 296 self._otel_tracer = otel_trace_api.NoOpTracer() 297 return 298 299 secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY) 300 if secret_key is None: 301 langfuse_logger.warning( 302 "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. " 303 "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. " 304 ) 305 self._otel_tracer = otel_trace_api.NoOpTracer() 306 return 307 308 if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true": 309 langfuse_logger.warning( 310 "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI." 311 ) 312 313 if blocked_instrumentation_scopes is not None: 314 warnings.warn( 315 "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. " 316 "Use `should_export_span` instead. 
Example: " 317 "from langfuse.span_filter import is_default_export_span; " 318 'blocked={"scope"}; should_export_span=lambda span: ' 319 "is_default_export_span(span) and (span.instrumentation_scope is None or " 320 "span.instrumentation_scope.name not in blocked).", 321 DeprecationWarning, 322 stacklevel=2, 323 ) 324 325 # Initialize api and tracer if requirements are met 326 self._resources = LangfuseResourceManager( 327 public_key=public_key, 328 secret_key=secret_key, 329 base_url=self._base_url, 330 timeout=timeout, 331 environment=self._environment, 332 release=release, 333 flush_at=flush_at, 334 flush_interval=flush_interval, 335 httpx_client=httpx_client, 336 media_upload_thread_count=media_upload_thread_count, 337 sample_rate=sample_rate, 338 mask=mask, 339 tracing_enabled=self._tracing_enabled, 340 blocked_instrumentation_scopes=blocked_instrumentation_scopes, 341 should_export_span=should_export_span, 342 additional_headers=additional_headers, 343 tracer_provider=tracer_provider, 344 ) 345 self._mask = self._resources.mask 346 347 self._otel_tracer = ( 348 self._resources.tracer 349 if self._tracing_enabled and self._resources.tracer is not None 350 else otel_trace_api.NoOpTracer() 351 ) 352 self.api = self._resources.api 353 self.async_api = self._resources.async_api
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: ObservationTypeLiteralNoEvent = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> Union[
        LangfuseSpan,
        LangfuseGeneration,
        LangfuseAgent,
        LangfuseTool,
        LangfuseChain,
        LangfuseRetriever,
        LangfuseEvaluator,
        LangfuseEmbedding,
        LangfuseGuardrail,
    ]:
        """Create a new observation of the specified type.

        This method creates a new observation but does not set it as the current span in the
        context. To create and use an observation within a context, use start_as_current_observation().

        Args:
            trace_context: Optional context for connecting to an existing trace
            name: Name of the observation
            as_type: Type of observation to create (defaults to "span")
            input: Input data for the operation
            output: Output data from the operation
            metadata: Additional metadata to associate with the observation
            version: Version identifier for the code or component
            level: Importance level of the observation
            status_message: Optional status message for the observation
            completion_start_time: When the model started generating (for generation types)
            model: Name/identifier of the AI model used (for generation types)
            model_parameters: Parameters used for the model (for generation types)
            usage_details: Token usage information (for generation types)
            cost_details: Cost information (for generation types)
            prompt: Associated prompt template (for generation types)

        Returns:
            An observation object of the appropriate type that must be ended with .end()
        """
        if trace_context:
            trace_id = trace_context.get("trace_id", None)
            parent_span_id = trace_context.get("parent_span_id", None)

            if trace_id:
                # Attach to an existing (remote) trace: build a synthetic
                # parent span carrying the given trace/parent-span ids, then
                # start the new span inside that context so it is parented
                # correctly.
                remote_parent_span = self._create_remote_parent_span(
                    trace_id=trace_id, parent_span_id=parent_span_id
                )

                with otel_trace_api.use_span(
                    cast(otel_trace_api.Span, remote_parent_span)
                ):
                    otel_span = self._otel_tracer.start_span(name=name)
                    # Mark as local root so the backend treats it as the
                    # entry point of this process's portion of the trace.
                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                    return self._create_observation_from_otel_span(
                        otel_span=otel_span,
                        as_type=as_type,
                        input=input,
                        output=output,
                        metadata=metadata,
                        version=version,
                        level=level,
                        status_message=status_message,
                        completion_start_time=completion_start_time,
                        model=model,
                        model_parameters=model_parameters,
                        usage_details=usage_details,
                        cost_details=cost_details,
                        prompt=prompt,
                    )

        # No (usable) trace context: start the span under whatever span is
        # currently active in the OTel context.
        otel_span = self._otel_tracer.start_span(name=name)

        return self._create_observation_from_otel_span(
            otel_span=otel_span,
            as_type=as_type,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
            completion_start_time=completion_start_time,
            model=model,
            model_parameters=model_parameters,
            usage_details=usage_details,
            cost_details=cost_details,
            prompt=prompt,
        )
Create a new observation of the specified type.
This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation
- status_message: Optional status message for the observation
- completion_start_time: When the model started generating (for generation types)
- model: Name/identifier of the AI model used (for generation types)
- model_parameters: Parameters used for the model (for generation types)
- usage_details: Token usage information (for generation types)
- cost_details: Cost information (for generation types)
- prompt: Associated prompt template (for generation types)
Returns:
An observation object of the appropriate type that must be ended with .end()
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: ObservationTypeLiteralNoEvent = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> Union[
        _AgnosticContextManager[LangfuseGeneration],
        _AgnosticContextManager[LangfuseSpan],
        _AgnosticContextManager[LangfuseAgent],
        _AgnosticContextManager[LangfuseTool],
        _AgnosticContextManager[LangfuseChain],
        _AgnosticContextManager[LangfuseRetriever],
        _AgnosticContextManager[LangfuseEvaluator],
        _AgnosticContextManager[LangfuseEmbedding],
        _AgnosticContextManager[LangfuseGuardrail],
    ]:
        """Create a new observation and set it as the current span in a context manager.

        This method creates a new observation of the specified type and sets it as the
        current span within a context manager. Use this method with a 'with' statement to
        automatically handle the observation lifecycle within a code block.

        The created observation will be the child of the current span in the context.

        Args:
            trace_context: Optional context for connecting to an existing trace
            name: Name of the observation (e.g., function or operation name)
            as_type: Type of observation to create (defaults to "span")
            input: Input data for the operation (can be any JSON-serializable object)
            output: Output data from the operation (can be any JSON-serializable object)
            metadata: Additional metadata to associate with the observation
            version: Version identifier for the code or component
            level: Importance level of the observation (info, warning, error)
            status_message: Optional status message for the observation
            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.

        The following parameters are available when as_type is: "generation" or "embedding".
            completion_start_time: When the model started generating the response
            model: Name/identifier of the AI model used (e.g., "gpt-4")
            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
            cost_details: Cost information for the model call
            prompt: Associated prompt template from Langfuse prompt management

        Returns:
            A context manager that yields the appropriate observation type based on as_type

        Example:
            ```python
            # Create a span
            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
                # Do work
                result = process_data()
                span.update(output=result)

                # Create a child span automatically
                with span.start_as_current_observation(name="sub-operation") as child_span:
                    # Do sub-operation work
                    child_span.update(output="sub-result")

            # Create a tool observation
            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
                # Do tool work
                results = search_web(query)
                tool.update(output=results)

            # Create a generation observation
            with langfuse.start_as_current_observation(
                name="answer-generation",
                as_type="generation",
                model="gpt-4"
            ) as generation:
                # Generate answer
                response = llm.generate(...)
                generation.update(output=response)
            ```
        """
        # Generation-like types (e.g. "generation", "embedding") accept the
        # model-related kwargs; span-like types do not, so dispatch first.
        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
            if trace_context:
                trace_id = trace_context.get("trace_id", None)
                parent_span_id = trace_context.get("parent_span_id", None)

                if trace_id:
                    # Parent the new observation under a remote trace.
                    remote_parent_span = self._create_remote_parent_span(
                        trace_id=trace_id, parent_span_id=parent_span_id
                    )

                    return cast(
                        Union[
                            _AgnosticContextManager[LangfuseGeneration],
                            _AgnosticContextManager[LangfuseEmbedding],
                        ],
                        self._create_span_with_parent_context(
                            as_type=as_type,
                            name=name,
                            remote_parent_span=remote_parent_span,
                            parent=None,
                            end_on_exit=end_on_exit,
                            input=input,
                            output=output,
                            metadata=metadata,
                            version=version,
                            level=level,
                            status_message=status_message,
                            completion_start_time=completion_start_time,
                            model=model,
                            model_parameters=model_parameters,
                            usage_details=usage_details,
                            cost_details=cost_details,
                            prompt=prompt,
                        ),
                    )

            return cast(
                Union[
                    _AgnosticContextManager[LangfuseGeneration],
                    _AgnosticContextManager[LangfuseEmbedding],
                ],
                self._start_as_current_otel_span_with_processed_media(
                    as_type=as_type,
                    name=name,
                    end_on_exit=end_on_exit,
                    input=input,
                    output=output,
                    metadata=metadata,
                    version=version,
                    level=level,
                    status_message=status_message,
                    completion_start_time=completion_start_time,
                    model=model,
                    model_parameters=model_parameters,
                    usage_details=usage_details,
                    cost_details=cost_details,
                    prompt=prompt,
                ),
            )

        if as_type in get_observation_types_list(ObservationTypeSpanLike):
            if trace_context:
                trace_id = trace_context.get("trace_id", None)
                parent_span_id = trace_context.get("parent_span_id", None)

                if trace_id:
                    remote_parent_span = self._create_remote_parent_span(
                        trace_id=trace_id, parent_span_id=parent_span_id
                    )

                    return cast(
                        Union[
                            _AgnosticContextManager[LangfuseSpan],
                            _AgnosticContextManager[LangfuseAgent],
                            _AgnosticContextManager[LangfuseTool],
                            _AgnosticContextManager[LangfuseChain],
                            _AgnosticContextManager[LangfuseRetriever],
                            _AgnosticContextManager[LangfuseEvaluator],
                            _AgnosticContextManager[LangfuseGuardrail],
                        ],
                        self._create_span_with_parent_context(
                            as_type=as_type,
                            name=name,
                            remote_parent_span=remote_parent_span,
                            parent=None,
                            end_on_exit=end_on_exit,
                            input=input,
                            output=output,
                            metadata=metadata,
                            version=version,
                            level=level,
                            status_message=status_message,
                        ),
                    )

            return cast(
                Union[
                    _AgnosticContextManager[LangfuseSpan],
                    _AgnosticContextManager[LangfuseAgent],
                    _AgnosticContextManager[LangfuseTool],
                    _AgnosticContextManager[LangfuseChain],
                    _AgnosticContextManager[LangfuseRetriever],
                    _AgnosticContextManager[LangfuseEvaluator],
                    _AgnosticContextManager[LangfuseGuardrail],
                ],
                self._start_as_current_otel_span_with_processed_media(
                    as_type=as_type,
                    name=name,
                    end_on_exit=end_on_exit,
                    input=input,
                    output=output,
                    metadata=metadata,
                    version=version,
                    level=level,
                    status_message=status_message,
                ),
            )

        # This should never be reached since all valid types are handled above
        langfuse_logger.warning(
            f"Unknown observation type: {as_type}, falling back to span"
        )
        return self._start_as_current_otel_span_with_processed_media(
            as_type="span",
            name=name,
            end_on_exit=end_on_exit,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
        )
Create a new observation and set it as the current span in a context manager.
This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.
The created observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation (e.g., function or operation name)
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation (info, warning, error)
- status_message: Optional status message for the observation
- end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
- The following parameters are available when as_type is: "generation" or "embedding".
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Returns:
A context manager that yields the appropriate observation type based on as_type
Example:
# Create a span with langfuse.start_as_current_observation(name="process-query", as_type="span") as span: # Do work result = process_data() span.update(output=result) # Create a child span automatically with span.start_as_current_observation(name="sub-operation") as child_span: # Do sub-operation work child_span.update(output="sub-result") # Create a tool observation with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool: # Do tool work results = search_web(query) tool.update(output=results) # Create a generation observation with langfuse.start_as_current_observation( name="answer-generation", as_type="generation", model="gpt-4" ) as generation: # Generate answer response = llm.generate(...) generation.update(output=response)
1220 def update_current_generation( 1221 self, 1222 *, 1223 name: Optional[str] = None, 1224 input: Optional[Any] = None, 1225 output: Optional[Any] = None, 1226 metadata: Optional[Any] = None, 1227 version: Optional[str] = None, 1228 level: Optional[SpanLevel] = None, 1229 status_message: Optional[str] = None, 1230 completion_start_time: Optional[datetime] = None, 1231 model: Optional[str] = None, 1232 model_parameters: Optional[Dict[str, MapValue]] = None, 1233 usage_details: Optional[Dict[str, int]] = None, 1234 cost_details: Optional[Dict[str, float]] = None, 1235 prompt: Optional[PromptClient] = None, 1236 ) -> None: 1237 """Update the current active generation span with new information. 1238 1239 This method updates the current generation span in the active context with 1240 additional information. It's useful for adding output, usage stats, or other 1241 details that become available during or after model generation. 1242 1243 Args: 1244 name: The generation name 1245 input: Updated input data for the model 1246 output: Output from the model (e.g., completions) 1247 metadata: Additional metadata to associate with the generation 1248 version: Version identifier for the model or component 1249 level: Importance level of the generation (info, warning, error) 1250 status_message: Optional status message for the generation 1251 completion_start_time: When the model started generating the response 1252 model: Name/identifier of the AI model used (e.g., "gpt-4") 1253 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1254 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1255 cost_details: Cost information for the model call 1256 prompt: Associated prompt template from Langfuse prompt management 1257 1258 Example: 1259 ```python 1260 with langfuse.start_as_current_generation(name="answer-query") as generation: 1261 # Initial setup and API call 1262 response = llm.generate(...) 
1263 1264 # Update with results that weren't available at creation time 1265 langfuse.update_current_generation( 1266 output=response.text, 1267 usage_details={ 1268 "prompt_tokens": response.usage.prompt_tokens, 1269 "completion_tokens": response.usage.completion_tokens 1270 } 1271 ) 1272 ``` 1273 """ 1274 if not self._tracing_enabled: 1275 langfuse_logger.debug( 1276 "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode." 1277 ) 1278 return 1279 1280 current_otel_span = self._get_current_otel_span() 1281 1282 if current_otel_span is not None: 1283 generation = LangfuseGeneration( 1284 otel_span=current_otel_span, langfuse_client=self 1285 ) 1286 1287 if name: 1288 current_otel_span.update_name(name) 1289 1290 generation.update( 1291 input=input, 1292 output=output, 1293 metadata=metadata, 1294 version=version, 1295 level=level, 1296 status_message=status_message, 1297 completion_start_time=completion_start_time, 1298 model=model, 1299 model_parameters=model_parameters, 1300 usage_details=usage_details, 1301 cost_details=cost_details, 1302 prompt=prompt, 1303 )
Update the current active generation span with new information.
This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.
Arguments:
- name: The generation name
- input: Updated input data for the model
- output: Output from the model (e.g., completions)
- metadata: Additional metadata to associate with the generation
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation: # Initial setup and API call response = llm.generate(...) # Update with results that weren't available at creation time langfuse.update_current_generation( output=response.text, usage_details={ "prompt_tokens": response.usage.prompt_tokens, "completion_tokens": response.usage.completion_tokens } )
1305 def update_current_span( 1306 self, 1307 *, 1308 name: Optional[str] = None, 1309 input: Optional[Any] = None, 1310 output: Optional[Any] = None, 1311 metadata: Optional[Any] = None, 1312 version: Optional[str] = None, 1313 level: Optional[SpanLevel] = None, 1314 status_message: Optional[str] = None, 1315 ) -> None: 1316 """Update the current active span with new information. 1317 1318 This method updates the current span in the active context with 1319 additional information. It's useful for adding outputs or metadata 1320 that become available during execution. 1321 1322 Args: 1323 name: The span name 1324 input: Updated input data for the operation 1325 output: Output data from the operation 1326 metadata: Additional metadata to associate with the span 1327 version: Version identifier for the code or component 1328 level: Importance level of the span (info, warning, error) 1329 status_message: Optional status message for the span 1330 1331 Example: 1332 ```python 1333 with langfuse.start_as_current_observation(name="process-data") as span: 1334 # Initial processing 1335 result = process_first_part() 1336 1337 # Update with intermediate results 1338 langfuse.update_current_span(metadata={"intermediate_result": result}) 1339 1340 # Continue processing 1341 final_result = process_second_part(result) 1342 1343 # Final update 1344 langfuse.update_current_span(output=final_result) 1345 ``` 1346 """ 1347 if not self._tracing_enabled: 1348 langfuse_logger.debug( 1349 "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode." 
1350 ) 1351 return 1352 1353 current_otel_span = self._get_current_otel_span() 1354 1355 if current_otel_span is not None: 1356 span = LangfuseSpan( 1357 otel_span=current_otel_span, 1358 langfuse_client=self, 1359 environment=self._environment, 1360 release=self._release, 1361 ) 1362 1363 if name: 1364 current_otel_span.update_name(name) 1365 1366 span.update( 1367 input=input, 1368 output=output, 1369 metadata=metadata, 1370 version=version, 1371 level=level, 1372 status_message=status_message, 1373 )
Update the current active span with new information.
This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.
Arguments:
- name: The span name
- input: Updated input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Example:
with langfuse.start_as_current_observation(name="process-data") as span: # Initial processing result = process_first_part() # Update with intermediate results langfuse.update_current_span(metadata={"intermediate_result": result}) # Continue processing final_result = process_second_part(result) # Final update langfuse.update_current_span(output=final_result)
1375 @deprecated( 1376 "Trace-level input/output is deprecated. " 1377 "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. " 1378 "This method will be removed in a future major version." 1379 ) 1380 def set_current_trace_io( 1381 self, 1382 *, 1383 input: Optional[Any] = None, 1384 output: Optional[Any] = None, 1385 ) -> None: 1386 """Set trace-level input and output for the current span's trace. 1387 1388 .. deprecated:: 1389 This is a legacy method for backward compatibility with Langfuse platform 1390 features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge 1391 evaluators). It will be removed in a future major version. 1392 1393 For setting other trace attributes (user_id, session_id, metadata, tags, version), 1394 use :meth:`propagate_attributes` instead. 1395 1396 Args: 1397 input: Input data to associate with the trace. 1398 output: Output data to associate with the trace. 1399 """ 1400 if not self._tracing_enabled: 1401 langfuse_logger.debug( 1402 "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode." 1403 ) 1404 return 1405 1406 current_otel_span = self._get_current_otel_span() 1407 1408 if current_otel_span is not None and current_otel_span.is_recording(): 1409 existing_observation_type = current_otel_span.attributes.get( # type: ignore[attr-defined] 1410 LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span" 1411 ) 1412 # We need to preserve the class to keep the correct observation type 1413 span_class = self._get_span_class(existing_observation_type) 1414 span = span_class( 1415 otel_span=current_otel_span, 1416 langfuse_client=self, 1417 environment=self._environment, 1418 release=self._release, 1419 ) 1420 1421 span.set_trace_io( 1422 input=input, 1423 output=output, 1424 )
Set trace-level input and output for the current span's trace.
Deprecated: This is a legacy method for backward compatibility with Langfuse platform features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future major version.
For setting other trace attributes (user_id, session_id, metadata, tags, version),
use propagate_attributes() instead.
Arguments:
- input: Input data to associate with the trace.
- output: Output data to associate with the trace.
def set_current_trace_as_public(self) -> None:
    """Make the current trace publicly accessible via its URL.

    When a trace is published, anyone with the trace link can view the full trace
    without needing to be logged in to Langfuse. This action cannot be undone
    programmatically - once published, the entire trace becomes public.

    This is a convenience method that publishes the trace from the currently
    active span context. Use this when you want to make a trace public from
    within a traced function without needing direct access to the span object.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
        )
        return

    current_otel_span = self._get_current_otel_span()

    if current_otel_span is not None and current_otel_span.is_recording():
        existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
            LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
        )
        # We need to preserve the class to keep the correct observation type
        span_class = self._get_span_class(existing_observation_type)
        span = span_class(
            otel_span=current_otel_span,
            langfuse_client=self,
            environment=self._environment,
            # Fix: pass the client's release, consistent with the sibling
            # set_current_trace_io, which constructs the wrapper the same way.
            release=self._release,
        )

        span.set_trace_as_public()
Make the current trace publicly accessible via its URL.
When a trace is published, anyone with the trace link can view the full trace without needing to be logged in to Langfuse. This action cannot be undone programmatically - once published, the entire trace becomes public.
This is a convenience method that publishes the trace from the currently active span context. Use this when you want to make a trace public from within a traced function without needing direct access to the span object.
def create_event(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> LangfuseEvent:
    """Create a new Langfuse observation of type 'EVENT'.

    The created event observation becomes a child of the current span in the
    context (or of a remote parent, when a trace context is supplied).

    Args:
        trace_context: Optional context for connecting to an existing trace
        name: Name of the span (e.g., function or operation name)
        input: Input data for the operation (can be any JSON-serializable object)
        output: Output data from the operation (can be any JSON-serializable object)
        metadata: Additional metadata to associate with the span
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span

    Returns:
        The Langfuse Event object

    Example:
        ```python
        event = langfuse.create_event(name="process-event")
        ```
    """
    start_time = time_ns()

    def _wrap_and_close(raw_span: otel_trace_api.Span) -> LangfuseEvent:
        # Events are point-in-time observations: wrap the raw OTel span with
        # Langfuse semantics and end it immediately at the same timestamp.
        return cast(
            LangfuseEvent,
            LangfuseEvent(
                otel_span=raw_span,
                langfuse_client=self,
                environment=self._environment,
                release=self._release,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
            ).end(end_time=start_time),
        )

    if trace_context:
        trace_id = trace_context.get("trace_id", None)
        parent_span_id = trace_context.get("parent_span_id", None)

        if trace_id:
            remote_parent_span = self._create_remote_parent_span(
                trace_id=trace_id, parent_span_id=parent_span_id
            )

            with otel_trace_api.use_span(
                cast(otel_trace_api.Span, remote_parent_span)
            ):
                otel_span = self._otel_tracer.start_span(
                    name=name, start_time=start_time
                )
                otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                return _wrap_and_close(otel_span)

    return _wrap_and_close(
        self._otel_tracer.start_span(name=name, start_time=start_time)
    )
Create a new Langfuse observation of type 'EVENT'.
The created Langfuse Event observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the span (e.g., function or operation name)
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Returns:
The Langfuse Event object
Example:
event = langfuse.create_event(name="process-event")
1633 @staticmethod 1634 def create_trace_id(*, seed: Optional[str] = None) -> str: 1635 """Create a unique trace ID for use with Langfuse. 1636 1637 This method generates a unique trace ID for use with various Langfuse APIs. 1638 It can either generate a random ID or create a deterministic ID based on 1639 a seed string. 1640 1641 Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. 1642 This method ensures the generated ID meets this requirement. If you need to 1643 correlate an external ID with a Langfuse trace ID, use the external ID as the 1644 seed to get a valid, deterministic Langfuse trace ID. 1645 1646 Args: 1647 seed: Optional string to use as a seed for deterministic ID generation. 1648 If provided, the same seed will always produce the same ID. 1649 If not provided, a random ID will be generated. 1650 1651 Returns: 1652 A 32-character lowercase hexadecimal string representing the Langfuse trace ID. 1653 1654 Example: 1655 ```python 1656 # Generate a random trace ID 1657 trace_id = langfuse.create_trace_id() 1658 1659 # Generate a deterministic ID based on a seed 1660 session_trace_id = langfuse.create_trace_id(seed="session-456") 1661 1662 # Correlate an external ID with a Langfuse trace ID 1663 external_id = "external-system-123456" 1664 correlated_trace_id = langfuse.create_trace_id(seed=external_id) 1665 1666 # Use the ID with trace context 1667 with langfuse.start_as_current_observation( 1668 name="process-request", 1669 trace_context={"trace_id": trace_id} 1670 ) as span: 1671 # Operation will be part of the specific trace 1672 pass 1673 ``` 1674 """ 1675 if not seed: 1676 trace_id_int = RandomIdGenerator().generate_trace_id() 1677 1678 return Langfuse._format_otel_trace_id(trace_id_int) 1679 1680 return sha256(seed.encode("utf-8")).digest()[:16].hex()
Create a unique trace ID for use with Langfuse.
This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.
Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.
Arguments:
- seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:
A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
Example:
# Generate a random trace ID trace_id = langfuse.create_trace_id() # Generate a deterministic ID based on a seed session_trace_id = langfuse.create_trace_id(seed="session-456") # Correlate an external ID with a Langfuse trace ID external_id = "external-system-123456" correlated_trace_id = langfuse.create_trace_id(seed=external_id) # Use the ID with trace context with langfuse.start_as_current_observation( name="process-request", trace_context={"trace_id": trace_id} ) as span: # Operation will be part of the specific trace pass
def create_score(
    self,
    *,
    name: str,
    value: Union[float, str],
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
) -> None:
    """Create a score for a specific trace or observation.

    Scores evaluate a Langfuse trace or observation and can track quality
    metrics, user feedback, or automated evaluations.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (numeric for NUMERIC/BOOLEAN types, string for CATEGORICAL)
        session_id: ID of the Langfuse session to associate the score with
        dataset_run_id: ID of the Langfuse dataset run to associate the score with
        trace_id: ID of the Langfuse trace to associate the score with
        observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score
        timestamp: Optional timestamp for the score (defaults to current UTC time)

    Example:
        ```python
        langfuse.create_score(
            name="accuracy",
            value=0.92,
            trace_id="abcdef1234567890abcdef1234567890",
            data_type="NUMERIC",
        )
        ```
    """
    if not self._tracing_enabled:
        return

    resolved_score_id = score_id or self._create_observation_id()

    try:
        body = ScoreBody(
            id=resolved_score_id,
            session_id=session_id,
            datasetRunId=dataset_run_id,
            traceId=trace_id,
            observationId=observation_id,
            name=name,
            value=value,
            dataType=data_type,  # type: ignore
            comment=comment,
            configId=config_id,
            environment=self._environment,
            metadata=metadata,
        )

        score_event = {
            "id": self.create_trace_id(),
            "type": "score-create",
            "timestamp": timestamp or _get_timestamp(),
            "body": body,
        }

        if self._resources is not None:
            # Force the score into the sample when it targets a legacy
            # (non-32-hexchar) trace ID, or when no trace ID was given.
            if trace_id:
                force_sample = not self._is_valid_trace_id(trace_id)
            else:
                force_sample = True

            self._resources.add_score_task(
                score_event,
                force_sample=force_sample,
            )

    except Exception as e:
        langfuse_logger.exception(
            f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
        )
Create a score for a specific trace or observation.
This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
- session_id: ID of the Langfuse session to associate the score with
- dataset_run_id: ID of the Langfuse dataset run to associate the score with
- trace_id: ID of the Langfuse trace to associate the score with
- observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
- timestamp: Optional timestamp for the score (defaults to current UTC time)
Example:
# Create a numeric score for accuracy langfuse.create_score( name="accuracy", value=0.92, trace_id="abcdef1234567890abcdef1234567890", data_type="NUMERIC", comment="High accuracy with minor irrelevant details" ) # Create a categorical score for sentiment langfuse.create_score( name="sentiment", value="positive", trace_id="abcdef1234567890abcdef1234567890", observation_id="abcdef1234567890", data_type="CATEGORICAL" )
def score_current_span(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Create a score for the currently active span.

    Convenience wrapper around :meth:`create_score` that resolves the trace
    and observation IDs from the active span context, so the current operation
    can be scored without knowing its IDs.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (numeric for NUMERIC/BOOLEAN types, string for CATEGORICAL)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_generation(name="answer-query") as generation:
            response = generate_answer(...)
            generation.update(output=response)

            langfuse.score_current_span(
                name="relevance",
                value=0.85,
                data_type="NUMERIC",
            )
        ```
    """
    otel_span = self._get_current_otel_span()
    if otel_span is None:
        return

    trace_id = self._get_otel_trace_id(otel_span)
    observation_id = self._get_otel_span_id(otel_span)

    langfuse_logger.info(
        f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
    )

    self.create_score(
        trace_id=trace_id,
        observation_id=observation_id,
        name=name,
        value=cast(str, value),
        score_id=score_id,
        data_type=cast(Literal["CATEGORICAL"], data_type),
        comment=comment,
        config_id=config_id,
        metadata=metadata,
    )
Create a score for the current active span.
This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation: # Generate answer response = generate_answer(...) generation.update(output=response) # Score the generation langfuse.score_current_span( name="relevance", value=0.85, data_type="NUMERIC", comment="Mostly relevant but contains some tangential information", metadata={"model": "gpt-4", "prompt_version": "v2"} )
def score_current_trace(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Create a score for the current trace.

    Unlike :meth:`score_current_span`, this associates the score with the
    entire trace of the currently active span rather than a single observation —
    useful for scoring the overall quality of the whole operation.

    Args:
        name: Name of the score (e.g., "user_satisfaction", "overall_quality")
        value: Score value (numeric for NUMERIC/BOOLEAN types, string for CATEGORICAL)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-user-request") as span:
            result = process_complete_request()
            span.update(output=result)

            langfuse.score_current_trace(
                name="overall_quality",
                value=0.95,
                data_type="NUMERIC",
            )
        ```
    """
    otel_span = self._get_current_otel_span()
    if otel_span is None:
        return

    trace_id = self._get_otel_trace_id(otel_span)

    langfuse_logger.info(
        f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
    )

    self.create_score(
        trace_id=trace_id,
        name=name,
        value=cast(str, value),
        score_id=score_id,
        data_type=cast(Literal["CATEGORICAL"], data_type),
        comment=comment,
        config_id=config_id,
        metadata=metadata,
    )
Create a score for the current trace.
This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation.
Arguments:
- name: Name of the score (e.g., "user_satisfaction", "overall_quality")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_observation(name="process-user-request") as span: # Process request result = process_complete_request() span.update(output=result) # Score the overall trace langfuse.score_current_trace( name="overall_quality", value=0.95, data_type="NUMERIC", comment="High quality end-to-end response", metadata={"evaluator": "gpt-4", "criteria": "comprehensive"} )
def flush(self) -> None:
    """Force flush all pending spans and events to the Langfuse API.

    Manually pushes any buffered spans, scores, and other events to the
    Langfuse API instead of waiting for the automatic flush interval. Useful
    when all data must be sent before the program proceeds.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="operation") as span:
            pass

        # Ensure all data is sent to Langfuse before proceeding
        langfuse.flush()
        ```
    """
    resources = self._resources
    if resources is None:
        return

    resources.flush()
Force flush all pending spans and events to the Langfuse API.
This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.
Example:
# Record some spans and scores with langfuse.start_as_current_observation(name="operation") as span: # Do work... pass # Ensure all data is sent to Langfuse before proceeding langfuse.flush() # Continue with other work
def shutdown(self) -> None:
    """Shut down the Langfuse client and flush all pending data.

    Cleanly terminates the client: pending data is flushed to the API and
    background threads are stopped. Call this when your application exits to
    avoid data loss and resource leaks; for most applications, using the
    client as a context manager or relying on the automatic atexit shutdown
    is sufficient.

    Example:
        ```python
        langfuse = Langfuse(public_key="...", secret_key="...")
        # ... use Langfuse throughout your application ...
        langfuse.shutdown()
        ```
    """
    resources = self._resources
    if resources is None:
        return

    resources.shutdown()
Shut down the Langfuse client and flush all pending data.
This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.
It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.
Example:
# Initialize Langfuse langfuse = Langfuse(public_key="...", secret_key="...") # Use Langfuse throughout your application # ... # When application is shutting down langfuse.shutdown()
def get_current_trace_id(self) -> Optional[str]:
    """Get the trace ID of the current active span.

    Reads the trace ID from the currently active span in the context, e.g.
    for referencing in logs, external systems, or related operations.

    Returns:
        The current trace ID as a 32-character lowercase hexadecimal string,
        or None if there is no active span.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-request") as span:
            trace_id = langfuse.get_current_trace_id()
            log.info(f"Processing request with trace_id: {trace_id}")
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
        )
        return None

    active_span = self._get_current_otel_span()
    if active_span is None:
        return None

    return self._get_otel_trace_id(active_span)
Get the trace ID of the current active span.
This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.
Returns:
The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.
Example:
with langfuse.start_as_current_observation(name="process-request") as span: # Get the current trace ID for reference trace_id = langfuse.get_current_trace_id() # Use it for external correlation log.info(f"Processing request with trace_id: {trace_id}") # Or pass to another system external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
    """Get the observation ID (span ID) of the current active span.

    Reads the observation ID from the currently active span in the context,
    e.g. for referencing in logs, external systems, or for creating scores.

    Returns:
        The current observation ID as a 16-character lowercase hexadecimal string,
        or None if there is no active span.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-user-query") as span:
            observation_id = langfuse.get_current_observation_id()
            cache.set(f"query_{query_id}_observation", observation_id)
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
        )
        return None

    active_span = self._get_current_otel_span()
    if active_span is None:
        return None

    return self._get_otel_span_id(active_span)
Get the observation ID (span ID) of the current active span.
This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.
Returns:
The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.
Example:
with langfuse.start_as_current_observation(name="process-user-query") as span: # Get the current observation ID observation_id = langfuse.get_current_observation_id() # Store it for later reference cache.set(f"query_{query_id}_observation", observation_id) # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
    """Get the URL to view a trace in the Langfuse UI.

    Builds a link directly to a trace in the Langfuse UI — handy for logs,
    notifications, or debugging tools.

    Args:
        trace_id: Optional trace ID to generate a URL for. If not provided,
            the trace ID of the current active span will be used.

    Returns:
        A URL string pointing to the trace in the Langfuse UI, or None if the
        project ID couldn't be retrieved or no trace ID is available.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-request") as span:
            trace_url = langfuse.get_trace_url()
            log.info(f"Processing trace: {trace_url}")
        ```
    """
    resolved_trace_id = trace_id or self.get_current_trace_id()
    if not resolved_trace_id:
        return None

    project_id = self._get_project_id()
    if not project_id:
        return None

    return f"{self._base_url}/project/{project_id}/traces/{resolved_trace_id}"
Get the URL to view a trace in the Langfuse UI.
This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.
Arguments:
- trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:
A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.
Example:
# Get URL for the current trace with langfuse.start_as_current_observation(name="process-request") as span: trace_url = langfuse.get_trace_url() log.info(f"Processing trace: {trace_url}") # Get URL for a specific trace specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef") send_notification(f"Review needed for trace: {specific_trace_url}")
def get_dataset(
    self,
    name: str,
    *,
    fetch_items_page_size: Optional[int] = 50,
    version: Optional[datetime] = None,
) -> "DatasetClient":
    """Fetch a dataset by its name.

    Args:
        name (str): The name of the dataset to fetch.
        fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
        version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC).
            If provided, returns the state of items at the specified UTC timestamp.
            If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.

    Returns:
        DatasetClient: The dataset with the given name.
    """
    try:
        langfuse_logger.debug(f"Getting datasets {name}")
        dataset = self.api.datasets.get(dataset_name=self._url_encode(name))

        # Page through all items; the API reports total_pages so we stop as
        # soon as the current page is the last one.
        items = []
        current_page = 1
        while True:
            page_result = self.api.dataset_items.list(
                dataset_name=self._url_encode(name, is_url_param=True),
                page=current_page,
                limit=fetch_items_page_size,
                version=version,
            )
            items.extend(page_result.data)

            if page_result.meta.total_pages <= current_page:
                break

            current_page += 1

        return DatasetClient(
            dataset=dataset,
            items=items,
            version=version,
            langfuse_client=self,
        )

    except Error as e:
        handle_fern_exception(e)
        raise e
Fetch a dataset by its name.
Arguments:
- name (str): The name of the dataset to fetch.
- fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
- version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC). If provided, returns the state of items at the specified UTC timestamp. If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
Returns:
DatasetClient: The dataset with the given name.
def get_dataset_run(
    self, *, dataset_name: str, run_name: str
) -> DatasetRunWithItems:
    """Fetch a dataset run by dataset name and run name.

    Args:
        dataset_name (str): The name of the dataset.
        run_name (str): The name of the run.

    Returns:
        DatasetRunWithItems: The dataset run with its items.
    """
    try:
        response = self.api.datasets.get_run(
            dataset_name=self._url_encode(dataset_name),
            run_name=self._url_encode(run_name),
            request_options=None,
        )
        return cast(DatasetRunWithItems, response)
    except Error as e:
        handle_fern_exception(e)
        raise e
Fetch a dataset run by dataset name and run name.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DatasetRunWithItems: The dataset run with its items.
def get_dataset_runs(
    self,
    *,
    dataset_name: str,
    page: Optional[int] = None,
    limit: Optional[int] = None,
) -> PaginatedDatasetRuns:
    """Fetch all runs for a dataset.

    Args:
        dataset_name (str): The name of the dataset.
        page (Optional[int]): Page number, starts at 1.
        limit (Optional[int]): Limit of items per page.

    Returns:
        PaginatedDatasetRuns: Paginated list of dataset runs.
    """
    try:
        response = self.api.datasets.get_runs(
            dataset_name=self._url_encode(dataset_name),
            page=page,
            limit=limit,
            request_options=None,
        )
        return cast(PaginatedDatasetRuns, response)
    except Error as e:
        handle_fern_exception(e)
        raise e
Fetch all runs for a dataset.
Arguments:
- dataset_name (str): The name of the dataset.
- page (Optional[int]): Page number, starts at 1.
- limit (Optional[int]): Limit of items per page.
Returns:
PaginatedDatasetRuns: Paginated list of dataset runs.
2341 def delete_dataset_run( 2342 self, *, dataset_name: str, run_name: str 2343 ) -> DeleteDatasetRunResponse: 2344 """Delete a dataset run and all its run items. This action is irreversible. 2345 2346 Args: 2347 dataset_name (str): The name of the dataset. 2348 run_name (str): The name of the run. 2349 2350 Returns: 2351 DeleteDatasetRunResponse: Confirmation of deletion. 2352 """ 2353 try: 2354 return cast( 2355 DeleteDatasetRunResponse, 2356 self.api.datasets.delete_run( 2357 dataset_name=self._url_encode(dataset_name), 2358 run_name=self._url_encode(run_name), 2359 request_options=None, 2360 ), 2361 ) 2362 except Error as e: 2363 handle_fern_exception(e) 2364 raise e
Delete a dataset run and all its run items. This action is irreversible.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DeleteDatasetRunResponse: Confirmation of deletion.
2366 def run_experiment( 2367 self, 2368 *, 2369 name: str, 2370 run_name: Optional[str] = None, 2371 description: Optional[str] = None, 2372 data: ExperimentData, 2373 task: TaskFunction, 2374 evaluators: List[EvaluatorFunction] = [], 2375 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 2376 run_evaluators: List[RunEvaluatorFunction] = [], 2377 max_concurrency: int = 50, 2378 metadata: Optional[Dict[str, str]] = None, 2379 _dataset_version: Optional[datetime] = None, 2380 ) -> ExperimentResult: 2381 """Run an experiment on a dataset with automatic tracing and evaluation. 2382 2383 This method executes a task function on each item in the provided dataset, 2384 automatically traces all executions with Langfuse for observability, runs 2385 item-level and run-level evaluators on the outputs, and returns comprehensive 2386 results with evaluation metrics. 2387 2388 The experiment system provides: 2389 - Automatic tracing of all task executions 2390 - Concurrent processing with configurable limits 2391 - Comprehensive error handling that isolates failures 2392 - Integration with Langfuse datasets for experiment tracking 2393 - Flexible evaluation framework supporting both sync and async evaluators 2394 2395 Args: 2396 name: Human-readable name for the experiment. Used for identification 2397 in the Langfuse UI. 2398 run_name: Optional exact name for the experiment run. If provided, this will be 2399 used as the exact dataset run name if the `data` contains Langfuse dataset items. 2400 If not provided, this will default to the experiment name appended with an ISO timestamp. 2401 description: Optional description explaining the experiment's purpose, 2402 methodology, or expected outcomes. 2403 data: Array of data items to process. Can be either: 2404 - List of dict-like items with 'input', 'expected_output', 'metadata' keys 2405 - List of Langfuse DatasetItem objects from dataset.items 2406 task: Function that processes each data item and returns output. 
2407 Must accept 'item' as keyword argument and can return sync or async results. 2408 The task function signature should be: task(*, item, **kwargs) -> Any 2409 evaluators: List of functions to evaluate each item's output individually. 2410 Each evaluator receives input, output, expected_output, and metadata. 2411 Can return single Evaluation dict or list of Evaluation dicts. 2412 composite_evaluator: Optional function that creates composite scores from item-level evaluations. 2413 Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) 2414 plus the list of evaluations from item-level evaluators. Useful for weighted averages, 2415 pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics. 2416 run_evaluators: List of functions to evaluate the entire experiment run. 2417 Each run evaluator receives all item_results and can compute aggregate metrics. 2418 Useful for calculating averages, distributions, or cross-item comparisons. 2419 max_concurrency: Maximum number of concurrent task executions (default: 50). 2420 Controls the number of items processed simultaneously. Adjust based on 2421 API rate limits and system resources. 2422 metadata: Optional metadata dictionary to attach to all experiment traces. 2423 This metadata will be included in every trace created during the experiment. 2424 If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too. 2425 2426 Returns: 2427 ExperimentResult containing: 2428 - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset. 
2429 - item_results: List of results for each processed item with outputs and evaluations 2430 - run_evaluations: List of aggregate evaluation results for the entire run 2431 - dataset_run_id: ID of the dataset run (if using Langfuse datasets) 2432 - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable) 2433 2434 Raises: 2435 ValueError: If required parameters are missing or invalid 2436 Exception: If experiment setup fails (individual item failures are handled gracefully) 2437 2438 Examples: 2439 Basic experiment with local data: 2440 ```python 2441 def summarize_text(*, item, **kwargs): 2442 return f"Summary: {item['input'][:50]}..." 2443 2444 def length_evaluator(*, input, output, expected_output=None, **kwargs): 2445 return { 2446 "name": "output_length", 2447 "value": len(output), 2448 "comment": f"Output contains {len(output)} characters" 2449 } 2450 2451 result = langfuse.run_experiment( 2452 name="Text Summarization Test", 2453 description="Evaluate summarization quality and length", 2454 data=[ 2455 {"input": "Long article text...", "expected_output": "Expected summary"}, 2456 {"input": "Another article...", "expected_output": "Another summary"} 2457 ], 2458 task=summarize_text, 2459 evaluators=[length_evaluator] 2460 ) 2461 2462 print(f"Processed {len(result.item_results)} items") 2463 for item_result in result.item_results: 2464 print(f"Input: {item_result.item['input']}") 2465 print(f"Output: {item_result.output}") 2466 print(f"Evaluations: {item_result.evaluations}") 2467 ``` 2468 2469 Advanced experiment with async task and multiple evaluators: 2470 ```python 2471 async def llm_task(*, item, **kwargs): 2472 # Simulate async LLM call 2473 response = await openai_client.chat.completions.create( 2474 model="gpt-4", 2475 messages=[{"role": "user", "content": item["input"]}] 2476 ) 2477 return response.choices[0].message.content 2478 2479 def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): 2480 if expected_output 
and expected_output.lower() in output.lower(): 2481 return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} 2482 return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} 2483 2484 def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): 2485 # Simulate toxicity check 2486 toxicity_score = check_toxicity(output) # Your toxicity checker 2487 return { 2488 "name": "toxicity", 2489 "value": toxicity_score, 2490 "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" 2491 } 2492 2493 def average_accuracy(*, item_results, **kwargs): 2494 accuracies = [ 2495 eval.value for result in item_results 2496 for eval in result.evaluations 2497 if eval.name == "accuracy" 2498 ] 2499 return { 2500 "name": "average_accuracy", 2501 "value": sum(accuracies) / len(accuracies) if accuracies else 0, 2502 "comment": f"Average accuracy across {len(accuracies)} items" 2503 } 2504 2505 result = langfuse.run_experiment( 2506 name="LLM Safety and Accuracy Test", 2507 description="Evaluate model accuracy and safety across diverse prompts", 2508 data=test_dataset, # Your dataset items 2509 task=llm_task, 2510 evaluators=[accuracy_evaluator, toxicity_evaluator], 2511 run_evaluators=[average_accuracy], 2512 max_concurrency=5, # Limit concurrent API calls 2513 metadata={"model": "gpt-4", "temperature": 0.7} 2514 ) 2515 ``` 2516 2517 Using with Langfuse datasets: 2518 ```python 2519 # Get dataset from Langfuse 2520 dataset = langfuse.get_dataset("my-eval-dataset") 2521 2522 result = dataset.run_experiment( 2523 name="Production Model Evaluation", 2524 description="Monthly evaluation of production model performance", 2525 task=my_production_task, 2526 evaluators=[accuracy_evaluator, latency_evaluator] 2527 ) 2528 2529 # Results automatically linked to dataset in Langfuse UI 2530 print(f"View results: {result['dataset_run_url']}") 2531 ``` 2532 2533 Note: 2534 - Task and evaluator functions can be either synchronous or asynchronous 2535 
- Individual item failures are logged but don't stop the experiment 2536 - All executions are automatically traced and visible in Langfuse UI 2537 - When using Langfuse datasets, results are automatically linked for easy comparison 2538 - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) 2539 - Async execution is handled automatically with smart event loop detection 2540 """ 2541 return cast( 2542 ExperimentResult, 2543 run_async_safely( 2544 self._run_experiment_async( 2545 name=name, 2546 run_name=self._create_experiment_run_name( 2547 name=name, run_name=run_name 2548 ), 2549 description=description, 2550 data=data, 2551 task=task, 2552 evaluators=evaluators or [], 2553 composite_evaluator=composite_evaluator, 2554 run_evaluators=run_evaluators or [], 2555 max_concurrency=max_concurrency, 2556 metadata=metadata, 2557 dataset_version=_dataset_version, 2558 ), 2559 ), 2560 )
Run an experiment on a dataset with automatic tracing and evaluation.
This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.
The experiment system provides:
- Automatic tracing of all task executions
- Concurrent processing with configurable limits
- Comprehensive error handling that isolates failures
- Integration with Langfuse datasets for experiment tracking
- Flexible evaluation framework supporting both sync and async evaluators
Arguments:
- name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
- run_name: Optional exact name for the experiment run. If provided, this will be
used as the exact dataset run name if the `data` contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
- description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
- data: Array of data items to process. Can be either:
- List of dict-like items with 'input', 'expected_output', 'metadata' keys
- List of Langfuse DatasetItem objects from dataset.items
- task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
- evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
- composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
- run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
- max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
- metadata: Optional metadata dictionary to attach to all experiment traces.
This metadata will be included in every trace created during the experiment.
If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:
ExperimentResult containing:
- run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
- item_results: List of results for each processed item with outputs and evaluations
- run_evaluations: List of aggregate evaluation results for the entire run
- dataset_run_id: ID of the dataset run (if using Langfuse datasets)
- dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
- ValueError: If required parameters are missing or invalid
- Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:
Basic experiment with local data:
def summarize_text(*, item, **kwargs): return f"Summary: {item['input'][:50]}..." def length_evaluator(*, input, output, expected_output=None, **kwargs): return { "name": "output_length", "value": len(output), "comment": f"Output contains {len(output)} characters" } result = langfuse.run_experiment( name="Text Summarization Test", description="Evaluate summarization quality and length", data=[ {"input": "Long article text...", "expected_output": "Expected summary"}, {"input": "Another article...", "expected_output": "Another summary"} ], task=summarize_text, evaluators=[length_evaluator] ) print(f"Processed {len(result.item_results)} items") for item_result in result.item_results: print(f"Input: {item_result.item['input']}") print(f"Output: {item_result.output}") print(f"Evaluations: {item_result.evaluations}")Advanced experiment with async task and multiple evaluators:
async def llm_task(*, item, **kwargs): # Simulate async LLM call response = await openai_client.chat.completions.create( model="gpt-4", messages=[{"role": "user", "content": item["input"]}] ) return response.choices[0].message.content def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): if expected_output and expected_output.lower() in output.lower(): return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): # Simulate toxicity check toxicity_score = check_toxicity(output) # Your toxicity checker return { "name": "toxicity", "value": toxicity_score, "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" } def average_accuracy(*, item_results, **kwargs): accuracies = [ eval.value for result in item_results for eval in result.evaluations if eval.name == "accuracy" ] return { "name": "average_accuracy", "value": sum(accuracies) / len(accuracies) if accuracies else 0, "comment": f"Average accuracy across {len(accuracies)} items" } result = langfuse.run_experiment( name="LLM Safety and Accuracy Test", description="Evaluate model accuracy and safety across diverse prompts", data=test_dataset, # Your dataset items task=llm_task, evaluators=[accuracy_evaluator, toxicity_evaluator], run_evaluators=[average_accuracy], max_concurrency=5, # Limit concurrent API calls metadata={"model": "gpt-4", "temperature": 0.7} )Using with Langfuse datasets:
# Get dataset from Langfuse dataset = langfuse.get_dataset("my-eval-dataset") result = dataset.run_experiment( name="Production Model Evaluation", description="Monthly evaluation of production model performance", task=my_production_task, evaluators=[accuracy_evaluator, latency_evaluator] ) # Results automatically linked to dataset in Langfuse UI print(f"View results: {result['dataset_run_url']}")
Note:
- Task and evaluator functions can be either synchronous or asynchronous
- Individual item failures are logged but don't stop the experiment
- All executions are automatically traced and visible in Langfuse UI
- When using Langfuse datasets, results are automatically linked for easy comparison
- This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
- Async execution is handled automatically with smart event loop detection
    def run_batched_evaluation(
        self,
        *,
        scope: Literal["traces", "observations"],
        mapper: MapperFunction,
        filter: Optional[str] = None,
        fetch_batch_size: int = 50,
        fetch_trace_fields: Optional[str] = None,
        max_items: Optional[int] = None,
        max_retries: int = 3,
        evaluators: List[EvaluatorFunction],
        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
        max_concurrency: int = 5,
        metadata: Optional[Dict[str, Any]] = None,
        _add_observation_scores_to_trace: bool = False,
        _additional_trace_tags: Optional[List[str]] = None,
        resume_from: Optional[BatchEvaluationResumeToken] = None,
        verbose: bool = False,
    ) -> BatchEvaluationResult:
        """Fetch traces or observations and run evaluations on each item.

        This method provides a powerful way to evaluate existing data in Langfuse at scale.
        It fetches items based on filters, transforms them using a mapper function, runs
        evaluators on each item, and creates scores that are linked back to the original
        entities. This is ideal for:

        - Running evaluations on production traces after deployment
        - Backtesting new evaluation metrics on historical data
        - Batch scoring of observations for quality monitoring
        - Periodic evaluation runs on recent data

        The method uses a streaming/pipeline approach to process items in batches, making
        it memory-efficient for large datasets. It includes comprehensive error handling,
        retry logic, and resume capability for long-running evaluations.

        Args:
            scope: The type of items to evaluate. Must be one of:
                - "traces": Evaluate complete traces with all their observations
                - "observations": Evaluate individual observations (spans, generations, events)
            mapper: Function that transforms API response objects into evaluator inputs.
                Receives a trace/observation object and returns an EvaluatorInputs
                instance with input, output, expected_output, and metadata fields.
                Can be sync or async.
            evaluators: List of evaluation functions to run on each item. Each evaluator
                receives the mapped inputs and returns Evaluation object(s). Evaluator
                failures are logged but don't stop the batch evaluation.
            filter: Optional JSON filter string for querying items (same format as
                Langfuse API). Examples:
                - '{"tags": ["production"]}'
                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
                Default: None (fetches all items).
            fetch_batch_size: Number of items to fetch per API call and hold in memory.
                Larger values may be faster but use more memory. Default: 50.
            fetch_trace_fields: Comma-separated list of fields to include when fetching
                traces. Available field groups: 'core' (always included), 'io' (input,
                output, metadata), 'scores', 'observations', 'metrics'. If not specified,
                all fields are returned. Example: 'core,scores,metrics'. Note: Excluded
                'observations' or 'scores' fields return empty arrays; excluded 'metrics'
                returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
            max_items: Maximum total number of items to process. If None, processes all
                items matching the filter. Useful for testing or limiting evaluation runs.
                Default: None (process all).
            max_concurrency: Maximum number of items to evaluate concurrently. Controls
                parallelism and resource usage. Default: 5.
            composite_evaluator: Optional function that creates a composite score from
                item-level evaluations. Receives the original item and its evaluations,
                returns a single Evaluation. Useful for weighted averages or combined
                metrics. Default: None.
            metadata: Optional metadata dict to add to all created scores. Useful for
                tracking evaluation runs, versions, or other context. Default: None.
            max_retries: Maximum number of retry attempts for failed batch fetches.
                Uses exponential backoff (1s, 2s, 4s). Default: 3.
            verbose: If True, logs progress information to console. Useful for monitoring
                long-running evaluations. Default: False.
            resume_from: Optional resume token from a previous incomplete run. Allows
                continuing evaluation after interruption or failure. Default: None.

        Returns:
            BatchEvaluationResult containing:
            - total_items_fetched: Number of items fetched from API
            - total_items_processed: Number of items successfully evaluated
            - total_items_failed: Number of items that failed evaluation
            - total_scores_created: Scores created by item-level evaluators
            - total_composite_scores_created: Scores created by composite evaluator
            - total_evaluations_failed: Individual evaluator failures
            - evaluator_stats: Per-evaluator statistics (success rate, scores created)
            - resume_token: Token for resuming if incomplete (None if completed)
            - completed: True if all items processed
            - duration_seconds: Total execution time
            - failed_item_ids: IDs of items that failed
            - error_summary: Error types and counts
            - has_more_items: True if max_items reached but more exist

        Raises:
            ValueError: If invalid scope is provided.

        Examples:
            Basic trace evaluation:
            ```python
            from langfuse import Langfuse, EvaluatorInputs, Evaluation

            client = Langfuse()

            # Define mapper to extract fields from traces
            def trace_mapper(trace):
                return EvaluatorInputs(
                    input=trace.input,
                    output=trace.output,
                    expected_output=None,
                    metadata={"trace_id": trace.id}
                )

            # Define evaluator
            def length_evaluator(*, input, output, expected_output, metadata):
                return Evaluation(
                    name="output_length",
                    value=len(output) if output else 0
                )

            # Run batch evaluation
            result = client.run_batched_evaluation(
                scope="traces",
                mapper=trace_mapper,
                evaluators=[length_evaluator],
                filter='{"tags": ["production"]}',
                max_items=1000,
                verbose=True
            )

            print(f"Processed {result.total_items_processed} traces")
            print(f"Created {result.total_scores_created} scores")
            ```

            Evaluation with composite scorer:
            ```python
            def accuracy_evaluator(*, input, output, expected_output, metadata):
                # ... evaluation logic
                return Evaluation(name="accuracy", value=0.85)

            def relevance_evaluator(*, input, output, expected_output, metadata):
                # ... evaluation logic
                return Evaluation(name="relevance", value=0.92)

            def composite_evaluator(*, item, evaluations):
                # Weighted average of evaluations
                weights = {"accuracy": 0.6, "relevance": 0.4}
                total = sum(
                    e.value * weights.get(e.name, 0)
                    for e in evaluations
                    if isinstance(e.value, (int, float))
                )
                return Evaluation(
                    name="composite_score",
                    value=total,
                    comment=f"Weighted average of {len(evaluations)} metrics"
                )

            result = client.run_batched_evaluation(
                scope="traces",
                mapper=trace_mapper,
                evaluators=[accuracy_evaluator, relevance_evaluator],
                composite_evaluator=composite_evaluator,
                filter='{"user_id": "important_user"}',
                verbose=True
            )
            ```

            Handling incomplete runs with resume:
            ```python
            # Initial run that may fail or timeout
            result = client.run_batched_evaluation(
                scope="observations",
                mapper=obs_mapper,
                evaluators=[my_evaluator],
                max_items=10000,
                verbose=True
            )

            # Check if incomplete
            if not result.completed and result.resume_token:
                print(f"Processed {result.resume_token.items_processed} items before interruption")

                # Resume from where it left off
                result = client.run_batched_evaluation(
                    scope="observations",
                    mapper=obs_mapper,
                    evaluators=[my_evaluator],
                    resume_from=result.resume_token,
                    verbose=True
                )

            print(f"Total items processed: {result.total_items_processed}")
            ```

            Monitoring evaluator performance:
            ```python
            result = client.run_batched_evaluation(...)

            for stats in result.evaluator_stats:
                success_rate = stats.successful_runs / stats.total_runs
                print(f"{stats.name}:")
                print(f"  Success rate: {success_rate:.1%}")
                print(f"  Scores created: {stats.total_scores_created}")

                if stats.failed_runs > 0:
                    print(f"  ⚠️ Failed {stats.failed_runs} times")
            ```

        Note:
            - Evaluator failures are logged but don't stop the batch evaluation
            - Individual item failures are tracked but don't stop processing
            - Fetch failures are retried with exponential backoff
            - All scores are automatically flushed to Langfuse at the end
            - The resume mechanism uses timestamp-based filtering to avoid duplicates
        """
        # All orchestration (fetching, batching, retries, resume) lives in the
        # runner; this wrapper only bridges sync callers to the async pipeline.
        runner = BatchEvaluationRunner(self)

        return cast(
            BatchEvaluationResult,
            run_async_safely(
                runner.run_async(
                    scope=scope,
                    mapper=mapper,
                    evaluators=evaluators,
                    filter=filter,
                    fetch_batch_size=fetch_batch_size,
                    fetch_trace_fields=fetch_trace_fields,
                    max_items=max_items,
                    max_concurrency=max_concurrency,
                    composite_evaluator=composite_evaluator,
                    metadata=metadata,
                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
                    _additional_trace_tags=_additional_trace_tags,
                    max_retries=max_retries,
                    verbose=verbose,
                    resume_from=resume_from,
                )
            ),
        )
Fetch traces or observations and run evaluations on each item.
This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:
- Running evaluations on production traces after deployment
- Backtesting new evaluation metrics on historical data
- Batch scoring of observations for quality monitoring
- Periodic evaluation runs on recent data
The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.
Arguments:
- scope: The type of items to evaluate. Must be one of:
- "traces": Evaluate complete traces with all their observations
- "observations": Evaluate individual observations (spans, generations, events)
- mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
- evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
- filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
- '{"tags": ["production"]}'
- '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' Default: None (fetches all items).
- fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
- fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
- max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
- max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 5.
- composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
- metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
- max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
- verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
- resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:
BatchEvaluationResult containing: - total_items_fetched: Number of items fetched from API - total_items_processed: Number of items successfully evaluated - total_items_failed: Number of items that failed evaluation - total_scores_created: Scores created by item-level evaluators - total_composite_scores_created: Scores created by composite evaluator - total_evaluations_failed: Individual evaluator failures - evaluator_stats: Per-evaluator statistics (success rate, scores created) - resume_token: Token for resuming if incomplete (None if completed) - completed: True if all items processed - duration_seconds: Total execution time - failed_item_ids: IDs of items that failed - error_summary: Error types and counts - has_more_items: True if max_items reached but more exist
Raises:
- ValueError: If invalid scope is provided.
Examples:
Basic trace evaluation:
from langfuse import Langfuse, EvaluatorInputs, Evaluation client = Langfuse() # Define mapper to extract fields from traces def trace_mapper(trace): return EvaluatorInputs( input=trace.input, output=trace.output, expected_output=None, metadata={"trace_id": trace.id} ) # Define evaluator def length_evaluator(*, input, output, expected_output, metadata): return Evaluation( name="output_length", value=len(output) if output else 0 ) # Run batch evaluation result = client.run_batched_evaluation( scope="traces", mapper=trace_mapper, evaluators=[length_evaluator], filter='{"tags": ["production"]}', max_items=1000, verbose=True ) print(f"Processed {result.total_items_processed} traces") print(f"Created {result.total_scores_created} scores")Evaluation with composite scorer:
def accuracy_evaluator(*, input, output, expected_output, metadata): # ... evaluation logic return Evaluation(name="accuracy", value=0.85) def relevance_evaluator(*, input, output, expected_output, metadata): # ... evaluation logic return Evaluation(name="relevance", value=0.92) def composite_evaluator(*, item, evaluations): # Weighted average of evaluations weights = {"accuracy": 0.6, "relevance": 0.4} total = sum( e.value * weights.get(e.name, 0) for e in evaluations if isinstance(e.value, (int, float)) ) return Evaluation( name="composite_score", value=total, comment=f"Weighted average of {len(evaluations)} metrics" ) result = client.run_batched_evaluation( scope="traces", mapper=trace_mapper, evaluators=[accuracy_evaluator, relevance_evaluator], composite_evaluator=composite_evaluator, filter='{"user_id": "important_user"}', verbose=True )Handling incomplete runs with resume:
# Initial run that may fail or timeout result = client.run_batched_evaluation( scope="observations", mapper=obs_mapper, evaluators=[my_evaluator], max_items=10000, verbose=True ) # Check if incomplete if not result.completed and result.resume_token: print(f"Processed {result.resume_token.items_processed} items before interruption") # Resume from where it left off result = client.run_batched_evaluation( scope="observations", mapper=obs_mapper, evaluators=[my_evaluator], resume_from=result.resume_token, verbose=True ) print(f"Total items processed: {result.total_items_processed}")Monitoring evaluator performance:
result = client.run_batched_evaluation(...) for stats in result.evaluator_stats: success_rate = stats.successful_runs / stats.total_runs print(f"{stats.name}:") print(f" Success rate: {success_rate:.1%}") print(f" Scores created: {stats.total_scores_created}") if stats.failed_runs > 0: print(f" ⚠️ Failed {stats.failed_runs} times")
Note:
- Evaluator failures are logged but don't stop the batch evaluation
- Individual item failures are tracked but don't stop processing
- Fetch failures are retried with exponential backoff
- All scores are automatically flushed to Langfuse at the end
- The resume mechanism uses timestamp-based filtering to avoid duplicates
def auth_check(self) -> bool:
    """Check if the provided credentials (public and secret key) are valid.

    Returns:
        True when the credentials resolve to at least one project.
        False when the client was not properly initialized.

    Raises:
        Exception: If no projects were found for the provided credentials.

    Note:
        This method is blocking. It is discouraged to use it in production code.
    """
    try:
        projects = self.api.projects.get()
        project_count = len(projects.data)
        langfuse_logger.debug(
            f"Auth check successful, found {project_count} projects"
        )

        # Valid keys must be scoped to at least one project; zero projects
        # means the credentials are effectively unusable.
        if project_count == 0:
            raise Exception(
                "Auth check failed, no project found for the keys provided."
            )

        return True

    except AttributeError as err:
        # The API client was never wired up (e.g. bad init) — report, don't raise.
        langfuse_logger.warning(
            f"Auth check failed: Client not properly initialized. Error: {err}"
        )
        return False

    except Error as err:
        # Translate Fern transport errors into SDK-level handling, then propagate.
        handle_fern_exception(err)
        raise err
Check if the provided credentials (public and secret key) are valid.
Raises:
- Exception: If no projects were found for the provided credentials.
Note:
This method is blocking. It is discouraged to use it in production code.
def create_dataset(
    self,
    *,
    name: str,
    description: Optional[str] = None,
    metadata: Optional[Any] = None,
    input_schema: Optional[Any] = None,
    expected_output_schema: Optional[Any] = None,
) -> Dataset:
    """Create a dataset with the given name on Langfuse.

    Args:
        name: Name of the dataset to create.
        description: Description of the dataset. Defaults to None.
        metadata: Additional metadata. Defaults to None.
        input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
        expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.

    Returns:
        Dataset: The created dataset as returned by the Langfuse API.
    """
    try:
        # Fix: log message said "Creating datasets" although exactly one
        # dataset is created per call.
        langfuse_logger.debug(f"Creating dataset {name}")

        result = self.api.datasets.create(
            name=name,
            description=description,
            metadata=metadata,
            input_schema=input_schema,
            expected_output_schema=expected_output_schema,
        )

        # The generated API client returns an untyped response; narrow it for callers.
        return cast(Dataset, result)

    except Error as e:
        handle_fern_exception(e)
        raise e
Create a dataset with the given name on Langfuse.
Arguments:
- name: Name of the dataset to create.
- description: Description of the dataset. Defaults to None.
- metadata: Additional metadata. Defaults to None.
- input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
- expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:
Dataset: The created dataset as returned by the Langfuse API.
def create_dataset_item(
    self,
    *,
    dataset_name: str,
    input: Optional[Any] = None,
    expected_output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    source_trace_id: Optional[str] = None,
    source_observation_id: Optional[str] = None,
    status: Optional[DatasetStatus] = None,
    id: Optional[str] = None,
) -> DatasetItem:
    """Create a dataset item.

    Upserts if an item with id already exists.

    Args:
        dataset_name: Name of the dataset in which the dataset item should be created.
        input: Input data. Defaults to None. Can contain any dict, list or scalar.
        expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
        metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
        source_trace_id: Id of the source trace. Defaults to None.
        source_observation_id: Id of the source observation. Defaults to None.
        status: Status of the dataset item. Defaults to ACTIVE for newly created items.
        id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.

    Returns:
        DatasetItem: The created dataset item as returned by the Langfuse API.

    Example:
        ```python
        from langfuse import Langfuse

        langfuse = Langfuse()

        # Uploading items to the Langfuse dataset named "capital_cities"
        langfuse.create_dataset_item(
            dataset_name="capital_cities",
            input={"input": {"country": "Italy"}},
            expected_output={"expected_output": "Rome"},
            metadata={"foo": "bar"}
        )
        ```
    """
    try:
        langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")

        result = self.api.dataset_items.create(
            dataset_name=dataset_name,
            input=input,
            expected_output=expected_output,
            metadata=metadata,
            source_trace_id=source_trace_id,
            source_observation_id=source_observation_id,
            status=status,
            id=id,
        )

        # Narrow the generated client's response type for callers.
        return cast(DatasetItem, result)
    except Error as e:
        handle_fern_exception(e)
        raise e
Create a dataset item.
Upserts if an item with id already exists.
Arguments:
- dataset_name: Name of the dataset in which the dataset item should be created.
- input: Input data. Defaults to None. Can contain any dict, list or scalar.
- expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
- metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
- source_trace_id: Id of the source trace. Defaults to None.
- source_observation_id: Id of the source observation. Defaults to None.
- status: Status of the dataset item. Defaults to ACTIVE for newly created items.
- id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:
DatasetItem: The created dataset item as returned by the Langfuse API.
Example:
from langfuse import Langfuse langfuse = Langfuse() # Uploading items to the Langfuse dataset named "capital_cities" langfuse.create_dataset_item( dataset_name="capital_cities", input={"input": {"country": "Italy"}}, expected_output={"expected_output": "Rome"}, metadata={"foo": "bar"} )
def resolve_media_references(
    self,
    *,
    obj: Any,
    resolve_with: Literal["base64_data_uri"],
    max_depth: int = 10,
    content_fetch_timeout_seconds: int = 5,
) -> Any:
    """Replace media reference strings in an object with base64 data URIs.

    This method recursively traverses an object (up to max_depth) looking for media reference strings
    in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
    the provided Langfuse client and replaces the reference string with a base64 data URI.

    If fetching media content fails for a reference string, a warning is logged and the reference
    string is left unchanged.

    Args:
        obj: The object to process. Can be a primitive value, array, or nested object.
            If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
        resolve_with: The representation of the media content to replace the media reference string with.
            Currently only "base64_data_uri" is supported.
        max_depth: int: The maximum depth to traverse the object. Default is 10.
        content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.

    Returns:
        A deep copy of the input object with all media references replaced with base64 data URIs where possible.
        If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

    Example:
        obj = {
            "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
            "nested": {
                "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
            }
        }

        # NOTE: this call is synchronous — do not ``await`` it.
        result = langfuse.resolve_media_references(
            obj=obj, resolve_with="base64_data_uri"
        )

        # Result:
        # {
        #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
        #     "nested": {
        #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
        #     }
        # }
    """
    # Delegate to the shared media helper, binding this client for content fetches.
    return LangfuseMedia.resolve_media_references(
        langfuse_client=self,
        obj=obj,
        resolve_with=resolve_with,
        max_depth=max_depth,
        content_fetch_timeout_seconds=content_fetch_timeout_seconds,
    )
Replace media reference strings in an object with base64 data URIs.
This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.
If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.
Arguments:
- obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
- resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
- max_depth: int: The maximum depth to traverse the object. Default is 10.
- content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:
A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
Example:
obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }
result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
Result:
{
"image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
"nested": {
"pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
}
}
def get_prompt(
    self,
    name: str,
    *,
    version: Optional[int] = None,
    label: Optional[str] = None,
    type: Literal["chat", "text"] = "text",
    cache_ttl_seconds: Optional[int] = None,
    fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
    max_retries: Optional[int] = None,
    fetch_timeout_seconds: Optional[int] = None,
) -> PromptClient:
    """Get a prompt.

    This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
    in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
    and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
    return the expired prompt as a fallback.

    Args:
        name (str): The name of the prompt to retrieve.

    Keyword Args:
        version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
        label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
        cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
        type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
        max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
        fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.

    Returns:
        The prompt object retrieved from the cache or directly fetched if not cached or expired of type
        - TextPromptClient, if type argument is 'text'.
        - ChatPromptClient, if type argument is 'chat'.

    Raises:
        Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
    """
    if self._resources is None:
        raise Error(
            "SDK is not correctly initialized. Check the init logs for more details."
        )
    if version is not None and label is not None:
        raise ValueError("Cannot specify both version and label at the same time.")

    if not name:
        raise ValueError("Prompt name cannot be empty.")

    cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
    # Clamp user-supplied retries into [0, 4]; default is 2.
    bounded_max_retries = self._get_bounded_max_retries(
        max_retries, default_max_retries=2, max_retries_upper_bound=4
    )

    langfuse_logger.debug(f"Getting prompt '{cache_key}'")
    cached_prompt = self._resources.prompt_cache.get(cache_key)

    # Cache miss, or caching explicitly disabled (ttl == 0): fetch synchronously.
    if cached_prompt is None or cache_ttl_seconds == 0:
        langfuse_logger.debug(
            f"Prompt '{cache_key}' not found in cache or caching disabled."
        )
        try:
            return self._fetch_prompt_and_update_cache(
                name,
                version=version,
                label=label,
                ttl_seconds=cache_ttl_seconds,
                max_retries=bounded_max_retries,
                fetch_timeout_seconds=fetch_timeout_seconds,
            )
        except Exception as e:
            # Fetch failed with no usable cache entry: fall back to the
            # caller-provided prompt if one was given, otherwise re-raise.
            if fallback:
                langfuse_logger.warning(
                    f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
                )

                fallback_client_args: Dict[str, Any] = {
                    "name": name,
                    "prompt": fallback,
                    "type": type,
                    "version": version or 0,
                    "config": {},
                    "labels": [label] if label else [],
                    "tags": [],
                }

                if type == "text":
                    return TextPromptClient(
                        prompt=Prompt_Text(**fallback_client_args),
                        is_fallback=True,
                    )

                if type == "chat":
                    return ChatPromptClient(
                        prompt=Prompt_Chat(**fallback_client_args),
                        is_fallback=True,
                    )

            raise e

    # Cache hit but stale: serve the stale value immediately and refresh
    # asynchronously so callers never block on the network here.
    if cached_prompt.is_expired():
        langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
        try:
            # refresh prompt in background thread, refresh_prompt deduplicates tasks
            langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")

            def refresh_task() -> None:
                self._fetch_prompt_and_update_cache(
                    name,
                    version=version,
                    label=label,
                    ttl_seconds=cache_ttl_seconds,
                    max_retries=bounded_max_retries,
                    fetch_timeout_seconds=fetch_timeout_seconds,
                )

            self._resources.prompt_cache.add_refresh_prompt_task(
                cache_key,
                refresh_task,
            )
            langfuse_logger.debug(
                f"Returning stale prompt '{cache_key}' from cache."
            )
            # return stale prompt
            return cached_prompt.value

        except Exception as e:
            langfuse_logger.warning(
                f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
            )
            # creation of refresh prompt task failed, return stale prompt
            return cached_prompt.value

    # Fresh cache hit.
    return cached_prompt.value
Get a prompt.
This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.
Arguments:
- name (str): The name of the prompt to retrieve.
Keyword Args:
version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
label (Optional[str]): The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both. cache_ttl_seconds (Optional[int]): Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0. type (Literal["chat", "text"]): The type of the prompt to retrieve. Defaults to "text". fallback (Union[Optional[List[ChatMessageDict]], Optional[str]]): The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None. max_retries (Optional[int]): The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds. fetch_timeout_seconds (Optional[int]): The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
Returns:
The prompt object retrieved from the cache or directly fetched if not cached or expired of type
- TextPromptClient, if type argument is 'text'.
- ChatPromptClient, if type argument is 'chat'.
Raises:
- Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
- expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
def create_prompt(
    self,
    *,
    name: str,
    prompt: Union[
        str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
    ],
    labels: List[str] = [],
    tags: Optional[List[str]] = None,
    type: Optional[Literal["chat", "text"]] = "text",
    config: Optional[Any] = None,
    commit_message: Optional[str] = None,
) -> PromptClient:
    """Create a new prompt in Langfuse.

    Keyword Args:
        name: The name of the prompt to be created.
        prompt: The content of the prompt to be created. A string for 'text' prompts,
            a list of chat messages for 'chat' prompts.
        labels: The labels of the prompt. Defaults to an empty list. To create a
            default-served prompt, add the 'production' label.
        tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
        config: Additional structured data to be saved with the prompt. Defaults to None.
        type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
        commit_message: Optional string describing the change.

    Note:
        The former 'is_active' flag is deprecated and is no longer accepted by this
        method. Use the 'production' label instead.

    Returns:
        TextPromptClient: The prompt if type argument is 'text'.
        ChatPromptClient: The prompt if type argument is 'chat'.

    Raises:
        ValueError: If 'prompt' does not match the requested 'type' (list for 'chat',
            string for 'text').
    """
    try:
        langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")

        if type == "chat":
            if not isinstance(prompt, list):
                raise ValueError(
                    "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
                )
            request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
                CreateChatPromptRequest(
                    name=name,
                    prompt=cast(Any, prompt),
                    labels=labels,
                    tags=tags,
                    config=config or {},
                    commit_message=commit_message,
                    type=CreateChatPromptType.CHAT,
                )
            )
            server_prompt = self.api.prompts.create(request=request)

            # Drop any cached versions of this prompt so subsequent get_prompt
            # calls see the new version.
            if self._resources is not None:
                self._resources.prompt_cache.invalidate(name)

            return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))

        if not isinstance(prompt, str):
            raise ValueError("For 'text' type, 'prompt' must be a string.")

        request = CreateTextPromptRequest(
            name=name,
            prompt=prompt,
            labels=labels,
            tags=tags,
            config=config or {},
            commit_message=commit_message,
        )

        server_prompt = self.api.prompts.create(request=request)

        # Same cache invalidation for the text-prompt path.
        if self._resources is not None:
            self._resources.prompt_cache.invalidate(name)

        return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))

    except Error as e:
        handle_fern_exception(e)
        raise e
Create a new prompt in Langfuse.
Keyword Args:
name : The name of the prompt to be created. prompt : The content of the prompt to be created. is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead. labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label. tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt. config: Additional structured data to be saved with the prompt. Defaults to None. type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text". commit_message: Optional string describing the change.
Returns:
TextPromptClient: The prompt if type argument is 'text'. ChatPromptClient: The prompt if type argument is 'chat'.
def update_prompt(
    self,
    *,
    name: str,
    version: int,
    new_labels: List[str] = [],
) -> Any:
    """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.

    Args:
        name (str): The name of the prompt to update.
        version (int): The version number of the prompt to update.
        new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].

    Returns:
        Prompt: The updated prompt from the Langfuse API.

    """
    # Name is URL-encoded because it becomes a path segment in the API call.
    updated_prompt = self.api.prompt_version.update(
        name=self._url_encode(name),
        version=version,
        new_labels=new_labels,
    )

    # Invalidate all cached versions of this prompt so stale labels are not served.
    if self._resources is not None:
        self._resources.prompt_cache.invalidate(name)

    return updated_prompt
Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
Arguments:
- name (str): The name of the prompt to update.
- version (int): The version number of the prompt to update.
- new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:
Prompt: The updated prompt from the Langfuse API.
def clear_prompt_cache(self) -> None:
    """Clear the entire prompt cache, removing all cached prompts.

    This method is useful when you want to force a complete refresh of all
    cached prompts, for example after major updates or when you need to
    ensure the latest versions are fetched from the server.
    """
    resources = self._resources
    # No resources means the SDK never finished initializing — nothing to clear.
    if resources is None:
        return
    resources.prompt_cache.clear()
Clear the entire prompt cache, removing all cached prompts.
This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.
def get_client(*, public_key: Optional[str] = None) -> Langfuse:
    """Get or create a Langfuse client instance.

    Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups,
    providing a public_key is required. Multi-project support is experimental - see Langfuse docs.

    Behavior:
    - Single project: Returns existing client or creates new one
    - Multi-project: Requires public_key to return specific client
    - No public_key in multi-project: Returns disabled client to prevent data leakage

    The function uses a singleton pattern per public_key to conserve resources and maintain state.

    Args:
        public_key (Optional[str]): Project identifier
            - With key: Returns client for that project
            - Without key: Returns single client or disabled client if multiple exist

    Returns:
        Langfuse: Client instance in one of three states:
            1. Client for specified public_key
            2. Default client for single-project setup
            3. Disabled client when multiple projects exist without key

    Security:
        Disables tracing when multiple projects exist without explicit key to prevent
        cross-project data leakage. Multi-project setups are experimental.

    Example:
        ```python
        # Single project
        client = get_client()  # Default client

        # In multi-project usage:
        client_a = get_client(public_key="project_a_key")  # Returns project A's client
        client_b = get_client(public_key="project_b_key")  # Returns project B's client

        # Without specific key in multi-project setup:
        client = get_client()  # Returns disabled client for safety
        ```
    """
    # The whole lookup runs under the resource-manager lock so the instance
    # registry cannot change mid-decision.
    with LangfuseResourceManager._lock:
        active_instances = LangfuseResourceManager._instances

        # If no explicit public_key provided, check execution context
        if not public_key:
            public_key = _current_public_key.get(None)

        if not public_key:
            if len(active_instances) == 0:
                # No clients initialized yet, create default instance
                return Langfuse()

            if len(active_instances) == 1:
                # Only one client exists, safe to use without specifying key
                instance = list(active_instances.values())[0]

                # Initialize with the credentials bound to the instance
                # This is important if the original instance was instantiated
                # via constructor arguments
                return _create_client_from_instance(instance)

            else:
                # Multiple clients exist but no key specified - disable tracing
                # to prevent cross-project data leakage
                langfuse_logger.warning(
                    "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
                )
                return Langfuse(
                    tracing_enabled=False, public_key="fake", secret_key="fake"
                )

        else:
            # Specific key provided, look up existing instance
            target_instance: Optional[LangfuseResourceManager] = active_instances.get(
                public_key, None
            )

            if target_instance is None:
                # No instance found with this key - client not initialized properly
                langfuse_logger.warning(
                    f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function."
                )
                return Langfuse(
                    tracing_enabled=False, public_key="fake", secret_key="fake"
                )

            # target_instance is guaranteed to be not None at this point
            return _create_client_from_instance(target_instance, public_key)
Get or create a Langfuse client instance.
Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
Behavior:
- Single project: Returns existing client or creates new one
- Multi-project: Requires public_key to return specific client
- No public_key in multi-project: Returns disabled client to prevent data leakage
The function uses a singleton pattern per public_key to conserve resources and maintain state.
Arguments:
- public_key (Optional[str]): Project identifier
- With key: Returns client for that project
- Without key: Returns single client or disabled client if multiple exist
Returns:
Langfuse: Client instance in one of three states: 1. Client for specified public_key 2. Default client for single-project setup 3. Disabled client when multiple projects exist without key
Security:
Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.
Example:
# Single project client = get_client() # Default client # In multi-project usage: client_a = get_client(public_key="project_a_key") # Returns project A's client client_b = get_client(public_key="project_b_key") # Returns project B's client # Without specific key in multi-project setup: client = get_client() # Returns disabled client for safety
    def observe(
        self,
        func: Optional[F] = None,
        *,
        name: Optional[str] = None,
        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
        capture_input: Optional[bool] = None,
        capture_output: Optional[bool] = None,
        transform_to_string: Optional[Callable[[Iterable], str]] = None,
    ) -> Union[F, Callable[[F], F]]:
        """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.

        This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates
        spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator
        intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.

        Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application,
        enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.

        Args:
            func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
            name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
            as_type (Optional[Literal]): Set the observation type. Supported values:
                "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail".
                Observation types are highlighted in the Langfuse UI for filtering and visualization.
                The types "generation" and "embedding" create a span on which additional attributes such as model metrics
                can be set.
            capture_input (Optional[bool]): Whether to record the wrapped function's arguments as span input.
                If None, falls back to the LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED environment setting.
            capture_output (Optional[bool]): Whether to record the wrapped function's return value as span output.
                If None, falls back to the same environment setting as capture_input.
            transform_to_string (Optional[Callable[[Iterable], str]]): Optional callable used to collapse an
                iterable/generator result into a single string for output capture.

        Returns:
            Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.

        Example:
            For general function tracing with automatic naming:
            ```python
            @observe()
            def process_user_request(user_id, query):
                # Function is automatically traced with name "process_user_request"
                return get_response(query)
            ```

            For language model generation tracking:
            ```python
            @observe(name="answer-generation", as_type="generation")
            async def generate_answer(query):
                # Creates a generation-type span with extended LLM metrics
                response = await openai.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": query}]
                )
                return response.choices[0].message.content
            ```

            For trace context propagation between functions:
            ```python
            @observe()
            def main_process():
                # Parent span is created
                return sub_process()  # Child span automatically connected to parent

            @observe()
            def sub_process():
                # Automatically becomes a child span of main_process
                return "result"
            ```

        Raises:
            Exception: Propagates any exceptions from the wrapped function after logging them in the trace.

        Notes:
            - The decorator preserves the original function's signature, docstring, and return type.
            - Proper parent-child relationships between spans are automatically maintained.
            - Special keyword arguments can be passed to control tracing:
              - langfuse_trace_id: Explicitly set the trace ID for this function call
              - langfuse_parent_observation_id: Explicitly set the parent span ID
              - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
            - For async functions, the decorator returns an async function wrapper.
            - For sync functions, the decorator returns a synchronous wrapper.
        """
        # Invalid as_type values are tolerated: warn and degrade to the generic "span" type.
        valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
        if as_type is not None and as_type not in valid_types:
            logger.warning(
                f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'."
            )
            as_type = "span"

        # Env var opt-out: anything except "false"/"0" (case-insensitive) keeps I/O capture on.
        function_io_capture_enabled = os.environ.get(
            LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True"
        ).lower() not in ("false", "0")

        # Explicit per-call arguments take precedence over the environment default.
        should_capture_input = (
            capture_input if capture_input is not None else function_io_capture_enabled
        )

        should_capture_output = (
            capture_output
            if capture_output is not None
            else function_io_capture_enabled
        )

        def decorator(func: F) -> F:
            # Dispatch to the async or sync wrapper based on the decorated function's kind.
            return (
                self._async_observe(
                    func,
                    name=name,
                    as_type=as_type,
                    capture_input=should_capture_input,
                    capture_output=should_capture_output,
                    transform_to_string=transform_to_string,
                )
                if asyncio.iscoroutinefunction(func)
                else self._sync_observe(
                    func,
                    name=name,
                    as_type=as_type,
                    capture_input=should_capture_input,
                    capture_output=should_capture_output,
                    transform_to_string=transform_to_string,
                )
            )

        """Handle decorator with or without parentheses.

        This logic enables the decorator to work both with and without parentheses:
        - @observe - Python passes the function directly to the decorator
        - @observe() - Python calls the decorator first, which must return a function decorator

        When called without arguments (@observe), the func parameter contains the function to decorate,
        so we directly apply the decorator to it. When called with parentheses (@observe()),
        func is None, so we return the decorator function itself for Python to apply in the next step.
        """
        if func is None:
            return decorator
        else:
            return decorator(func)
Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
Arguments:
- func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
- name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
- as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:
Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
Example:
For general function tracing with automatic naming:
@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)

For language model generation tracking:

@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content

For trace context propagation between functions:

@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
Raises:
- Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
- The decorator preserves the original function's signature, docstring, and return type.
- Proper parent-child relationships between spans are automatically maintained.
- Special keyword arguments can be passed to control tracing:
- langfuse_trace_id: Explicitly set the trace ID for this function call
- langfuse_parent_observation_id: Explicitly set the parent span ID
- langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
- For async functions, the decorator returns an async function wrapper.
- For sync functions, the decorator returns a synchronous wrapper.
def propagate_attributes(
    *,
    user_id: Optional[str] = None,
    session_id: Optional[str] = None,
    metadata: Optional[Dict[str, str]] = None,
    version: Optional[str] = None,
    tags: Optional[List[str]] = None,
    trace_name: Optional[str] = None,
    as_baggage: bool = False,
) -> _AgnosticContextManager[Any]:
    """Propagate trace-level attributes to all spans created within this context.

    This context manager sets attributes on the currently active span AND automatically
    propagates them to all new child spans created within the context. This is the
    recommended way to set trace-level attributes like user_id, session_id, and metadata
    dimensions that should be consistently applied across all observations in a trace.

    **IMPORTANT**: Call this as early as possible within your trace/workflow. Only the
    currently active span and spans created after entering this context will have these
    attributes. Pre-existing spans will NOT be retroactively updated.

    **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id,
    filtering by session_id) only include observations that have the attribute set.
    If you call `propagate_attributes` late in your workflow, earlier spans won't be
    included in aggregations for that attribute.

    Args:
        user_id: User identifier to associate with all spans in this context.
            Must be US-ASCII string, ≤200 characters. Use this to track which user
            generated each trace and enable e.g. per-user cost/performance analysis.
        session_id: Session identifier to associate with all spans in this context.
            Must be US-ASCII string, ≤200 characters. Use this to group related traces
            within a user session (e.g., a conversation thread, multi-turn interaction).
        metadata: Additional key-value metadata to propagate to all spans.
            - Keys and values must be US-ASCII strings
            - All values must be ≤200 characters
            - Use for dimensions like internal correlating identifiers
            - AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
        version: Version identifier for parts of your application that are independently versioned, e.g. agents
        tags: List of tags to categorize the group of observations
        trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters.
            Use this to set a consistent trace name for all spans created within this context.
        as_baggage: If True, propagates attributes using OpenTelemetry baggage for
            cross-process/service propagation. **Security warning**: When enabled,
            attribute values are added to HTTP headers on ALL outbound requests.
            Only enable if values are safe to transmit via HTTP headers and you need
            cross-service tracing. Default: False.

    Returns:
        Context manager that propagates attributes to all child spans.

    Example:
        Basic usage with user and session tracking:

        ```python
        from langfuse import Langfuse

        langfuse = Langfuse()

        # Set attributes early in the trace
        with langfuse.start_as_current_observation(name="user_workflow") as span:
            with langfuse.propagate_attributes(
                user_id="user_123",
                session_id="session_abc",
                metadata={"experiment": "variant_a", "environment": "production"}
            ):
                # All spans created here will have user_id, session_id, and metadata
                with langfuse.start_observation(name="llm_call") as llm_span:
                    # This span inherits: user_id, session_id, experiment, environment
                    ...

                with langfuse.start_generation(name="completion") as gen:
                    # This span also inherits all attributes
                    ...
        ```

        Late propagation (anti-pattern):

        ```python
        with langfuse.start_as_current_observation(name="workflow") as span:
            # These spans WON'T have user_id
            early_span = langfuse.start_observation(name="early_work")
            early_span.end()

            # Set attributes in the middle
            with langfuse.propagate_attributes(user_id="user_123"):
                # Only spans created AFTER this point will have user_id
                late_span = langfuse.start_observation(name="late_work")
                late_span.end()

            # Result: Aggregations by user_id will miss "early_work" span
        ```

        Cross-service propagation with baggage (advanced):

        ```python
        # Service A - originating service
        with langfuse.start_as_current_observation(name="api_request"):
            with langfuse.propagate_attributes(
                user_id="user_123",
                session_id="session_abc",
                as_baggage=True  # Propagate via HTTP headers
            ):
                # Make HTTP request to Service B
                response = requests.get("https://service-b.example.com/api")
                # user_id and session_id are now in HTTP headers

        # Service B - downstream service
        # OpenTelemetry will automatically extract baggage from HTTP headers
        # and propagate to spans in Service B
        ```

    Note:
        - **Validation**: All attribute values (user_id, session_id, metadata values)
          must be strings ≤200 characters. Invalid values will be dropped with a
          warning logged. Ensure values meet constraints before calling.
        - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood,
          making it compatible with other OTel-instrumented libraries.

    Raises:
        No exceptions are raised. Invalid values are logged as warnings and dropped.
    """
    # Thin public wrapper: all validation and context handling live in the private helper.
    return _propagate_attributes(
        user_id=user_id,
        session_id=session_id,
        metadata=metadata,
        version=version,
        tags=tags,
        trace_name=trace_name,
        as_baggage=as_baggage,
    )
Propagate trace-level attributes to all spans created within this context.
This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, and metadata dimensions that should be consistently applied across all observations in a trace.
IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.
Why this matters: Langfuse aggregation queries (e.g., total cost by user_id,
filtering by session_id) only include observations that have the attribute set.
If you call propagate_attributes late in your workflow, earlier spans won't be
included in aggregations for that attribute.
Arguments:
- user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
- session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
- metadata: Additional key-value metadata to propagate to all spans.
- Keys and values must be US-ASCII strings
- All values must be ≤200 characters
- Use for dimensions like internal correlating identifiers
- AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
- version: Version identifier for parts of your application that are independently versioned, e.g. agents
- tags: List of tags to categorize the group of observations
- trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
- as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:
Context manager that propagates attributes to all child spans.
Example:
Basic usage with user and session tracking:
from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_observation(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        metadata={"experiment": "variant_a", "environment": "production"}
    ):
        # All spans created here will have user_id, session_id, and metadata
        with langfuse.start_observation(name="llm_call") as llm_span:
            # This span inherits: user_id, session_id, experiment, environment
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...

Late propagation (anti-pattern):

with langfuse.start_as_current_observation(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_observation(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_observation(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span

Cross-service propagation with baggage (advanced):

# Service A - originating service
with langfuse.start_as_current_observation(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id and session_id are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate to spans in Service B
Note:
- Validation: All attribute values (user_id, session_id, metadata values) must be strings ≤200 characters. Invalid values will be dropped with a warning logged. Ensure values meet constraints before calling.
- OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
- No exceptions are raised. Invalid values are logged as warnings and dropped.
class LangfuseSpan(LangfuseObservationWrapper):
    """General-purpose observation wrapper for tracing arbitrary operations.

    Use this when no specialized observation type (agent, tool, generation,
    ...) fits the operation being traced; the specialized types yield richer
    observability and insights, so prefer them when applicable. Builds on
    LangfuseObservationWrapper, which supplies child-span/generation creation
    and attribute updates.
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ):
        """Create a LangfuseSpan around an existing OpenTelemetry span.

        Args:
            otel_span: OpenTelemetry span to wrap.
            langfuse_client: Owning Langfuse client instance.
            input: JSON-serializable input recorded on the span.
            output: JSON-serializable output recorded on the span.
            metadata: Extra metadata attached to the span.
            environment: Tracing environment identifier.
            release: Application release identifier.
            version: Code/component version identifier.
            level: Span importance level (info, warning, error).
            status_message: Optional status message for the span.
        """
        shared_kwargs = dict(
            otel_span=otel_span,
            langfuse_client=langfuse_client,
            input=input,
            output=output,
            metadata=metadata,
            environment=environment,
            release=release,
            version=version,
            level=level,
            status_message=status_message,
        )
        # Delegate to the generic wrapper with the observation type pinned to "span".
        super().__init__(as_type="span", **shared_kwargs)
Standard span implementation for general operations in Langfuse.
This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.
1257 def __init__( 1258 self, 1259 *, 1260 otel_span: otel_trace_api.Span, 1261 langfuse_client: "Langfuse", 1262 input: Optional[Any] = None, 1263 output: Optional[Any] = None, 1264 metadata: Optional[Any] = None, 1265 environment: Optional[str] = None, 1266 release: Optional[str] = None, 1267 version: Optional[str] = None, 1268 level: Optional[SpanLevel] = None, 1269 status_message: Optional[str] = None, 1270 ): 1271 """Initialize a new LangfuseSpan. 1272 1273 Args: 1274 otel_span: The OpenTelemetry span to wrap 1275 langfuse_client: Reference to the parent Langfuse client 1276 input: Input data for the span (any JSON-serializable object) 1277 output: Output data from the span (any JSON-serializable object) 1278 metadata: Additional metadata to associate with the span 1279 environment: The tracing environment 1280 release: Release identifier for the application 1281 version: Version identifier for the code or component 1282 level: Importance level of the span (info, warning, error) 1283 status_message: Optional status message for the span 1284 """ 1285 super().__init__( 1286 otel_span=otel_span, 1287 as_type="span", 1288 langfuse_client=langfuse_client, 1289 input=input, 1290 output=output, 1291 metadata=metadata, 1292 environment=environment, 1293 release=release, 1294 version=version, 1295 level=level, 1296 status_message=status_message, 1297 )
Initialize a new LangfuseSpan.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the span (any JSON-serializable object)
- output: Output data from the span (any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
class LangfuseGeneration(LangfuseObservationWrapper):
    """Observation wrapper specialized for AI/LLM model generations.

    Builds on LangfuseObservationWrapper and adds generation-specific fields:
    model identity and parameters, token usage, cost details, completion
    timing, and an optional link to a managed prompt.
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ):
        """Create a LangfuseGeneration around an existing OpenTelemetry span.

        Args:
            otel_span: OpenTelemetry span to wrap.
            langfuse_client: Owning Langfuse client instance.
            input: Input data for the generation (e.g., prompts).
            output: Output from the generation (e.g., completions).
            metadata: Extra metadata attached to the generation.
            environment: Tracing environment identifier.
            release: Application release identifier.
            version: Model or component version identifier.
            level: Generation importance level (info, warning, error).
            status_message: Optional status message for the generation.
            completion_start_time: When the model started producing the response.
            model: Name/identifier of the AI model used (e.g., "gpt-4").
            model_parameters: Model call parameters (e.g., temperature, max_tokens).
            usage_details: Token usage (e.g., prompt_tokens, completion_tokens).
            cost_details: Cost information for the model call.
            prompt: Associated prompt template from Langfuse prompt management.
        """
        forwarded = dict(
            otel_span=otel_span,
            langfuse_client=langfuse_client,
            input=input,
            output=output,
            metadata=metadata,
            environment=environment,
            release=release,
            version=version,
            level=level,
            status_message=status_message,
            completion_start_time=completion_start_time,
            model=model,
            model_parameters=model_parameters,
            usage_details=usage_details,
            cost_details=cost_details,
            prompt=prompt,
        )
        # Delegate to the generic wrapper with the observation type pinned to "generation".
        super().__init__(as_type="generation", **forwarded)
Specialized span implementation for AI model generations in Langfuse.
This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.
1308 def __init__( 1309 self, 1310 *, 1311 otel_span: otel_trace_api.Span, 1312 langfuse_client: "Langfuse", 1313 input: Optional[Any] = None, 1314 output: Optional[Any] = None, 1315 metadata: Optional[Any] = None, 1316 environment: Optional[str] = None, 1317 release: Optional[str] = None, 1318 version: Optional[str] = None, 1319 level: Optional[SpanLevel] = None, 1320 status_message: Optional[str] = None, 1321 completion_start_time: Optional[datetime] = None, 1322 model: Optional[str] = None, 1323 model_parameters: Optional[Dict[str, MapValue]] = None, 1324 usage_details: Optional[Dict[str, int]] = None, 1325 cost_details: Optional[Dict[str, float]] = None, 1326 prompt: Optional[PromptClient] = None, 1327 ): 1328 """Initialize a new LangfuseGeneration span. 1329 1330 Args: 1331 otel_span: The OpenTelemetry span to wrap 1332 langfuse_client: Reference to the parent Langfuse client 1333 input: Input data for the generation (e.g., prompts) 1334 output: Output from the generation (e.g., completions) 1335 metadata: Additional metadata to associate with the generation 1336 environment: The tracing environment 1337 release: Release identifier for the application 1338 version: Version identifier for the model or component 1339 level: Importance level of the generation (info, warning, error) 1340 status_message: Optional status message for the generation 1341 completion_start_time: When the model started generating the response 1342 model: Name/identifier of the AI model used (e.g., "gpt-4") 1343 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1344 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1345 cost_details: Cost information for the model call 1346 prompt: Associated prompt template from Langfuse prompt management 1347 """ 1348 super().__init__( 1349 as_type="generation", 1350 otel_span=otel_span, 1351 langfuse_client=langfuse_client, 1352 input=input, 1353 output=output, 1354 metadata=metadata, 1355 
environment=environment, 1356 release=release, 1357 version=version, 1358 level=level, 1359 status_message=status_message, 1360 completion_start_time=completion_start_time, 1361 model=model, 1362 model_parameters=model_parameters, 1363 usage_details=usage_details, 1364 cost_details=cost_details, 1365 prompt=prompt, 1366 )
Initialize a new LangfuseGeneration span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the generation (e.g., prompts)
- output: Output from the generation (e.g., completions)
- metadata: Additional metadata to associate with the generation
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
class LangfuseEvent(LangfuseObservationWrapper):
    """Specialized span implementation for Langfuse Events.

    Events are point-in-time observations: they are created once and are
    immutable afterwards (see `update`, which is intentionally a no-op).
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ):
        """Initialize a new LangfuseEvent span.

        Args:
            otel_span: The OpenTelemetry span to wrap
            langfuse_client: Reference to the parent Langfuse client
            input: Input data for the event
            output: Output from the event
            metadata: Additional metadata to associate with the event
            environment: The tracing environment
            release: Release identifier for the application
            version: Version identifier for the model or component
            level: Importance level of the event (info, warning, error)
            status_message: Optional status message for the event
        """
        super().__init__(
            otel_span=otel_span,
            as_type="event",
            langfuse_client=langfuse_client,
            input=input,
            output=output,
            metadata=metadata,
            environment=environment,
            release=release,
            version=version,
            level=level,
            status_message=status_message,
        )

    def update(
        self,
        *,
        name: Optional[str] = None,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        **kwargs: Any,
    ) -> "LangfuseEvent":
        """Update is not allowed for LangfuseEvent because events cannot be updated.

        This method logs a warning and returns self without making changes.

        Returns:
            self: Returns the unchanged LangfuseEvent instance
        """
        # Deliberate no-op override: the signature mirrors the base update() so
        # callers fail softly (warning) instead of mutating an immutable event.
        langfuse_logger.warning(
            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
        )
        return self
Specialized span implementation for Langfuse Events.
1372 def __init__( 1373 self, 1374 *, 1375 otel_span: otel_trace_api.Span, 1376 langfuse_client: "Langfuse", 1377 input: Optional[Any] = None, 1378 output: Optional[Any] = None, 1379 metadata: Optional[Any] = None, 1380 environment: Optional[str] = None, 1381 release: Optional[str] = None, 1382 version: Optional[str] = None, 1383 level: Optional[SpanLevel] = None, 1384 status_message: Optional[str] = None, 1385 ): 1386 """Initialize a new LangfuseEvent span. 1387 1388 Args: 1389 otel_span: The OpenTelemetry span to wrap 1390 langfuse_client: Reference to the parent Langfuse client 1391 input: Input data for the event 1392 output: Output from the event 1393 metadata: Additional metadata to associate with the generation 1394 environment: The tracing environment 1395 release: Release identifier for the application 1396 version: Version identifier for the model or component 1397 level: Importance level of the generation (info, warning, error) 1398 status_message: Optional status message for the generation 1399 """ 1400 super().__init__( 1401 otel_span=otel_span, 1402 as_type="event", 1403 langfuse_client=langfuse_client, 1404 input=input, 1405 output=output, 1406 metadata=metadata, 1407 environment=environment, 1408 release=release, 1409 version=version, 1410 level=level, 1411 status_message=status_message, 1412 )
Initialize a new LangfuseEvent span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the event
- output: Output from the event
- metadata: Additional metadata to associate with the event
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the model or component
- level: Importance level of the event (info, warning, error)
- status_message: Optional status message for the event
1414 def update( 1415 self, 1416 *, 1417 name: Optional[str] = None, 1418 input: Optional[Any] = None, 1419 output: Optional[Any] = None, 1420 metadata: Optional[Any] = None, 1421 version: Optional[str] = None, 1422 level: Optional[SpanLevel] = None, 1423 status_message: Optional[str] = None, 1424 completion_start_time: Optional[datetime] = None, 1425 model: Optional[str] = None, 1426 model_parameters: Optional[Dict[str, MapValue]] = None, 1427 usage_details: Optional[Dict[str, int]] = None, 1428 cost_details: Optional[Dict[str, float]] = None, 1429 prompt: Optional[PromptClient] = None, 1430 **kwargs: Any, 1431 ) -> "LangfuseEvent": 1432 """Update is not allowed for LangfuseEvent because events cannot be updated. 1433 1434 This method logs a warning and returns self without making changes. 1435 1436 Returns: 1437 self: Returns the unchanged LangfuseEvent instance 1438 """ 1439 langfuse_logger.warning( 1440 "Attempted to update LangfuseEvent observation. Events cannot be updated after creation." 1441 ) 1442 return self
Update is not allowed for LangfuseEvent because events cannot be updated.
This method logs a warning and returns self without making changes.
Returns:
self: Returns the unchanged LangfuseEvent instance
class LangfuseOtelSpanAttributes:
    """String keys under which Langfuse data is stored as OTel span attributes.

    These constants name the span attributes the SDK writes so that the
    Langfuse exporter/backend can reconstruct traces, observations, and
    experiment metadata from exported OpenTelemetry spans.
    """

    # Langfuse-Trace attributes
    TRACE_NAME = "langfuse.trace.name"
    # NOTE(review): user/session keys are deliberately un-prefixed —
    # presumably to align with common OTel attribute conventions; confirm
    # before "normalizing" them to the langfuse.* namespace.
    TRACE_USER_ID = "user.id"
    TRACE_SESSION_ID = "session.id"
    TRACE_TAGS = "langfuse.trace.tags"
    TRACE_PUBLIC = "langfuse.trace.public"
    TRACE_METADATA = "langfuse.trace.metadata"
    TRACE_INPUT = "langfuse.trace.input"
    TRACE_OUTPUT = "langfuse.trace.output"

    # Langfuse-observation attributes
    OBSERVATION_TYPE = "langfuse.observation.type"
    OBSERVATION_METADATA = "langfuse.observation.metadata"
    OBSERVATION_LEVEL = "langfuse.observation.level"
    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
    OBSERVATION_INPUT = "langfuse.observation.input"
    OBSERVATION_OUTPUT = "langfuse.observation.output"

    # Langfuse-observation of type Generation attributes
    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
    OBSERVATION_MODEL = "langfuse.observation.model.name"
    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"

    # General
    ENVIRONMENT = "langfuse.environment"
    RELEASE = "langfuse.release"
    VERSION = "langfuse.version"

    # Internal: marks a span that should be treated as a trace root
    AS_ROOT = "langfuse.internal.as_root"

    # Experiments
    EXPERIMENT_ID = "langfuse.experiment.id"
    EXPERIMENT_NAME = "langfuse.experiment.name"
    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
class LangfuseAgent(LangfuseObservationWrapper):
    """Agent observation for reasoning blocks that act on tools using LLM guidance."""

    def __init__(self, **kwargs: Any) -> None:
        """Create an observation whose type is fixed to "agent"."""
        # Merge rather than mutate: caller-supplied as_type is always overridden.
        super().__init__(**{**kwargs, "as_type": "agent"})
Agent observation for reasoning blocks that act on tools using LLM guidance.
class LangfuseTool(LangfuseObservationWrapper):
    """Tool observation representing external tool calls, e.g., calling a weather API."""

    def __init__(self, **kwargs: Any) -> None:
        """Create an observation whose type is fixed to "tool"."""
        forwarded = dict(kwargs, as_type="tool")  # as_type always wins
        super().__init__(**forwarded)
Tool observation representing external tool calls, e.g., calling a weather API.
class LangfuseChain(LangfuseObservationWrapper):
    """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM."""

    def __init__(self, **kwargs: Any) -> None:
        """Create an observation whose type is fixed to "chain"."""
        params = {**kwargs, "as_type": "chain"}
        super().__init__(**params)
Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.
class LangfuseEmbedding(LangfuseObservationWrapper):
    """Embedding observation for LLM embedding calls, typically used before retrieval."""

    def __init__(self, **kwargs: Any) -> None:
        """Create an observation whose type is fixed to "embedding"."""
        super().__init__(**{**kwargs, "as_type": "embedding"})
Embedding observation for LLM embedding calls, typically used before retrieval.
class LangfuseEvaluator(LangfuseObservationWrapper):
    """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs."""

    def __init__(self, **kwargs: Any) -> None:
        """Create an observation whose type is fixed to "evaluator"."""
        forwarded = dict(kwargs, as_type="evaluator")
        super().__init__(**forwarded)
Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.
class LangfuseRetriever(LangfuseObservationWrapper):
    """Retriever observation for data retrieval steps, e.g. vector store or database queries."""

    def __init__(self, **kwargs: Any) -> None:
        """Create an observation whose type is fixed to "retriever"."""
        params = {**kwargs, "as_type": "retriever"}
        super().__init__(**params)
Retriever observation for data retrieval steps, e.g. vector store or database queries.
class LangfuseGuardrail(LangfuseObservationWrapper):
    """Guardrail observation for protection e.g. against jailbreaks or offensive content."""

    def __init__(self, **kwargs: Any) -> None:
        """Create an observation whose type is fixed to "guardrail"."""
        super().__init__(**{**kwargs, "as_type": "guardrail"})
Guardrail observation for protection e.g. against jailbreaks or offensive content.
class Evaluation:
    """A single evaluation result produced by an evaluator function.

    Instances are keyword-only data records that attach a named score either
    to one experiment item or to an entire experiment run.

    Attributes:
        name: Identifier of the metric (e.g. "accuracy", "bleu_score",
            "toxicity"). Keep it stable and descriptive across runs so results
            can be aggregated and compared.
        value: The score itself — numeric (int/float) for quantitative metrics,
            a string for categorical results (e.g. "positive"), or a boolean
            for binary checks (e.g. "passes_safety_check").
        comment: Optional human-readable rationale for the score; shown in the
            Langfuse UI for interpretability.
        metadata: Optional structured details about how the score was computed
            (confidence, intermediate values, model versions, ...).
        data_type: Optional score data type — one of NUMERIC, CATEGORICAL, or
            BOOLEAN. Required whenever value is not NUMERIC; defaults to NUMERIC.
        config_id: Optional Langfuse score config ID.

    Example:
        ```python
        from langfuse import Evaluation

        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
            if not expected_output:
                return Evaluation(name="accuracy", value=0, comment="No expected output")

            is_correct = output.strip().lower() == expected_output.strip().lower()
            return Evaluation(
                name="accuracy",
                value=1.0 if is_correct else 0.0,
                comment="Correct answer" if is_correct else "Incorrect answer",
            )
        ```

        Evaluators may also return a list of Evaluations (multi-metric), use
        `data_type="CATEGORICAL"` for string values, or encode failure modes
        in `comment`/`metadata` when an external scoring service is down.

    Note:
        All arguments must be passed as keywords; positional arguments raise
        a TypeError. This prevents errors from argument reordering.
    """

    def __init__(
        self,
        *,
        name: str,
        value: Union[int, float, str, bool],
        comment: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        data_type: Optional[ScoreDataType] = None,
        config_id: Optional[str] = None,
    ):
        """Store the provided evaluation data on the instance.

        Args:
            name: Unique identifier for the evaluation metric.
            value: The evaluation score or result.
            comment: Optional human-readable explanation of the result.
            metadata: Optional structured metadata about the evaluation process.
            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
            config_id: Optional Langfuse score config ID.
        """
        # Plain attribute record — no validation or coercion is performed here.
        self.name = name
        self.value = value
        self.comment = comment
        self.metadata = metadata
        self.data_type = data_type
        self.config_id = config_id
Represents an evaluation result for an experiment item or an entire experiment run.
This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.
Attributes:
- name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
- value: The evaluation score or result. Can be:
- Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
- String: For categorical results like "positive", "negative", "neutral"
- Boolean: For binary assessments like "passes_safety_check"
- comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
- metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
- data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
- config_id: Optional Langfuse score config ID.
Examples:
Basic accuracy evaluation:
from langfuse import Evaluation def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): if not expected_output: return Evaluation(name="accuracy", value=0, comment="No expected output") is_correct = output.strip().lower() == expected_output.strip().lower() return Evaluation( name="accuracy", value=1.0 if is_correct else 0.0, comment="Correct answer" if is_correct else "Incorrect answer" )

Multi-metric evaluator:
def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs): return [ Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"), Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"), Evaluation( name="quality", value=0.85, comment="High quality response", metadata={"confidence": 0.92, "model": "gpt-4"} ) ]

Categorical evaluation:
def sentiment_evaluator(*, input, output, **kwargs): sentiment = analyze_sentiment(output) # Returns "positive", "negative", or "neutral" return Evaluation( name="sentiment", value=sentiment, comment=f"Response expresses {sentiment} sentiment", data_type="CATEGORICAL" )

Failed evaluation with error handling:
def external_api_evaluator(*, input, output, **kwargs): try: score = external_api.evaluate(output) return Evaluation(name="external_score", value=score) except Exception as e: return Evaluation( name="external_score", value=0, comment=f"API unavailable: {e}", metadata={"error": str(e), "retry_count": 3} )
Note:
All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.
def __init__(
    self,
    *,
    name: str,
    value: Union[int, float, str, bool],
    comment: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    data_type: Optional[ScoreDataType] = None,
    config_id: Optional[str] = None,
):
    """Initialize an Evaluation with the provided data.

    Args:
        name: Unique identifier for the evaluation metric.
        value: The evaluation score or result (numeric, string, or boolean).
        comment: Optional human-readable explanation of the result.
        metadata: Optional structured metadata about the evaluation process.
        data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
            Required when value is not numeric.
        config_id: Optional Langfuse score config ID.

    Note:
        All arguments must be provided as keywords. Positional arguments will raise a TypeError.
    """
    # Plain attribute record — no validation or coercion happens here.
    self.name = name
    self.value = value
    self.comment = comment
    self.metadata = metadata
    self.data_type = data_type
    self.config_id = config_id
Initialize an Evaluation with the provided data.
Arguments:
- name: Unique identifier for the evaluation metric.
- value: The evaluation score or result.
- comment: Optional human-readable explanation of the result.
- metadata: Optional structured metadata about the evaluation process.
- data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
- config_id: Optional Langfuse score config ID.
Note:
All arguments must be provided as keywords. Positional arguments will raise a TypeError.
class EvaluatorInputs:
    """Input data structure for evaluators, returned by mapper functions.

    This class provides a strongly-typed container for transforming API response
    objects (traces, observations) into the standardized format expected
    by evaluator functions. It ensures consistent access to input, output, expected
    output, and metadata regardless of the source entity type.

    Attributes:
        input: The input data that was provided to generate the output being evaluated.
            For traces, this might be the initial prompt or request. For observations,
            this could be the span's input. The exact meaning depends on your use case.
        output: The actual output that was produced and needs to be evaluated.
            For traces, this is typically the final response. For observations,
            this might be the generation output or span result.
        expected_output: Optional ground truth or expected result for comparison.
            Used by evaluators to assess correctness. May be None if no ground truth
            is available for the entity being evaluated.
        metadata: Optional structured metadata providing additional context for evaluation.
            Can include information about the entity, execution context, user attributes,
            or any other relevant data that evaluators might use.

    Examples:
        Simple mapper for traces:
        ```python
        from langfuse import EvaluatorInputs

        def trace_mapper(trace):
            return EvaluatorInputs(
                input=trace.input,
                output=trace.output,
                expected_output=None,  # No ground truth available
                metadata={"user_id": trace.user_id, "tags": trace.tags}
            )
        ```

        Mapper for observations extracting specific fields:
        ```python
        def observation_mapper(observation):
            # Extract input/output from observation's data
            input_data = observation.input if hasattr(observation, 'input') else None
            output_data = observation.output if hasattr(observation, 'output') else None

            return EvaluatorInputs(
                input=input_data,
                output=output_data,
                expected_output=None,
                metadata={
                    "observation_type": observation.type,
                    "model": observation.model,
                    "latency_ms": observation.end_time - observation.start_time
                }
            )
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        input: Any,
        output: Any,
        expected_output: Any = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """Initialize EvaluatorInputs with the provided data.

        Args:
            input: The input data for evaluation.
            output: The output data to be evaluated.
            expected_output: Optional ground truth for comparison.
            metadata: Optional additional context for evaluation.

        Note:
            All arguments must be provided as keywords.
        """
        # Plain attribute record — mapper functions fill these in.
        self.input = input
        self.output = output
        self.expected_output = expected_output
        self.metadata = metadata
Input data structure for evaluators, returned by mapper functions.
This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.
Attributes:
- input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
- output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
- expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
- metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:
Simple mapper for traces:
from langfuse import EvaluatorInputs def trace_mapper(trace): return EvaluatorInputs( input=trace.input, output=trace.output, expected_output=None, # No ground truth available metadata={"user_id": trace.user_id, "tags": trace.tags} )

Mapper for observations extracting specific fields:
def observation_mapper(observation): # Extract input/output from observation's data input_data = observation.input if hasattr(observation, 'input') else None output_data = observation.output if hasattr(observation, 'output') else None return EvaluatorInputs( input=input_data, output=output_data, expected_output=None, metadata={ "observation_type": observation.type, "model": observation.model, "latency_ms": observation.end_time - observation.start_time } )
Note:
All arguments must be passed as keywords when instantiating this class.
def __init__(
    self,
    *,
    input: Any,
    output: Any,
    expected_output: Any = None,
    metadata: Optional[Dict[str, Any]] = None,
):
    """Initialize EvaluatorInputs with the provided data.

    Args:
        input: The input data for evaluation (what generated the output).
        output: The output data to be evaluated.
        expected_output: Optional ground truth for comparison; None when no
            ground truth exists for the entity being evaluated.
        metadata: Optional additional context for evaluation.

    Note:
        All arguments must be provided as keywords.
    """
    # Plain attribute record — no validation is performed.
    self.input = input
    self.output = output
    self.expected_output = expected_output
    self.metadata = metadata
Initialize EvaluatorInputs with the provided data.
Arguments:
- input: The input data for evaluation.
- output: The output data to be evaluated.
- expected_output: Optional ground truth for comparison.
- metadata: Optional additional context for evaluation.
Note:
All arguments must be provided as keywords.
class MapperFunction(Protocol):
    """Protocol defining the interface for mapper functions in batch evaluation.

    Mapper functions transform API response objects (traces or observations)
    into the standardized EvaluatorInputs format that evaluators expect. This
    abstraction lets you define how to extract and structure evaluation data
    from different entity types.

    Mapper functions must:
    - Accept a single keyword `item` parameter (trace or observation)
    - Return an EvaluatorInputs instance with input, output, expected_output, metadata
    - Can be either synchronous or asynchronous
    - Should handle missing or malformed data gracefully
    """

    def __call__(
        self,
        *,
        item: Union["TraceWithFullDetails", "ObservationsView"],
        # NOTE(review): annotation improved from Dict[str, Any] — **kwargs
        # annotates each extra keyword's value type, not a dict.
        **kwargs: Any,
    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
        """Transform an API response object into evaluator inputs.

        Maps entity-specific fields of the raw API object onto the
        standardized input/output/expected_output/metadata structure.

        Args:
            item: The API response object to transform. The type depends on
                the scope: TraceWithFullDetails when evaluating traces,
                ObservationsView when evaluating observations.

        Returns:
            EvaluatorInputs with input, output, expected_output, and metadata.
            May also return an awaitable of EvaluatorInputs (for async mappers
            that need to fetch additional data).

        Examples:
            Basic trace mapper:
            ```python
            def map_trace(trace):
                return EvaluatorInputs(
                    input=trace.input,
                    output=trace.output,
                    expected_output=None,
                    metadata={"trace_id": trace.id, "user": trace.user_id}
                )
            ```

            Async mappers are also supported, e.g. when the output needs an
            awaited transformation before being returned.
        """
        ...
Protocol defining the interface for mapper functions in batch evaluation.
Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.
Mapper functions must:
- Accept a single item parameter (trace, observation)
- Return an EvaluatorInputs instance with input, output, expected_output, metadata
- Can be either synchronous or asynchronous
- Should handle missing or malformed data gracefully
def _no_init_or_replace_init(self, *args, **kwargs):
    """Stub ``__init__`` installed on ``typing.Protocol`` subclasses.

    NOTE(review): this is CPython ``typing``-module internals rendered into
    this documentation dump — it is not part of the Langfuse SDK itself.
    On first instantiation of a concrete Protocol subclass it finds the real
    ``__init__`` in the MRO, installs it on the class, and calls it.
    """
    cls = type(self)

    # Protocols themselves are abstract interfaces and may never be instantiated.
    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    cls.__init__(self, *args, **kwargs)
class CompositeEvaluatorFunction(Protocol):
    """Protocol defining the interface for composite evaluator functions.

    Composite evaluators create aggregate scores from multiple item-level
    evaluations — e.g. weighted averages, combined metrics, or pass/fail
    decisions based on several individual evaluation results.

    Composite evaluators:
    - Accept the same inputs as item-level evaluators (input, output,
      expected_output, metadata) plus the list of evaluations
    - Return either a single Evaluation, a list of Evaluations, or a dict
    - Can be either synchronous or asynchronous
    - Have access to both raw item data and evaluation results
    """

    def __call__(
        self,
        *,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        expected_output: Optional[Any] = None,
        metadata: Optional[Dict[str, Any]] = None,
        evaluations: List[Evaluation],
        # NOTE(review): annotation improved from Dict[str, Any] — **kwargs
        # annotates each extra keyword's value type, not a dict.
        **kwargs: Any,
    ) -> Union[
        Evaluation,
        List[Evaluation],
        Dict[str, Any],
        Awaitable[Evaluation],
        Awaitable[List[Evaluation]],
        Awaitable[Dict[str, Any]],
    ]:
        """Create a composite evaluation from item-level evaluation results.

        Args:
            input: The input data that was provided to the system being evaluated.
            output: The output generated by the system being evaluated.
            expected_output: The expected/reference output for comparison (if available).
            metadata: Additional metadata about the evaluation context.
            evaluations: List of evaluation results from item-level evaluators.
                Each evaluation contains name, value, comment, and metadata.

        Returns:
            One of: a single composite Evaluation; a list of Evaluations; or a
            dict (with name/value and optional comment/metadata keys) that will
            be converted to an Evaluation. Async implementations return an
            awaitable of any of these.

        Examples:
            Simple weighted average:
            ```python
            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
                weights = {"accuracy": 0.5, "relevance": 0.3, "safety": 0.2}

                total_score = 0.0
                total_weight = 0.0
                for eval in evaluations:
                    if eval.name in weights and isinstance(eval.value, (int, float)):
                        total_score += eval.value * weights[eval.name]
                        total_weight += weights[eval.name]

                final_score = total_score / total_weight if total_weight > 0 else 0.0
                return Evaluation(
                    name="composite_score",
                    value=final_score,
                    comment=f"Weighted average of {len(evaluations)} metrics"
                )
            ```

            Other common shapes: a boolean pass/fail Evaluation computed from
            per-metric thresholds (data_type="BOOLEAN"); an async composite
            that awaits an external (e.g. LLM) scoring call; or a
            context-aware composite that picks weights based on `metadata`.
        """
        ...
Protocol defining the interface for composite evaluator functions.
Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.
Composite evaluators:
- Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
- Return either a single Evaluation, a list of Evaluations, or a dict
- Can be either synchronous or asynchronous
- Have access to both raw item data and evaluation results
def _no_init_or_replace_init(self, *args, **kwargs):
    """Stub ``__init__`` installed on ``typing.Protocol`` subclasses.

    NOTE(review): duplicate rendering of CPython ``typing``-module internals
    in this documentation dump — not part of the Langfuse SDK itself.
    On first instantiation of a concrete Protocol subclass it finds the real
    ``__init__`` in the MRO, installs it on the class, and calls it.
    """
    cls = type(self)

    # Protocols themselves are abstract interfaces and may never be instantiated.
    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    cls.__init__(self, *args, **kwargs)
class EvaluatorStats:
    """Statistics for a single evaluator's performance during batch evaluation.

    This class tracks detailed metrics about how a specific evaluator performed
    across all items in a batch evaluation run. It helps identify evaluator issues,
    understand reliability, and optimize evaluation pipelines.

    Attributes:
        name: The name of the evaluator function (extracted from __name__).
        total_runs: Total number of times the evaluator was invoked.
        successful_runs: Number of times the evaluator completed successfully.
        failed_runs: Number of times the evaluator raised an exception or failed.
        total_scores_created: Total number of evaluation scores created by this evaluator.
            Can be higher than successful_runs if the evaluator returns multiple scores.

    Examples:
        Accessing evaluator stats from batch evaluation result:
        ```python
        result = client.run_batched_evaluation(...)

        for stats in result.evaluator_stats:
            print(f"Evaluator: {stats.name}")
            print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
            print(f"  Scores created: {stats.total_scores_created}")

            if stats.failed_runs > 0:
                print(f"  ⚠️ Failed {stats.failed_runs} times")
        ```

        Identifying problematic evaluators:
        ```python
        result = client.run_batched_evaluation(...)

        # Find evaluators with high failure rates
        for stats in result.evaluator_stats:
            failure_rate = stats.failed_runs / stats.total_runs
            if failure_rate > 0.1:  # More than 10% failures
                print(f"⚠️ {stats.name} has {failure_rate:.1%} failure rate")
                print(f"   Consider debugging or removing this evaluator")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        name: str,
        total_runs: int = 0,
        successful_runs: int = 0,
        failed_runs: int = 0,
        total_scores_created: int = 0,
    ):
        """Initialize EvaluatorStats with the provided metrics.

        Args:
            name: The evaluator function name.
            total_runs: Total number of evaluator invocations.
            successful_runs: Number of successful completions.
            failed_runs: Number of failures.
            total_scores_created: Total scores created by this evaluator.

        Note:
            All arguments must be provided as keywords.
        """
        # Counters default to zero so stats can be accumulated incrementally.
        self.name = name
        self.total_runs = total_runs
        self.successful_runs = successful_runs
        self.failed_runs = failed_runs
        self.total_scores_created = total_scores_created
Statistics for a single evaluator's performance during batch evaluation.
This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.
Attributes:
- name: The name of the evaluator function (extracted from __name__).
- total_runs: Total number of times the evaluator was invoked.
- successful_runs: Number of times the evaluator completed successfully.
- failed_runs: Number of times the evaluator raised an exception or failed.
- total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:
Accessing evaluator stats from batch evaluation result:
result = client.run_batched_evaluation(...) for stats in result.evaluator_stats: print(f"Evaluator: {stats.name}") print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}") print(f"  Scores created: {stats.total_scores_created}") if stats.failed_runs > 0: print(f"  ⚠️ Failed {stats.failed_runs} times")

Identifying problematic evaluators:
result = client.run_batched_evaluation(...) # Find evaluators with high failure rates for stats in result.evaluator_stats: failure_rate = stats.failed_runs / stats.total_runs if failure_rate > 0.1: # More than 10% failures print(f"⚠️ {stats.name} has {failure_rate:.1%} failure rate") print(f"  Consider debugging or removing this evaluator")
Note:
All arguments must be passed as keywords when instantiating this class.
427 def __init__( 428 self, 429 *, 430 name: str, 431 total_runs: int = 0, 432 successful_runs: int = 0, 433 failed_runs: int = 0, 434 total_scores_created: int = 0, 435 ): 436 """Initialize EvaluatorStats with the provided metrics. 437 438 Args: 439 name: The evaluator function name. 440 total_runs: Total number of evaluator invocations. 441 successful_runs: Number of successful completions. 442 failed_runs: Number of failures. 443 total_scores_created: Total scores created by this evaluator. 444 445 Note: 446 All arguments must be provided as keywords. 447 """ 448 self.name = name 449 self.total_runs = total_runs 450 self.successful_runs = successful_runs 451 self.failed_runs = failed_runs 452 self.total_scores_created = total_scores_created
Initialize EvaluatorStats with the provided metrics.
Arguments:
- name: The evaluator function name.
- total_runs: Total number of evaluator invocations.
- successful_runs: Number of successful completions.
- failed_runs: Number of failures.
- total_scores_created: Total scores created by this evaluator.
Note:
All arguments must be provided as keywords.
class BatchEvaluationResumeToken:
    """State needed to resume an interrupted batch evaluation run.

    Bundles the scope, the original filter, and the position of the last
    successfully processed item. Resumption is timestamp-based: a
    follow-up run only fetches items after ``last_processed_timestamp``,
    which avoids re-processing items that were already evaluated even if
    the underlying dataset changed between runs.

    Attributes:
        scope: The type of items being evaluated ("traces", "observations").
        filter: The original JSON filter string used to query items.
        last_processed_timestamp: ISO 8601 timestamp of the last
            successfully processed item; used to build a filter that only
            fetches items after this point.
        last_processed_id: ID of the last successfully processed item,
            kept for reference.
        items_processed: Count of items successfully processed before the
            interruption.

    Examples:
        Resuming after a partial run:
        ```python
        result = client.run_batched_evaluation(...)

        if not result.completed and result.resume_token:
            print(f"Stopped after {result.resume_token.items_processed} items")
            result = client.run_batched_evaluation(
                scope=result.resume_token.scope,
                mapper=my_mapper,
                evaluators=my_evaluators,
                resume_from=result.resume_token,
            )
        ```

        The token is plain data, so it can be serialized (e.g. as JSON),
        stored, and rebuilt later with the same keyword arguments.

    Note:
        All arguments must be passed as keywords when instantiating this
        class. Items created after the initial run but before the stored
        timestamp are intentionally skipped, to avoid duplicates and keep
        evaluation consistent.
    """

    def __init__(
        self,
        *,
        scope: str,
        filter: Optional[str],
        last_processed_timestamp: str,
        last_processed_id: str,
        items_processed: int,
    ):
        """Capture the resume position of a batch evaluation.

        Args:
            scope: The scope type ("traces", "observations").
            filter: The original JSON filter string.
            last_processed_timestamp: ISO 8601 timestamp of last processed item.
            last_processed_id: ID of last processed item.
            items_processed: Count of items processed before interruption.

        Note:
            All arguments must be provided as keywords.
        """
        self.scope = scope
        self.filter = filter
        self.last_processed_timestamp = last_processed_timestamp
        self.last_processed_id = last_processed_id
        self.items_processed = items_processed
Token for resuming a failed batch evaluation run.
This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.
Attributes:
- scope: The type of items being evaluated ("traces", "observations").
- filter: The original JSON filter string used to query items.
- last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
- last_processed_id: The ID of the last successfully processed item, for reference.
- items_processed: Count of items successfully processed before interruption.
Examples:
Resuming a failed batch evaluation:
# Initial run that fails partway through try: result = client.run_batched_evaluation( scope="traces", mapper=my_mapper, evaluators=[evaluator1, evaluator2], filter='{"tags": ["production"]}', max_items=10000 ) except Exception as e: print(f"Evaluation failed: {e}") # Save the resume token if result.resume_token: # Store resume token for later (e.g., in a file or database) import json with open("resume_token.json", "w") as f: json.dump({ "scope": result.resume_token.scope, "filter": result.resume_token.filter, "last_timestamp": result.resume_token.last_processed_timestamp, "last_id": result.resume_token.last_processed_id, "items_done": result.resume_token.items_processed }, f) # Later, resume from where it left off with open("resume_token.json") as f: token_data = json.load(f) resume_token = BatchEvaluationResumeToken( scope=token_data["scope"], filter=token_data["filter"], last_processed_timestamp=token_data["last_timestamp"], last_processed_id=token_data["last_id"], items_processed=token_data["items_done"] ) # Resume the evaluation result = client.run_batched_evaluation( scope="traces", mapper=my_mapper, evaluators=[evaluator1, evaluator2], resume_from=resume_token ) print(f"Processed {result.total_items_processed} additional items")Handling partial completion:
result = client.run_batched_evaluation(...) if not result.completed: print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items") print(f"Last item: {result.resume_token.last_processed_id}") print(f"Resume from: {result.resume_token.last_processed_timestamp}") # Optionally retry automatically if result.resume_token: print("Retrying...") result = client.run_batched_evaluation( scope=result.resume_token.scope, mapper=my_mapper, evaluators=my_evaluators, resume_from=result.resume_token )
Note:
All arguments must be passed as keywords when instantiating this class. The timestamp-based approach means that items created after the initial run but before the timestamp will be skipped. This is intentional to avoid duplicates and ensure consistent evaluation.
549 def __init__( 550 self, 551 *, 552 scope: str, 553 filter: Optional[str], 554 last_processed_timestamp: str, 555 last_processed_id: str, 556 items_processed: int, 557 ): 558 """Initialize BatchEvaluationResumeToken with the provided state. 559 560 Args: 561 scope: The scope type ("traces", "observations"). 562 filter: The original JSON filter string. 563 last_processed_timestamp: ISO 8601 timestamp of last processed item. 564 last_processed_id: ID of last processed item. 565 items_processed: Count of items processed before interruption. 566 567 Note: 568 All arguments must be provided as keywords. 569 """ 570 self.scope = scope 571 self.filter = filter 572 self.last_processed_timestamp = last_processed_timestamp 573 self.last_processed_id = last_processed_id 574 self.items_processed = items_processed
Initialize BatchEvaluationResumeToken with the provided state.
Arguments:
- scope: The scope type ("traces", "observations").
- filter: The original JSON filter string.
- last_processed_timestamp: ISO 8601 timestamp of last processed item.
- last_processed_id: ID of last processed item.
- items_processed: Count of items processed before interruption.
Note:
All arguments must be provided as keywords.
class BatchEvaluationResult:
    r"""Complete result structure for batch evaluation execution.

    Encapsulates comprehensive statistics and metadata about a batch
    evaluation run: item counts, per-evaluator metrics, timing
    information, error details, and resume capability for interrupted
    runs.

    Attributes:
        total_items_fetched: Total number of items fetched from the API.
        total_items_processed: Number of items successfully evaluated.
        total_items_failed: Number of items that failed during evaluation.
        total_scores_created: Total scores created by all item-level evaluators.
        total_composite_scores_created: Scores created by the composite evaluator.
        total_evaluations_failed: Number of individual evaluator failures across all items.
        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
        resume_token: Token for resuming if evaluation was interrupted (None if completed).
        completed: True if all items were processed, False if stopped early or failed.
        duration_seconds: Total time taken to execute the batch evaluation.
        failed_item_ids: List of IDs for items that failed evaluation.
        error_summary: Dictionary mapping error types to occurrence counts.
        has_more_items: True if the max_items limit was reached but more items exist.
        item_evaluations: Dictionary mapping item IDs to their evaluation
            results (both regular and composite).

    Examples:
        Basic result inspection:
        ```python
        result = client.run_batched_evaluation(...)

        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
        print(f"Scores created: {result.total_scores_created}")
        print(f"Duration: {result.duration_seconds:.2f}s")
        ```

        Handling incomplete runs:
        ```python
        result = client.run_batched_evaluation(...)

        if not result.completed and result.resume_token:
            print(f"Processed {result.resume_token.items_processed} items before failure")
            print(f"Resume from: {result.resume_token.last_processed_timestamp}")
        if result.has_more_items:
            print("More items available beyond max_items limit")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        total_items_fetched: int,
        total_items_processed: int,
        total_items_failed: int,
        total_scores_created: int,
        total_composite_scores_created: int,
        total_evaluations_failed: int,
        evaluator_stats: List[EvaluatorStats],
        resume_token: Optional[BatchEvaluationResumeToken],
        completed: bool,
        duration_seconds: float,
        failed_item_ids: List[str],
        error_summary: Dict[str, int],
        has_more_items: bool,
        item_evaluations: Dict[str, List["Evaluation"]],
    ):
        """Initialize BatchEvaluationResult with comprehensive statistics.

        Args:
            total_items_fetched: Total items fetched from API.
            total_items_processed: Items successfully evaluated.
            total_items_failed: Items that failed evaluation.
            total_scores_created: Scores from item-level evaluators.
            total_composite_scores_created: Scores from composite evaluator.
            total_evaluations_failed: Individual evaluator failures.
            evaluator_stats: Per-evaluator statistics.
            resume_token: Token for resuming (None if completed).
            completed: Whether all items were processed.
            duration_seconds: Total execution time.
            failed_item_ids: IDs of failed items.
            error_summary: Error types and counts.
            has_more_items: Whether more items exist beyond max_items.
            item_evaluations: Dictionary mapping item IDs to their evaluation results.

        Note:
            All arguments must be provided as keywords.
        """
        self.total_items_fetched = total_items_fetched
        self.total_items_processed = total_items_processed
        self.total_items_failed = total_items_failed
        self.total_scores_created = total_scores_created
        self.total_composite_scores_created = total_composite_scores_created
        self.total_evaluations_failed = total_evaluations_failed
        self.evaluator_stats = evaluator_stats
        self.resume_token = resume_token
        self.completed = completed
        self.duration_seconds = duration_seconds
        self.failed_item_ids = failed_item_ids
        self.error_summary = error_summary
        self.has_more_items = has_more_items
        self.item_evaluations = item_evaluations

    def __str__(self) -> str:
        """Return a formatted string representation of the batch evaluation results.

        Returns:
            A multi-line string with a summary of the evaluation results.
        """
        lines = []
        lines.append("=" * 60)
        lines.append("Batch Evaluation Results")
        lines.append("=" * 60)

        # Summary statistics
        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
        lines.append(f"Duration: {self.duration_seconds:.2f}s")
        lines.append(f"\nItems fetched: {self.total_items_fetched}")
        lines.append(f"Items processed: {self.total_items_processed}")

        if self.total_items_failed > 0:
            lines.append(f"Items failed: {self.total_items_failed}")

        # Success rate (guarded against division by zero on empty runs)
        if self.total_items_fetched > 0:
            success_rate = self.total_items_processed / self.total_items_fetched * 100
            lines.append(f"Success rate: {success_rate:.1f}%")

        # Scores created
        lines.append(f"\nScores created: {self.total_scores_created}")
        if self.total_composite_scores_created > 0:
            lines.append(f"Composite scores: {self.total_composite_scores_created}")

        total_scores = self.total_scores_created + self.total_composite_scores_created
        lines.append(f"Total scores: {total_scores}")

        # Evaluator statistics
        if self.evaluator_stats:
            lines.append("\nEvaluator Performance:")
            for stats in self.evaluator_stats:
                lines.append(f"  {stats.name}:")
                if stats.total_runs > 0:
                    success_rate = (
                        stats.successful_runs / stats.total_runs * 100
                        if stats.total_runs > 0
                        else 0
                    )
                    lines.append(
                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
                        f"({success_rate:.1f}% success)"
                    )
                lines.append(f"    Scores created: {stats.total_scores_created}")
                if stats.failed_runs > 0:
                    lines.append(f"    Failed runs: {stats.failed_runs}")

        # Performance metrics
        if self.total_items_processed > 0 and self.duration_seconds > 0:
            items_per_sec = self.total_items_processed / self.duration_seconds
            lines.append("\nPerformance:")
            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
            if self.total_scores_created > 0:
                avg_scores = self.total_scores_created / self.total_items_processed
                lines.append(f"  Avg scores per item: {avg_scores:.2f}")

        # Errors and warnings
        if self.error_summary:
            lines.append("\nErrors encountered:")
            for error_type, count in self.error_summary.items():
                lines.append(f"  {error_type}: {count}")

        # Incomplete run information
        if not self.completed:
            lines.append("\nWarning: Evaluation incomplete")
            if self.resume_token:
                lines.append(
                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
                )
                lines.append(f"  Items processed: {self.resume_token.items_processed}")
                lines.append("  Use resume_from parameter to continue")

        if self.has_more_items:
            lines.append("\nNote: More items available beyond max_items limit")

        lines.append("=" * 60)
        return "\n".join(lines)
Complete result structure for batch evaluation execution.
This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.
Attributes:
- total_items_fetched: Total number of items fetched from the API.
- total_items_processed: Number of items successfully evaluated.
- total_items_failed: Number of items that failed during evaluation.
- total_scores_created: Total scores created by all item-level evaluators.
- total_composite_scores_created: Scores created by the composite evaluator.
- total_evaluations_failed: Number of individual evaluator failures across all items.
- evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
- resume_token: Token for resuming if evaluation was interrupted (None if completed).
- completed: True if all items were processed, False if stopped early or failed.
- duration_seconds: Total time taken to execute the batch evaluation.
- failed_item_ids: List of IDs for items that failed evaluation.
- error_summary: Dictionary mapping error types to occurrence counts.
- has_more_items: True if max_items limit was reached but more items exist.
- item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:
Basic result inspection:
result = client.run_batched_evaluation(...) print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}") print(f"Scores created: {result.total_scores_created}") print(f"Duration: {result.duration_seconds:.2f}s") print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")Detailed analysis with evaluator stats:
result = client.run_batched_evaluation(...) print(f"\n📊 Batch Evaluation Results") print(f"{'='*50}") print(f"Items processed: {result.total_items_processed}") print(f"Items failed: {result.total_items_failed}") print(f"Scores created: {result.total_scores_created}") if result.total_composite_scores_created > 0: print(f"Composite scores: {result.total_composite_scores_created}") print(f"\n📊 Evaluator Performance:") for stats in result.evaluator_stats: success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0 print(f"\n  {stats.name}:") print(f"  Success rate: {success_rate:.1%}") print(f"  Scores created: {stats.total_scores_created}") if stats.failed_runs > 0: print(f"  ⚠️ Failures: {stats.failed_runs}") if result.error_summary: print(f"\n⚠️ Errors encountered:") for error_type, count in result.error_summary.items(): print(f"  {error_type}: {count}")

Handling incomplete runs:
result = client.run_batched_evaluation(...) if not result.completed: print("⚠️ Evaluation incomplete!") if result.resume_token: print(f"Processed {result.resume_token.items_processed} items before failure") print(f"Use resume_from parameter to continue from:") print(f"  Timestamp: {result.resume_token.last_processed_timestamp}") print(f"  Last ID: {result.resume_token.last_processed_id}") if result.has_more_items: print(f"ℹ️ More items available beyond max_items limit")

Performance monitoring:
result = client.run_batched_evaluation(...) items_per_second = result.total_items_processed / result.duration_seconds avg_scores_per_item = result.total_scores_created / result.total_items_processed print(f"Performance metrics:") print(f" Throughput: {items_per_second:.2f} items/second") print(f" Avg scores/item: {avg_scores_per_item:.2f}") print(f" Total duration: {result.duration_seconds:.2f}s") if result.total_evaluations_failed > 0: failure_rate = result.total_evaluations_failed / ( result.total_items_processed * len(result.evaluator_stats) ) print(f" Evaluation failure rate: {failure_rate:.1%}")
Note:
All arguments must be passed as keywords when instantiating this class.
679 def __init__( 680 self, 681 *, 682 total_items_fetched: int, 683 total_items_processed: int, 684 total_items_failed: int, 685 total_scores_created: int, 686 total_composite_scores_created: int, 687 total_evaluations_failed: int, 688 evaluator_stats: List[EvaluatorStats], 689 resume_token: Optional[BatchEvaluationResumeToken], 690 completed: bool, 691 duration_seconds: float, 692 failed_item_ids: List[str], 693 error_summary: Dict[str, int], 694 has_more_items: bool, 695 item_evaluations: Dict[str, List["Evaluation"]], 696 ): 697 """Initialize BatchEvaluationResult with comprehensive statistics. 698 699 Args: 700 total_items_fetched: Total items fetched from API. 701 total_items_processed: Items successfully evaluated. 702 total_items_failed: Items that failed evaluation. 703 total_scores_created: Scores from item-level evaluators. 704 total_composite_scores_created: Scores from composite evaluator. 705 total_evaluations_failed: Individual evaluator failures. 706 evaluator_stats: Per-evaluator statistics. 707 resume_token: Token for resuming (None if completed). 708 completed: Whether all items were processed. 709 duration_seconds: Total execution time. 710 failed_item_ids: IDs of failed items. 711 error_summary: Error types and counts. 712 has_more_items: Whether more items exist beyond max_items. 713 item_evaluations: Dictionary mapping item IDs to their evaluation results. 714 715 Note: 716 All arguments must be provided as keywords. 
717 """ 718 self.total_items_fetched = total_items_fetched 719 self.total_items_processed = total_items_processed 720 self.total_items_failed = total_items_failed 721 self.total_scores_created = total_scores_created 722 self.total_composite_scores_created = total_composite_scores_created 723 self.total_evaluations_failed = total_evaluations_failed 724 self.evaluator_stats = evaluator_stats 725 self.resume_token = resume_token 726 self.completed = completed 727 self.duration_seconds = duration_seconds 728 self.failed_item_ids = failed_item_ids 729 self.error_summary = error_summary 730 self.has_more_items = has_more_items 731 self.item_evaluations = item_evaluations
Initialize BatchEvaluationResult with comprehensive statistics.
Arguments:
- total_items_fetched: Total items fetched from API.
- total_items_processed: Items successfully evaluated.
- total_items_failed: Items that failed evaluation.
- total_scores_created: Scores from item-level evaluators.
- total_composite_scores_created: Scores from composite evaluator.
- total_evaluations_failed: Individual evaluator failures.
- evaluator_stats: Per-evaluator statistics.
- resume_token: Token for resuming (None if completed).
- completed: Whether all items were processed.
- duration_seconds: Total execution time.
- failed_item_ids: IDs of failed items.
- error_summary: Error types and counts.
- has_more_items: Whether more items exist beyond max_items.
- item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:
All arguments must be provided as keywords.
def is_default_export_span(span: ReadableSpan) -> bool:
    """Return whether a span should be exported by default."""
    # A span qualifies if any of these hold: it was created by the
    # Langfuse tracer, it carries GenAI semantic-convention attributes,
    # or it originates from a known LLM instrumentation scope.
    checks = (is_langfuse_span, is_genai_span, is_known_llm_instrumentor)
    return any(check(span) for check in checks)
Return whether a span should be exported by default.
def is_langfuse_span(span: ReadableSpan) -> bool:
    """Return whether the span was created by the Langfuse SDK tracer."""
    scope = span.instrumentation_scope
    # Spans without an instrumentation scope cannot be attributed to us.
    if scope is None:
        return False
    return scope.name == LANGFUSE_TRACER_NAME
Return whether the span was created by the Langfuse SDK tracer.
def is_genai_span(span: ReadableSpan) -> bool:
    """Return whether the span has any ``gen_ai.*`` semantic convention attribute."""
    attributes = span.attributes
    if attributes is None:
        return False

    # Attribute keys should be strings, but guard anyway before prefix-testing.
    for key in attributes.keys():
        if isinstance(key, str) and key.startswith("gen_ai"):
            return True
    return False
Return whether the span has any gen_ai.* semantic convention attribute.
def is_known_llm_instrumentor(span: ReadableSpan) -> bool:
    """Return whether the span comes from a known LLM instrumentation scope."""
    scope = span.instrumentation_scope
    if scope is None:
        return False

    # Match the scope name against every known instrumentation prefix.
    for prefix in KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES:
        if _matches_scope_prefix(scope.name, prefix):
            return True
    return False
Return whether the span comes from a known LLM instrumentation scope.