langfuse
1""".. include:: ../README.md""" 2 3from langfuse.batch_evaluation import ( 4 BatchEvaluationResult, 5 BatchEvaluationResumeToken, 6 CompositeEvaluatorFunction, 7 EvaluatorInputs, 8 EvaluatorStats, 9 MapperFunction, 10) 11from langfuse.experiment import Evaluation, RegressionError, RunnerContext 12 13from ._client import client as _client_module 14from ._client.attributes import LangfuseOtelSpanAttributes 15from ._client.constants import ObservationTypeLiteral 16from ._client.get_client import get_client 17from ._client.observe import observe 18from ._client.propagation import propagate_attributes 19from ._client.span import ( 20 LangfuseAgent, 21 LangfuseChain, 22 LangfuseEmbedding, 23 LangfuseEvaluator, 24 LangfuseEvent, 25 LangfuseGeneration, 26 LangfuseGuardrail, 27 LangfuseRetriever, 28 LangfuseSpan, 29 LangfuseTool, 30) 31from ._version import __version__ 32from .span_filter import ( 33 KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES, 34 is_default_export_span, 35 is_genai_span, 36 is_known_llm_instrumentor, 37 is_langfuse_span, 38) 39 40Langfuse = _client_module.Langfuse 41 42__all__ = [ 43 "Langfuse", 44 "get_client", 45 "observe", 46 "propagate_attributes", 47 "ObservationTypeLiteral", 48 "LangfuseSpan", 49 "LangfuseGeneration", 50 "LangfuseEvent", 51 "LangfuseOtelSpanAttributes", 52 "LangfuseAgent", 53 "LangfuseTool", 54 "LangfuseChain", 55 "LangfuseEmbedding", 56 "LangfuseEvaluator", 57 "LangfuseRetriever", 58 "LangfuseGuardrail", 59 "Evaluation", 60 "EvaluatorInputs", 61 "MapperFunction", 62 "CompositeEvaluatorFunction", 63 "EvaluatorStats", 64 "BatchEvaluationResumeToken", 65 "BatchEvaluationResult", 66 "RunnerContext", 67 "RegressionError", 68 "__version__", 69 "is_default_export_span", 70 "is_langfuse_span", 71 "is_genai_span", 72 "is_known_llm_instrumentor", 73 "KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES", 74 "experiment", 75 "api", 76]
class Langfuse:
    """Main client for Langfuse tracing and platform features.

    This class provides an interface for creating and managing traces, spans,
    and generations in Langfuse as well as interacting with the Langfuse API.

    The client features a thread-safe singleton pattern for each unique public API key,
    ensuring consistent trace context propagation across your application. It implements
    efficient batching of spans with configurable flush settings and includes background
    thread management for media uploads and score ingestion.

    Configuration is flexible through either direct parameters or environment variables,
    with graceful fallbacks and runtime configuration updates.

    If no public key or no secret key can be resolved, the client logs a warning,
    installs a no-op tracer and returns early; in that case the `api` and
    `async_api` attributes are not set.

    Attributes:
        api: Synchronous API client for Langfuse backend communication
        async_api: Asynchronous API client for Langfuse backend communication
        _otel_tracer: OpenTelemetry tracer used to create spans (a NoOpTracer when the client or tracing is disabled)

    Parameters:
        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds. Can also be set via LANGFUSE_TIMEOUT environment variable.
        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release. Falls back to the LANGFUSE_RELEASE environment variable, then to `get_common_release_envs()`.
        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable. An explicitly passed value, including 0.0, takes precedence over the environment variable.
        mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
        blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:
            ```python
            from langfuse.span_filter import is_default_export_span
            blocked = {"sqlite", "requests"}

            should_export_span = lambda span: (
                is_default_export_span(span)
                and (
                    span.instrumentation_scope is None
                    or span.instrumentation_scope.name not in blocked
                )
            )
            ```
        should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If `span_exporter` is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
        tracer_provider(Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
        span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire `base_url`, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include `x-langfuse-ingestion-version=4` on the exporter to enable real time processing of exported spans.

    Example:
        ```python
        from langfuse import Langfuse

        # Initialize the client (reads from env vars if not provided)
        langfuse = Langfuse(
            public_key="your-public-key",
            secret_key="your-secret-key",
            base_url="https://cloud.langfuse.com",  # Optional, default shown
        )

        # Create a trace span
        with langfuse.start_as_current_observation(name="process-query") as span:
            # Your application code here

            # Create a nested generation observation for an LLM call
            with span.start_as_current_observation(
                name="generate-response",
                as_type="generation",
                model="gpt-4",
                input={"query": "Tell me about AI"},
                model_parameters={"temperature": 0.7, "max_tokens": 500}
            ) as generation:
                # Generate response here
                response = "AI is a field of computer science..."

                generation.update(
                    output=response,
                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
                    cost_details={"total_cost": 0.0023}
                )

                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
        ```
    """

    _resources: Optional[LangfuseResourceManager] = None
    _mask: Optional[MaskFunction] = None
    _otel_tracer: otel_trace_api.Tracer

    def __init__(
        self,
        *,
        public_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        base_url: Optional[str] = None,
        host: Optional[str] = None,
        timeout: Optional[int] = None,
        httpx_client: Optional[httpx.Client] = None,
        debug: bool = False,
        tracing_enabled: Optional[bool] = True,
        flush_at: Optional[int] = None,
        flush_interval: Optional[float] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        media_upload_thread_count: Optional[int] = None,
        sample_rate: Optional[float] = None,
        mask: Optional[MaskFunction] = None,
        blocked_instrumentation_scopes: Optional[List[str]] = None,
        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
        additional_headers: Optional[Dict[str, str]] = None,
        tracer_provider: Optional[TracerProvider] = None,
        span_exporter: Optional[SpanExporter] = None,
    ):
        """Resolve configuration and set up the tracer and API clients.

        Parameters are documented on the class. Explicit arguments take
        precedence over environment variables.

        Raises:
            ValueError: If the resolved sample rate is outside [0.0, 1.0].
        """
        # Precedence: base_url arg > LANGFUSE_BASE_URL > deprecated host arg
        # > LANGFUSE_HOST > cloud default.
        self._base_url = (
            base_url
            or os.environ.get(LANGFUSE_BASE_URL)
            or host
            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
        )
        self._environment = environment or cast(
            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
        )
        self._release = (
            release
            or os.environ.get(LANGFUSE_RELEASE, None)
            or get_common_release_envs()
        )
        self._project_id: Optional[str] = None

        # Compare against None rather than relying on truthiness: an explicit
        # sample_rate of 0.0 is a valid request to sample nothing and must not
        # be replaced by the environment variable or the 1.0 default.
        if sample_rate is None:
            sample_rate = float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
        if not 0.0 <= sample_rate <= 1.0:
            raise ValueError(
                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
            )

        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))

        # Tracing is on only if the argument is truthy AND the environment
        # variable is not "false"; a falsy argument (including None) disables it.
        self._tracing_enabled = (
            tracing_enabled
            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
        )
        if not self._tracing_enabled:
            langfuse_logger.info(
                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
            )

        debug = (
            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
        )
        if debug:
            logging.basicConfig(
                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            )
            langfuse_logger.setLevel(logging.DEBUG)

        # Without credentials the client degrades to a no-op tracer and returns
        # before `api` / `async_api` are assigned.
        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
        if public_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
            )
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
        if secret_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
            )
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        # Only warns; initialization continues below.
        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
            langfuse_logger.warning(
                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
            )

        if blocked_instrumentation_scopes is not None:
            warnings.warn(
                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
                "Use `should_export_span` instead. Example: "
                "from langfuse.span_filter import is_default_export_span; "
                'blocked={"scope"}; should_export_span=lambda span: '
                "is_default_export_span(span) and (span.instrumentation_scope is None or "
                "span.instrumentation_scope.name not in blocked).",
                DeprecationWarning,
                stacklevel=2,
            )

        # Initialize api and tracer if requirements are met
        # NOTE(review): the raw `release` argument is forwarded here, not the
        # env-resolved `self._release` -- confirm this is intentional.
        self._resources = LangfuseResourceManager(
            public_key=public_key,
            secret_key=secret_key,
            base_url=self._base_url,
            timeout=timeout,
            environment=self._environment,
            release=release,
            flush_at=flush_at,
            flush_interval=flush_interval,
            httpx_client=httpx_client,
            media_upload_thread_count=media_upload_thread_count,
            sample_rate=sample_rate,
            mask=mask,
            tracing_enabled=self._tracing_enabled,
            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
            should_export_span=should_export_span,
            additional_headers=additional_headers,
            tracer_provider=tracer_provider,
            span_exporter=span_exporter,
        )
        self._mask = self._resources.mask

        self._otel_tracer = (
            self._resources.tracer
            if self._tracing_enabled and self._resources.tracer is not None
            else otel_trace_api.NoOpTracer()
        )
        self.api = self._resources.api
        self.async_api = self._resources.async_api

    # Typing overload: as_type="generation" yields a LangfuseGeneration and
    # accepts the generation-specific fields (model, usage, cost, prompt, ...).
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseGeneration: ...

    # Typing overload: as_type="span" (the default) yields a LangfuseSpan.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseSpan: ...

    # Typing overload: as_type="agent" yields a LangfuseAgent.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseAgent: ...

    # Typing overload: as_type="tool" yields a LangfuseTool.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["tool"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseTool: ...

    # Typing overload: as_type="chain" yields a LangfuseChain.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseChain: ...

    # Typing overload: as_type="retriever" yields a LangfuseRetriever.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseRetriever: ...

    # Typing overload: as_type="evaluator" yields a LangfuseEvaluator.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseEvaluator: ...

    # Typing overload: as_type="embedding" yields a LangfuseEmbedding and, like
    # the "generation" overload, accepts model/usage/cost/prompt fields.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseEmbedding: ...

    # Typing overload: as_type="guardrail" yields a LangfuseGuardrail.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseGuardrail: ...
512 513 def start_observation( 514 self, 515 *, 516 trace_context: Optional[TraceContext] = None, 517 name: str, 518 as_type: ObservationTypeLiteralNoEvent = "span", 519 input: Optional[Any] = None, 520 output: Optional[Any] = None, 521 metadata: Optional[Any] = None, 522 version: Optional[str] = None, 523 level: Optional[SpanLevel] = None, 524 status_message: Optional[str] = None, 525 completion_start_time: Optional[datetime] = None, 526 model: Optional[str] = None, 527 model_parameters: Optional[Dict[str, MapValue]] = None, 528 usage_details: Optional[Dict[str, int]] = None, 529 cost_details: Optional[Dict[str, float]] = None, 530 prompt: Optional[PromptClient] = None, 531 ) -> Union[ 532 LangfuseSpan, 533 LangfuseGeneration, 534 LangfuseAgent, 535 LangfuseTool, 536 LangfuseChain, 537 LangfuseRetriever, 538 LangfuseEvaluator, 539 LangfuseEmbedding, 540 LangfuseGuardrail, 541 ]: 542 """Create a new observation of the specified type. 543 544 This method creates a new observation but does not set it as the current span in the 545 context. To create and use an observation within a context, use start_as_current_observation(). 
546 547 Args: 548 trace_context: Optional context for connecting to an existing trace 549 name: Name of the observation 550 as_type: Type of observation to create (defaults to "span") 551 input: Input data for the operation 552 output: Output data from the operation 553 metadata: Additional metadata to associate with the observation 554 version: Version identifier for the code or component 555 level: Importance level of the observation 556 status_message: Optional status message for the observation 557 completion_start_time: When the model started generating (for generation types) 558 model: Name/identifier of the AI model used (for generation types) 559 model_parameters: Parameters used for the model (for generation types) 560 usage_details: Token usage information (for generation types) 561 cost_details: Cost information (for generation types) 562 prompt: Associated prompt template (for generation types) 563 564 Returns: 565 An observation object of the appropriate type that must be ended with .end() 566 """ 567 if trace_context: 568 trace_id = trace_context.get("trace_id", None) 569 parent_span_id = trace_context.get("parent_span_id", None) 570 571 if trace_id: 572 remote_parent_span = self._create_remote_parent_span( 573 trace_id=trace_id, parent_span_id=parent_span_id 574 ) 575 576 with otel_trace_api.use_span( 577 cast(otel_trace_api.Span, remote_parent_span) 578 ): 579 otel_span = self._otel_tracer.start_span(name=name) 580 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 581 582 return self._create_observation_from_otel_span( 583 otel_span=otel_span, 584 as_type=as_type, 585 input=input, 586 output=output, 587 metadata=metadata, 588 version=version, 589 level=level, 590 status_message=status_message, 591 completion_start_time=completion_start_time, 592 model=model, 593 model_parameters=model_parameters, 594 usage_details=usage_details, 595 cost_details=cost_details, 596 prompt=prompt, 597 ) 598 599 otel_span = 
self._otel_tracer.start_span(name=name) 600 601 return self._create_observation_from_otel_span( 602 otel_span=otel_span, 603 as_type=as_type, 604 input=input, 605 output=output, 606 metadata=metadata, 607 version=version, 608 level=level, 609 status_message=status_message, 610 completion_start_time=completion_start_time, 611 model=model, 612 model_parameters=model_parameters, 613 usage_details=usage_details, 614 cost_details=cost_details, 615 prompt=prompt, 616 ) 617 618 def _create_observation_from_otel_span( 619 self, 620 *, 621 otel_span: otel_trace_api.Span, 622 as_type: ObservationTypeLiteralNoEvent, 623 input: Optional[Any] = None, 624 output: Optional[Any] = None, 625 metadata: Optional[Any] = None, 626 version: Optional[str] = None, 627 level: Optional[SpanLevel] = None, 628 status_message: Optional[str] = None, 629 completion_start_time: Optional[datetime] = None, 630 model: Optional[str] = None, 631 model_parameters: Optional[Dict[str, MapValue]] = None, 632 usage_details: Optional[Dict[str, int]] = None, 633 cost_details: Optional[Dict[str, float]] = None, 634 prompt: Optional[PromptClient] = None, 635 ) -> Union[ 636 LangfuseSpan, 637 LangfuseGeneration, 638 LangfuseAgent, 639 LangfuseTool, 640 LangfuseChain, 641 LangfuseRetriever, 642 LangfuseEvaluator, 643 LangfuseEmbedding, 644 LangfuseGuardrail, 645 ]: 646 """Create the appropriate observation type from an OTEL span.""" 647 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 648 observation_class = self._get_span_class(as_type) 649 # Type ignore to prevent overloads of internal _get_span_class function, 650 # issue is that LangfuseEvent could be returned and that classes have diff. 
args 651 return observation_class( # type: ignore[return-value,call-arg] 652 otel_span=otel_span, 653 langfuse_client=self, 654 environment=self._environment, 655 release=self._release, 656 input=input, 657 output=output, 658 metadata=metadata, 659 version=version, 660 level=level, 661 status_message=status_message, 662 completion_start_time=completion_start_time, 663 model=model, 664 model_parameters=model_parameters, 665 usage_details=usage_details, 666 cost_details=cost_details, 667 prompt=prompt, 668 ) 669 else: 670 # For other types (e.g. span, guardrail), create appropriate class without generation properties 671 observation_class = self._get_span_class(as_type) 672 # Type ignore to prevent overloads of internal _get_span_class function, 673 # issue is that LangfuseEvent could be returned and that classes have diff. args 674 return observation_class( # type: ignore[return-value,call-arg] 675 otel_span=otel_span, 676 langfuse_client=self, 677 environment=self._environment, 678 release=self._release, 679 input=input, 680 output=output, 681 metadata=metadata, 682 version=version, 683 level=level, 684 status_message=status_message, 685 ) 686 # span._observation_type = as_type 687 # span._otel_span.set_attribute("langfuse.observation.type", as_type) 688 # return span 689 690 @overload 691 def start_as_current_observation( 692 self, 693 *, 694 trace_context: Optional[TraceContext] = None, 695 name: str, 696 as_type: Literal["generation"], 697 input: Optional[Any] = None, 698 output: Optional[Any] = None, 699 metadata: Optional[Any] = None, 700 version: Optional[str] = None, 701 level: Optional[SpanLevel] = None, 702 status_message: Optional[str] = None, 703 completion_start_time: Optional[datetime] = None, 704 model: Optional[str] = None, 705 model_parameters: Optional[Dict[str, MapValue]] = None, 706 usage_details: Optional[Dict[str, int]] = None, 707 cost_details: Optional[Dict[str, float]] = None, 708 prompt: Optional[PromptClient] = None, 709 end_on_exit: 
Optional[bool] = None, 710 ) -> _AgnosticContextManager[LangfuseGeneration]: ... 711 712 @overload 713 def start_as_current_observation( 714 self, 715 *, 716 trace_context: Optional[TraceContext] = None, 717 name: str, 718 as_type: Literal["span"] = "span", 719 input: Optional[Any] = None, 720 output: Optional[Any] = None, 721 metadata: Optional[Any] = None, 722 version: Optional[str] = None, 723 level: Optional[SpanLevel] = None, 724 status_message: Optional[str] = None, 725 end_on_exit: Optional[bool] = None, 726 ) -> _AgnosticContextManager[LangfuseSpan]: ... 727 728 @overload 729 def start_as_current_observation( 730 self, 731 *, 732 trace_context: Optional[TraceContext] = None, 733 name: str, 734 as_type: Literal["agent"], 735 input: Optional[Any] = None, 736 output: Optional[Any] = None, 737 metadata: Optional[Any] = None, 738 version: Optional[str] = None, 739 level: Optional[SpanLevel] = None, 740 status_message: Optional[str] = None, 741 end_on_exit: Optional[bool] = None, 742 ) -> _AgnosticContextManager[LangfuseAgent]: ... 743 744 @overload 745 def start_as_current_observation( 746 self, 747 *, 748 trace_context: Optional[TraceContext] = None, 749 name: str, 750 as_type: Literal["tool"], 751 input: Optional[Any] = None, 752 output: Optional[Any] = None, 753 metadata: Optional[Any] = None, 754 version: Optional[str] = None, 755 level: Optional[SpanLevel] = None, 756 status_message: Optional[str] = None, 757 end_on_exit: Optional[bool] = None, 758 ) -> _AgnosticContextManager[LangfuseTool]: ... 

    # Typing overload: as_type="chain" yields a context manager over LangfuseChain.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseChain]: ...

    # Typing overload: as_type="retriever" yields a context manager over LangfuseRetriever.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseRetriever]: ...

    # Typing overload: as_type="evaluator" yields a context manager over LangfuseEvaluator.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEvaluator]: ...

    # Typing overload: as_type="embedding" yields a context manager over
    # LangfuseEmbedding and, like "generation", accepts model/usage/cost/prompt fields.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEmbedding]: ...

    # Typing overload: as_type="guardrail" yields a context manager over LangfuseGuardrail.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGuardrail]: ...
def start_as_current_observation(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    as_type: ObservationTypeLiteralNoEvent = "span",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
    end_on_exit: Optional[bool] = None,
) -> Union[
    _AgnosticContextManager[LangfuseGeneration],
    _AgnosticContextManager[LangfuseSpan],
    _AgnosticContextManager[LangfuseAgent],
    _AgnosticContextManager[LangfuseTool],
    _AgnosticContextManager[LangfuseChain],
    _AgnosticContextManager[LangfuseRetriever],
    _AgnosticContextManager[LangfuseEvaluator],
    _AgnosticContextManager[LangfuseEmbedding],
    _AgnosticContextManager[LangfuseGuardrail],
]:
    """Create a new observation and set it as the current span in a context manager.

    Use this method with a ``with`` statement to handle the observation
    lifecycle within a code block. Unless ``trace_context`` carries a trace ID,
    the observation becomes a child of the current span in the context.

    Args:
        trace_context: Optional context for connecting to an existing trace.
            Only used when it contains a non-empty ``trace_id``.
        name: Name of the observation (e.g., function or operation name)
        as_type: Type of observation to create (defaults to "span")
        input: Input data for the operation (can be any JSON-serializable object)
        output: Output data from the operation (can be any JSON-serializable object)
        metadata: Additional metadata to associate with the observation
        version: Version identifier for the code or component
        level: Importance level of the observation (info, warning, error)
        status_message: Optional status message for the observation
        end_on_exit (default: True): Whether to end the span automatically when
            leaving the context manager. If False, the span must be manually
            ended to avoid memory leaks.

        The following parameters are only forwarded when as_type is
        generation-like ("generation" or "embedding"):
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management

    Returns:
        A context manager that yields the appropriate observation type based on as_type

    Example:
        ```python
        # Create a span
        with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
            # Do work
            result = process_data()
            span.update(output=result)

            # Create a child span automatically
            with span.start_as_current_observation(name="sub-operation") as child_span:
                # Do sub-operation work
                child_span.update(output="sub-result")

            # Create a tool observation
            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
                # Do tool work
                results = search_web(query)
                tool.update(output=results)

        # Create a generation observation
        with langfuse.start_as_current_observation(
            name="answer-generation",
            as_type="generation",
            model="gpt-4"
        ) as generation:
            # Generate answer
            response = llm.generate(...)
            generation.update(output=response)
        ```
    """
    # Arguments shared by every observation type.
    observation_kwargs: Dict[str, Any] = {
        "as_type": as_type,
        "name": name,
        "end_on_exit": end_on_exit,
        "input": input,
        "output": output,
        "metadata": metadata,
        "version": version,
        "level": level,
        "status_message": status_message,
    }

    if as_type in get_observation_types_list(ObservationTypeGenerationLike):
        # Model-related fields are only meaningful for generation-like types.
        observation_kwargs.update(
            {
                "completion_start_time": completion_start_time,
                "model": model,
                "model_parameters": model_parameters,
                "usage_details": usage_details,
                "cost_details": cost_details,
                "prompt": prompt,
            }
        )
    elif as_type not in get_observation_types_list(ObservationTypeSpanLike):
        # This should never be reached since all valid types are handled above
        langfuse_logger.warning(
            f"Unknown observation type: {as_type}, falling back to span"
        )
        observation_kwargs["as_type"] = "span"
        # The fallback deliberately ignores trace_context, as before.
        return self._start_as_current_otel_span_with_processed_media(
            **observation_kwargs
        )

    if trace_context:
        trace_id = trace_context.get("trace_id", None)
        parent_span_id = trace_context.get("parent_span_id", None)

        if trace_id:
            remote_parent_span = self._create_remote_parent_span(
                trace_id=trace_id, parent_span_id=parent_span_id
            )

            return self._create_span_with_parent_context(
                remote_parent_span=remote_parent_span,
                parent=None,
                **observation_kwargs,
            )

    return self._start_as_current_otel_span_with_processed_media(
        **observation_kwargs
    )


def _get_span_class(
    self,
    as_type: ObservationTypeLiteral,
) -> Union[
    Type[LangfuseAgent],
    Type[LangfuseTool],
    Type[LangfuseChain],
    Type[LangfuseRetriever],
    Type[LangfuseEvaluator],
    Type[LangfuseEmbedding],
    Type[LangfuseGuardrail],
    Type[LangfuseGeneration],
    Type[LangfuseEvent],
    Type[LangfuseSpan],
]:
    """Get the appropriate span class based on as_type.

    The lookup is case-insensitive; unrecognised values map to LangfuseSpan.
    """
    span_classes = {
        "agent": LangfuseAgent,
        "tool": LangfuseTool,
        "chain": LangfuseChain,
        "retriever": LangfuseRetriever,
        "evaluator": LangfuseEvaluator,
        "embedding": LangfuseEmbedding,
        "guardrail": LangfuseGuardrail,
        "generation": LangfuseGeneration,
        "event": LangfuseEvent,
        "span": LangfuseSpan,
    }

    return span_classes.get(as_type.lower(), LangfuseSpan)


@_agnosticcontextmanager
def _create_span_with_parent_context(
    self,
    *,
    name: str,
    parent: Optional[otel_trace_api.Span] = None,
    remote_parent_span: Optional[otel_trace_api.Span] = None,
    as_type: ObservationTypeLiteralNoEvent,
    end_on_exit: Optional[bool] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> Any:
    """Start an observation as current span underneath an explicit parent span.

    ``parent`` takes precedence over ``remote_parent_span``. When a remote
    parent span is given, the new span is additionally flagged with the
    AS_ROOT attribute. The remaining arguments are forwarded unchanged to
    ``_start_as_current_otel_span_with_processed_media``.

    Yields:
        The Langfuse observation wrapper for the newly started span.
    """
    # Callers in this class always supply one of the two parents.
    parent_span = parent or cast(otel_trace_api.Span, remote_parent_span)

    with otel_trace_api.use_span(parent_span):
        with self._start_as_current_otel_span_with_processed_media(
            name=name,
            as_type=as_type,
            end_on_exit=end_on_exit,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
            completion_start_time=completion_start_time,
            model=model,
            model_parameters=model_parameters,
            usage_details=usage_details,
            cost_details=cost_details,
            prompt=prompt,
        ) as langfuse_span:
            if remote_parent_span is not None:
                langfuse_span._otel_span.set_attribute(
                    LangfuseOtelSpanAttributes.AS_ROOT, True
                )

            yield langfuse_span


@_agnosticcontextmanager
def _start_as_current_otel_span_with_processed_media(
    self,
    *,
    name: str,
    as_type: Optional[ObservationTypeLiteralNoEvent] = None,
    end_on_exit: Optional[bool] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> Any:
    """Start an OTel span as current span and yield its Langfuse wrapper.

    ``end_on_exit`` defaults to True when None. ``as_type`` defaults to
    "generation" when None. The model-related arguments are only passed to
    the wrapper for generation and embedding observations.

    Yields:
        An instance of the class selected by ``_get_span_class``.
    """
    with self._otel_tracer.start_as_current_span(
        name=name,
        end_on_exit=end_on_exit if end_on_exit is not None else True,
    ) as otel_span:
        baggage_token = None

        if otel_span.is_recording():
            context_with_app_root_claim = _set_langfuse_trace_id_in_baggage(
                trace_id=self._get_otel_trace_id(otel_span),
                context=otel_context_api.get_current(),
            )
            baggage_token = otel_context_api.attach(context_with_app_root_claim)

        try:
            # Resolved inside the try block so that the attached baggage
            # context is detached even if the lookup fails.
            span_class = self._get_span_class(
                as_type or "generation"
            )  # default was "generation"

            common_args = {
                "otel_span": otel_span,
                "langfuse_client": self,
                "environment": self._environment,
                "release": self._release,
                "input": input,
                "output": output,
                "metadata": metadata,
                "version": version,
                "level": level,
                "status_message": status_message,
            }

            if span_class in [
                LangfuseGeneration,
                LangfuseEmbedding,
            ]:
                common_args.update(
                    {
                        "completion_start_time": completion_start_time,
                        "model": model,
                        "model_parameters": model_parameters,
                        "usage_details": usage_details,
                        "cost_details": cost_details,
                        "prompt": prompt,
                    }
                )
            # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed

            yield span_class(**common_args)  # type: ignore[arg-type]

        finally:
            if baggage_token is not None:
                _detach_context_token_safely(baggage_token)


def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]:
    """Return the current OTel span, or None (with a warning) if there is none."""
    current_span = otel_trace_api.get_current_span()

    if current_span is otel_trace_api.INVALID_SPAN:
        langfuse_logger.warning(
            "Context error: No active span in current context. Operations that depend on an active span will be skipped. "
            "Ensure spans are created with start_as_current_observation() or that you're operating within an active span context."
        )
        return None

    return current_span


def update_current_generation(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> None:
    """Update the current active generation span with new information.

    Useful for adding output, usage stats, or other details that become
    available during or after model generation. Does nothing when tracing is
    disabled or when there is no active span.

    Args:
        name: The generation name (only applied when non-empty)
        input: Updated input data for the model
        output: Output from the model (e.g., completions)
        metadata: Additional metadata to associate with the generation
        version: Version identifier for the model or component
        level: Importance level of the generation (info, warning, error)
        status_message: Optional status message for the generation
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management

    Example:
        ```python
        with langfuse.start_as_current_observation(
            name="answer-query", as_type="generation"
        ) as generation:
            # Initial setup and API call
            response = llm.generate(...)

            # Update with results that weren't available at creation time
            langfuse.update_current_generation(
                output=response.text,
                usage_details={
                    "prompt_tokens": response.usage.prompt_tokens,
                    "completion_tokens": response.usage.completion_tokens
                }
            )
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
        )
        return

    current_otel_span = self._get_current_otel_span()

    if current_otel_span is not None:
        # Pass environment and release like update_current_span does, so both
        # update helpers wrap the current span with the same client settings.
        generation = LangfuseGeneration(
            otel_span=current_otel_span,
            langfuse_client=self,
            environment=self._environment,
            release=self._release,
        )

        if name:
            current_otel_span.update_name(name)

        generation.update(
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
            completion_start_time=completion_start_time,
            model=model,
            model_parameters=model_parameters,
            usage_details=usage_details,
            cost_details=cost_details,
            prompt=prompt,
        )


def update_current_span(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> None:
    """Update the current active span with new information.

    Useful for adding outputs or metadata that become available during
    execution. Does nothing when tracing is disabled or when there is no
    active span.

    Args:
        name: The span name (only applied when non-empty)
        input: Updated input data for the operation
        output: Output data from the operation
        metadata: Additional metadata to associate with the span
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-data") as span:
            # Initial processing
            result = process_first_part()

            # Update with intermediate results
            langfuse.update_current_span(metadata={"intermediate_result": result})

            # Continue processing
            final_result = process_second_part(result)

            # Final update
            langfuse.update_current_span(output=final_result)
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
        )
        return

    current_otel_span = self._get_current_otel_span()

    if current_otel_span is not None:
        span = LangfuseSpan(
            otel_span=current_otel_span,
            langfuse_client=self,
            environment=self._environment,
            release=self._release,
        )

        if name:
            current_otel_span.update_name(name)

        span.update(
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
        )


@deprecated(
    "Trace-level input/output is deprecated. "
    "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
    "This method will be removed in a future major version."
)
def set_current_trace_io(
    self,
    *,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
) -> None:
    """Set trace-level input and output for the current span's trace.

    .. deprecated::
        This is a legacy method for backward compatibility with Langfuse platform
        features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge
        evaluators). It will be removed in a future major version.

        For setting other trace attributes (user_id, session_id, metadata, tags, version),
        use :meth:`propagate_attributes` instead.

    Does nothing when tracing is disabled, when there is no active span, or
    when the active span is not recording.

    Args:
        input: Input data to associate with the trace.
        output: Output data to associate with the trace.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
        )
        return

    current_otel_span = self._get_current_otel_span()

    if current_otel_span is not None and current_otel_span.is_recording():
        existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
            LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
        )
        # We need to preserve the class to keep the correct observation type
        span_class = self._get_span_class(existing_observation_type)
        span = span_class(
            otel_span=current_otel_span,
            langfuse_client=self,
            environment=self._environment,
            release=self._release,
        )

        span.set_trace_io(
            input=input,
            output=output,
        )


def set_current_trace_as_public(self) -> None:
    """Make the current trace publicly accessible via its URL.

    When a trace is published, anyone with the trace link can view the full trace
    without needing to be logged in to Langfuse. This action cannot be undone
    programmatically - once published, the entire trace becomes public.

    This is a convenience method that publishes the trace from the currently
    active span context. Use this when you want to make a trace public from
    within a traced function without needing direct access to the span object.

    Does nothing when tracing is disabled, when there is no active span, or
    when the active span is not recording.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
        )
        return

    current_otel_span = self._get_current_otel_span()

    if current_otel_span is not None and current_otel_span.is_recording():
        existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
            LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
        )
        # We need to preserve the class to keep the correct observation type
        span_class = self._get_span_class(existing_observation_type)
        # release is passed as well, matching set_current_trace_io.
        span = span_class(
            otel_span=current_otel_span,
            langfuse_client=self,
            environment=self._environment,
            release=self._release,
        )

        span.set_trace_as_public()


def create_event(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> LangfuseEvent:
    """Create a new Langfuse observation of type 'EVENT'.

    Unless ``trace_context`` carries a trace ID, the created event will be the
    child of the current span in the context. The event starts and ends at the
    same timestamp, taken when this method is called.

    Args:
        trace_context: Optional context for connecting to an existing trace.
            Only used when it contains a non-empty ``trace_id``.
        name: Name of the event (e.g., function or operation name)
        input: Input data for the operation (can be any JSON-serializable object)
        output: Output data from the operation (can be any JSON-serializable object)
        metadata: Additional metadata to associate with the event
        version: Version identifier for the code or component
        level: Importance level of the event (info, warning, error)
        status_message: Optional status message for the event

    Returns:
        The Langfuse Event object

    Example:
        ```python
        event = langfuse.create_event(name="process-event")
        ```
    """
    timestamp = time_ns()

    def _finalize_event(otel_span: otel_trace_api.Span) -> LangfuseEvent:
        # Wrap the span as an event and end it at its own start time.
        return cast(
            LangfuseEvent,
            LangfuseEvent(
                otel_span=otel_span,
                langfuse_client=self,
                environment=self._environment,
                release=self._release,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
            ).end(end_time=timestamp),
        )

    if trace_context:
        trace_id = trace_context.get("trace_id", None)
        parent_span_id = trace_context.get("parent_span_id", None)

        if trace_id:
            remote_parent_span = self._create_remote_parent_span(
                trace_id=trace_id, parent_span_id=parent_span_id
            )

            with otel_trace_api.use_span(
                cast(otel_trace_api.Span, remote_parent_span)
            ):
                otel_span = self._otel_tracer.start_span(
                    name=name, start_time=timestamp
                )
                otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                return _finalize_event(otel_span)

    return _finalize_event(
        self._otel_tracer.start_span(name=name, start_time=timestamp)
    )


def _create_remote_parent_span(
    self, *, trace_id: str, parent_span_id: Optional[str]
) -> Any:
    """Build a non-recording span that stands in for an externally given parent.

    Args:
        trace_id: Hexadecimal trace ID of the trace to attach to.
        parent_span_id: Hexadecimal span ID of the parent. A random span ID is
            generated when it is empty or None.

    Returns:
        A NonRecordingSpan whose span context is marked as sampled.
    """
    # NOTE(review): the warnings below say the ID is ignored, but the values
    # are still parsed and used afterwards; a non-hex string raises
    # ValueError in int(). Confirm the intended behavior before changing it.
    if not self._is_valid_trace_id(trace_id):
        langfuse_logger.warning(
            f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID."
        )

    if parent_span_id and not self._is_valid_span_id(parent_span_id):
        langfuse_logger.warning(
            f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID."
        )

    int_trace_id = int(trace_id, 16)
    int_parent_span_id = (
        int(parent_span_id, 16)
        if parent_span_id
        else RandomIdGenerator().generate_span_id()
    )

    span_context = otel_trace_api.SpanContext(
        trace_id=int_trace_id,
        span_id=int_parent_span_id,
        trace_flags=otel_trace_api.TraceFlags(0x01),  # mark span as sampled
        is_remote=False,
    )

    return otel_trace_api.NonRecordingSpan(span_context)


def _is_valid_trace_id(self, trace_id: str) -> bool:
    """Return True if trace_id is exactly 32 lowercase hexadecimal characters."""
    # fullmatch instead of match with "$": "$" also matches just before a
    # trailing newline, which would let "<32 hex chars>\n" pass as valid.
    return re.fullmatch(r"[0-9a-f]{32}", trace_id) is not None


def _is_valid_span_id(self, span_id: str) -> bool:
    """Return True if span_id is exactly 16 lowercase hexadecimal characters."""
    # fullmatch for the same trailing-newline reason as _is_valid_trace_id.
    return re.fullmatch(r"[0-9a-f]{16}", span_id) is not None


def _create_observation_id(self, *, seed: Optional[str] = None) -> str:
    """Create a unique observation ID for use with Langfuse.

    This method generates an observation ID (span ID in OpenTelemetry terms).
    It can either generate a random ID or create a deterministic ID based on a
    seed string.

    Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes.
    If you need to correlate an external ID with a Langfuse observation ID, use
    the external ID as the seed to get a valid, deterministic observation ID.

    Args:
        seed: Optional string to use as a seed for deterministic ID generation.
            If provided, the same seed will always produce the same ID (the
            first 8 bytes of its SHA-256 digest).
            If empty or not provided, a random ID will be generated.

    Returns:
        A 16-character lowercase hexadecimal string representing the observation ID.

    Example:
        ```python
        # Generate a random observation ID
        obs_id = langfuse._create_observation_id()

        # Generate a deterministic ID based on a seed
        user_obs_id = langfuse._create_observation_id(seed="user-123-feedback")

        # Use the ID with Langfuse APIs
        langfuse.create_score(
            name="relevance",
            value=0.95,
            trace_id=trace_id,
            observation_id=obs_id
        )
        ```
    """
    if not seed:
        span_id_int = RandomIdGenerator().generate_span_id()

        return self._format_otel_span_id(span_id_int)

    return sha256(seed.encode("utf-8")).digest()[:8].hex()


@staticmethod
def create_trace_id(*, seed: Optional[str] = None) -> str:
    """Create a unique trace ID for use with Langfuse.

    This method generates a trace ID for use with various Langfuse APIs.
    It can either generate a random ID or create a deterministic ID based on
    a seed string.

    Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
    If you need to correlate an external ID with a Langfuse trace ID, use the
    external ID as the seed to get a valid, deterministic Langfuse trace ID.

    Args:
        seed: Optional string to use as a seed for deterministic ID generation.
            If provided, the same seed will always produce the same ID (the
            first 16 bytes of its SHA-256 digest).
            If empty or not provided, a random ID will be generated.

    Returns:
        A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

    Example:
        ```python
        # Generate a random trace ID
        trace_id = langfuse.create_trace_id()

        # Generate a deterministic ID based on a seed
        session_trace_id = langfuse.create_trace_id(seed="session-456")

        # Correlate an external ID with a Langfuse trace ID
        external_id = "external-system-123456"
        correlated_trace_id = langfuse.create_trace_id(seed=external_id)

        # Use the ID with trace context
        with langfuse.start_as_current_observation(
            name="process-request",
            trace_context={"trace_id": trace_id}
        ) as span:
            # Operation will be part of the specific trace
            pass
        ```
    """
    if not seed:
        trace_id_int = RandomIdGenerator().generate_trace_id()

        return Langfuse._format_otel_trace_id(trace_id_int)

    return sha256(seed.encode("utf-8")).digest()[:16].hex()


def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str:
    """Return the trace ID of otel_span as a 32-character lowercase hex string."""
    span_context = otel_span.get_span_context()

    return self._format_otel_trace_id(span_context.trace_id)


def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str:
    """Return the span ID of otel_span as a 16-character lowercase hex string."""
    span_context = otel_span.get_span_context()

    return self._format_otel_span_id(span_context.span_id)


@staticmethod
def _format_otel_span_id(span_id_int: int) -> str:
    """Format an integer span ID to a 16-character lowercase hex string.

    Internal method to convert an OpenTelemetry integer span ID to the standard
    W3C Trace Context format (16-character lowercase hex string).

    Args:
        span_id_int: 64-bit integer representing a span ID

    Returns:
        A 16-character lowercase hexadecimal string
    """
    return format(span_id_int, "016x")


@staticmethod
def _format_otel_trace_id(trace_id_int: int) -> str:
    """Format an integer trace ID to a 32-character lowercase hex string.

    Internal method to convert an OpenTelemetry integer trace ID to the standard
    W3C Trace Context format (32-character lowercase hex string).

    Args:
        trace_id_int: 128-bit integer representing a trace ID

    Returns:
        A 32-character lowercase hexadecimal string
    """
    return format(trace_id_int, "032x")


@overload
def create_score(
    self,
    *,
    name: str,
    value: float,
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    score_id: Optional[str] = None,
    data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
) -> None: ...


@overload
def create_score(
    self,
    *,
    name: str,
    value: str,
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    score_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
) -> None: ...
def create_score(
    self,
    *,
    name: str,
    value: Union[float, str],
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
) -> None:
    """Create a score for a specific trace or observation.

    Scores can be used to track quality metrics, user feedback, or automated
    evaluations. The score is enqueued as a "score-create" event; nothing is
    done when tracing is disabled, and any error while building or enqueuing
    the event is logged instead of raised.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
        session_id: ID of the Langfuse session to associate the score with
        dataset_run_id: ID of the Langfuse dataset run to associate the score with
        trace_id: ID of the Langfuse trace to associate the score with
        observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score
        timestamp: Optional timestamp for the score (a generated timestamp is used if not provided)

    Example:
        ```python
        # Create a numeric score for accuracy
        langfuse.create_score(
            name="accuracy",
            value=0.92,
            trace_id="abcdef1234567890abcdef1234567890",
            data_type="NUMERIC",
            comment="High accuracy with minor irrelevant details"
        )

        # Create a categorical score for sentiment
        langfuse.create_score(
            name="sentiment",
            value="positive",
            trace_id="abcdef1234567890abcdef1234567890",
            observation_id="abcdef1234567890",
            data_type="CATEGORICAL"
        )
        ```
    """
    if not self._tracing_enabled:
        return

    resolved_score_id = score_id if score_id else self._create_observation_id()

    try:
        score_body = ScoreBody(
            id=resolved_score_id,
            sessionId=session_id,
            datasetRunId=dataset_run_id,
            traceId=trace_id,
            observationId=observation_id,
            name=name,
            value=value,
            dataType=data_type,  # type: ignore
            comment=comment,
            configId=config_id,
            environment=self._environment,
            metadata=metadata,
        )

        score_event = {
            "id": self.create_trace_id(),
            "type": "score-create",
            "timestamp": timestamp or _get_timestamp(),
            "body": score_body,
        }

        resources = self._resources
        if resources is None:
            return

        # Force the score to be in sample if it was for a legacy trace ID,
        # i.e. non-32 hexchar, or if no trace ID was given at all.
        must_sample = not (trace_id and self._is_valid_trace_id(trace_id))

        resources.add_score_task(score_event, force_sample=must_sample)

    except Exception as e:
        langfuse_logger.exception(
            f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
        )


def _create_trace_tags_via_ingestion(
    self,
    *,
    trace_id: str,
    tags: List[str],
) -> None:
    """Enqueue a "trace-create" ingestion event that carries tags for a trace.

    Nothing is enqueued when tracing is disabled or when ``tags`` is empty.
    Errors are logged instead of raised.

    Args:
        trace_id: ID of the trace the tags belong to.
        tags: Tags to send for the trace.
    """
    if not self._tracing_enabled or len(tags) == 0:
        return

    try:
        tags_body = TraceBody(
            id=trace_id,
            tags=tags,
        )

        trace_event = {
            "id": self.create_trace_id(),
            "type": "trace-create",
            "timestamp": _get_timestamp(),
            "body": tags_body,
        }

        resources = self._resources
        if resources is not None:
            resources.add_trace_task(trace_event)
    except Exception as e:
        langfuse_logger.exception(
            f"Error updating trace tags: Failed to process trace update event for trace_id={trace_id}. Error: {e}"
        )


@overload
def score_current_span(
    self,
    *,
    name: str,
    value: float,
    score_id: Optional[str] = None,
    data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...


@overload
def score_current_span(
    self,
    *,
    name: str,
    value: str,
    score_id: Optional[str] = None,
    data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...
def score_current_span(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Attach a score to the span that is active in the current context.

    The trace ID and span ID are read from the active span and passed,
    together with the score fields, to ``create_score``. This saves the
    caller from tracking those IDs. If ``_get_current_otel_span()`` returns
    None, this method does nothing.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (numeric for NUMERIC/BOOLEAN types, string for CATEGORICAL/TEXT)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_generation(name="answer-query") as generation:
            response = generate_answer(...)
            generation.update(output=response)

            # Score the generation while it is still the active span
            langfuse.score_current_span(
                name="relevance",
                value=0.85,
                data_type="NUMERIC",
                comment="Mostly relevant but contains some tangential information",
                metadata={"model": "gpt-4", "prompt_version": "v2"}
            )
        ```
    """
    active_span = self._get_current_otel_span()
    if active_span is None:
        # No span to attach the score to.
        return

    trace_id = self._get_otel_trace_id(active_span)
    observation_id = self._get_otel_span_id(active_span)

    langfuse_logger.info(
        f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
    )

    # typing.cast is a no-op at runtime; presumably these casts only exist to
    # satisfy create_score's typed signature -- value and data_type are
    # forwarded unchanged.
    narrowed_value = cast(str, value)
    narrowed_data_type = cast(Literal["CATEGORICAL", "TEXT"], data_type)

    self.create_score(
        trace_id=trace_id,
        observation_id=observation_id,
        name=name,
        value=narrowed_value,
        score_id=score_id,
        data_type=narrowed_data_type,
        comment=comment,
        config_id=config_id,
        metadata=metadata,
    )

# Typing overload: a float value pairs with the NUMERIC/BOOLEAN data types.
@overload
def score_current_trace(
    self,
    *,
    name: str,
    value: float,
    score_id: Optional[str] = None,
    data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...

# Typing overload: a str value pairs with the CATEGORICAL/TEXT data types
# (CATEGORICAL is the declared default).
@overload
def score_current_trace(
    self,
    *,
    name: str,
    value: str,
    score_id: Optional[str] = None,
    data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...
2032 2033 def score_current_trace( 2034 self, 2035 *, 2036 name: str, 2037 value: Union[float, str], 2038 score_id: Optional[str] = None, 2039 data_type: Optional[ScoreDataType] = None, 2040 comment: Optional[str] = None, 2041 config_id: Optional[str] = None, 2042 metadata: Optional[Any] = None, 2043 ) -> None: 2044 """Create a score for the current trace. 2045 2046 This method scores the trace of the currently active span. Unlike score_current_span, 2047 this method associates the score with the entire trace rather than a specific span. 2048 It's useful for scoring overall performance or quality of the entire operation. 2049 2050 Args: 2051 name: Name of the score (e.g., "user_satisfaction", "overall_quality") 2052 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT) 2053 score_id: Optional custom ID for the score (auto-generated if not provided) 2054 data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT) 2055 comment: Optional comment or explanation for the score 2056 config_id: Optional ID of a score config defined in Langfuse 2057 metadata: Optional metadata to be attached to the score 2058 2059 Example: 2060 ```python 2061 with langfuse.start_as_current_observation(name="process-user-request") as span: 2062 # Process request 2063 result = process_complete_request() 2064 span.update(output=result) 2065 2066 # Score the overall trace 2067 langfuse.score_current_trace( 2068 name="overall_quality", 2069 value=0.95, 2070 data_type="NUMERIC", 2071 comment="High quality end-to-end response", 2072 metadata={"evaluator": "gpt-4", "criteria": "comprehensive"} 2073 ) 2074 ``` 2075 """ 2076 current_span = self._get_current_otel_span() 2077 2078 if current_span is not None: 2079 trace_id = self._get_otel_trace_id(current_span) 2080 2081 langfuse_logger.info( 2082 f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}" 2083 ) 2084 2085 self.create_score( 2086 trace_id=trace_id, 2087 name=name, 2088 
value=cast(str, value), 2089 score_id=score_id, 2090 data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type), 2091 comment=comment, 2092 config_id=config_id, 2093 metadata=metadata, 2094 ) 2095 2096 def flush(self) -> None: 2097 """Force flush all pending spans and events to the Langfuse API. 2098 2099 This method manually flushes any pending spans, scores, and other events to the 2100 Langfuse API. It's useful in scenarios where you want to ensure all data is sent 2101 before proceeding, without waiting for the automatic flush interval. 2102 2103 Example: 2104 ```python 2105 # Record some spans and scores 2106 with langfuse.start_as_current_observation(name="operation") as span: 2107 # Do work... 2108 pass 2109 2110 # Ensure all data is sent to Langfuse before proceeding 2111 langfuse.flush() 2112 2113 # Continue with other work 2114 ``` 2115 """ 2116 if self._resources is not None: 2117 self._resources.flush() 2118 2119 def shutdown(self) -> None: 2120 """Shut down the Langfuse client and flush all pending data. 2121 2122 This method cleanly shuts down the Langfuse client, ensuring all pending data 2123 is flushed to the API and all background threads are properly terminated. 2124 2125 It's important to call this method when your application is shutting down to 2126 prevent data loss and resource leaks. For most applications, using the client 2127 as a context manager or relying on the automatic shutdown via atexit is sufficient. 2128 2129 Example: 2130 ```python 2131 # Initialize Langfuse 2132 langfuse = Langfuse(public_key="...", secret_key="...") 2133 2134 # Use Langfuse throughout your application 2135 # ... 2136 2137 # When application is shutting down 2138 langfuse.shutdown() 2139 ``` 2140 """ 2141 if self._resources is not None: 2142 self._resources.shutdown() 2143 2144 def get_current_trace_id(self) -> Optional[str]: 2145 """Get the trace ID of the current active span. 
2146 2147 This method retrieves the trace ID from the currently active span in the context. 2148 It can be used to get the trace ID for referencing in logs, external systems, 2149 or for creating related operations. 2150 2151 Returns: 2152 The current trace ID as a 32-character lowercase hexadecimal string, 2153 or None if there is no active span. 2154 2155 Example: 2156 ```python 2157 with langfuse.start_as_current_observation(name="process-request") as span: 2158 # Get the current trace ID for reference 2159 trace_id = langfuse.get_current_trace_id() 2160 2161 # Use it for external correlation 2162 log.info(f"Processing request with trace_id: {trace_id}") 2163 2164 # Or pass to another system 2165 external_system.process(data, trace_id=trace_id) 2166 ``` 2167 """ 2168 if not self._tracing_enabled: 2169 langfuse_logger.debug( 2170 "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode." 2171 ) 2172 return None 2173 2174 current_otel_span = self._get_current_otel_span() 2175 2176 return self._get_otel_trace_id(current_otel_span) if current_otel_span else None 2177 2178 def get_current_observation_id(self) -> Optional[str]: 2179 """Get the observation ID (span ID) of the current active span. 2180 2181 This method retrieves the observation ID from the currently active span in the context. 2182 It can be used to get the observation ID for referencing in logs, external systems, 2183 or for creating scores or other related operations. 2184 2185 Returns: 2186 The current observation ID as a 16-character lowercase hexadecimal string, 2187 or None if there is no active span. 2188 2189 Example: 2190 ```python 2191 with langfuse.start_as_current_observation(name="process-user-query") as span: 2192 # Get the current observation ID 2193 observation_id = langfuse.get_current_observation_id() 2194 2195 # Store it for later reference 2196 cache.set(f"query_{query_id}_observation", observation_id) 2197 2198 # Process the query... 
2199 ``` 2200 """ 2201 if not self._tracing_enabled: 2202 langfuse_logger.debug( 2203 "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode." 2204 ) 2205 return None 2206 2207 current_otel_span = self._get_current_otel_span() 2208 2209 return self._get_otel_span_id(current_otel_span) if current_otel_span else None 2210 2211 def _get_project_id(self) -> Optional[str]: 2212 """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for api keys.""" 2213 if not self._project_id: 2214 proj = self.api.projects.get() 2215 if not proj.data or not proj.data[0].id: 2216 return None 2217 2218 self._project_id = proj.data[0].id 2219 2220 return self._project_id 2221 2222 def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]: 2223 """Get the URL to view a trace in the Langfuse UI. 2224 2225 This method generates a URL that links directly to a trace in the Langfuse UI. 2226 It's useful for providing links in logs, notifications, or debugging tools. 2227 2228 Args: 2229 trace_id: Optional trace ID to generate a URL for. If not provided, 2230 the trace ID of the current active span will be used. 2231 2232 Returns: 2233 A URL string pointing to the trace in the Langfuse UI, 2234 or None if the project ID couldn't be retrieved or no trace ID is available. 
2235 2236 Example: 2237 ```python 2238 # Get URL for the current trace 2239 with langfuse.start_as_current_observation(name="process-request") as span: 2240 trace_url = langfuse.get_trace_url() 2241 log.info(f"Processing trace: {trace_url}") 2242 2243 # Get URL for a specific trace 2244 specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef") 2245 send_notification(f"Review needed for trace: {specific_trace_url}") 2246 ``` 2247 """ 2248 final_trace_id = trace_id or self.get_current_trace_id() 2249 if not final_trace_id: 2250 return None 2251 2252 project_id = self._get_project_id() 2253 2254 return ( 2255 f"{self._base_url}/project/{project_id}/traces/{final_trace_id}" 2256 if project_id and final_trace_id 2257 else None 2258 ) 2259 2260 def get_dataset( 2261 self, 2262 name: str, 2263 *, 2264 fetch_items_page_size: Optional[int] = 50, 2265 version: Optional[datetime] = None, 2266 ) -> "DatasetClient": 2267 """Fetch a dataset by its name. 2268 2269 Args: 2270 name (str): The name of the dataset to fetch. 2271 fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50. 2272 version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC). 2273 If provided, returns the state of items at the specified UTC timestamp. 2274 If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC. 2275 2276 Returns: 2277 DatasetClient: The dataset with the given name. 
2278 """ 2279 try: 2280 langfuse_logger.debug(f"Getting datasets {name}") 2281 dataset = self.api.datasets.get(dataset_name=self._url_encode(name)) 2282 2283 dataset_items = [] 2284 page = 1 2285 2286 while True: 2287 new_items = self.api.dataset_items.list( 2288 dataset_name=self._url_encode(name, is_url_param=True), 2289 page=page, 2290 limit=fetch_items_page_size, 2291 version=version, 2292 ) 2293 dataset_items.extend(new_items.data) 2294 2295 if new_items.meta.total_pages <= page: 2296 break 2297 2298 page += 1 2299 2300 return DatasetClient( 2301 dataset=dataset, 2302 items=dataset_items, 2303 version=version, 2304 langfuse_client=self, 2305 ) 2306 2307 except Error as e: 2308 handle_fern_exception(e) 2309 raise e 2310 2311 def get_dataset_run( 2312 self, *, dataset_name: str, run_name: str 2313 ) -> DatasetRunWithItems: 2314 """Fetch a dataset run by dataset name and run name. 2315 2316 Args: 2317 dataset_name (str): The name of the dataset. 2318 run_name (str): The name of the run. 2319 2320 Returns: 2321 DatasetRunWithItems: The dataset run with its items. 2322 """ 2323 try: 2324 return cast( 2325 DatasetRunWithItems, 2326 self.api.datasets.get_run( 2327 dataset_name=self._url_encode(dataset_name), 2328 run_name=self._url_encode(run_name), 2329 request_options=None, 2330 ), 2331 ) 2332 except Error as e: 2333 handle_fern_exception(e) 2334 raise e 2335 2336 def get_dataset_runs( 2337 self, 2338 *, 2339 dataset_name: str, 2340 page: Optional[int] = None, 2341 limit: Optional[int] = None, 2342 ) -> PaginatedDatasetRuns: 2343 """Fetch all runs for a dataset. 2344 2345 Args: 2346 dataset_name (str): The name of the dataset. 2347 page (Optional[int]): Page number, starts at 1. 2348 limit (Optional[int]): Limit of items per page. 2349 2350 Returns: 2351 PaginatedDatasetRuns: Paginated list of dataset runs. 
2352 """ 2353 try: 2354 return cast( 2355 PaginatedDatasetRuns, 2356 self.api.datasets.get_runs( 2357 dataset_name=self._url_encode(dataset_name), 2358 page=page, 2359 limit=limit, 2360 request_options=None, 2361 ), 2362 ) 2363 except Error as e: 2364 handle_fern_exception(e) 2365 raise e 2366 2367 def delete_dataset_run( 2368 self, *, dataset_name: str, run_name: str 2369 ) -> DeleteDatasetRunResponse: 2370 """Delete a dataset run and all its run items. This action is irreversible. 2371 2372 Args: 2373 dataset_name (str): The name of the dataset. 2374 run_name (str): The name of the run. 2375 2376 Returns: 2377 DeleteDatasetRunResponse: Confirmation of deletion. 2378 """ 2379 try: 2380 return cast( 2381 DeleteDatasetRunResponse, 2382 self.api.datasets.delete_run( 2383 dataset_name=self._url_encode(dataset_name), 2384 run_name=self._url_encode(run_name), 2385 request_options=None, 2386 ), 2387 ) 2388 except Error as e: 2389 handle_fern_exception(e) 2390 raise e 2391 2392 def run_experiment( 2393 self, 2394 *, 2395 name: str, 2396 run_name: Optional[str] = None, 2397 description: Optional[str] = None, 2398 data: ExperimentData, 2399 task: TaskFunction, 2400 evaluators: List[EvaluatorFunction] = [], 2401 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 2402 run_evaluators: List[RunEvaluatorFunction] = [], 2403 max_concurrency: int = 50, 2404 metadata: Optional[Dict[str, str]] = None, 2405 _dataset_version: Optional[datetime] = None, 2406 ) -> ExperimentResult: 2407 """Run an experiment on a dataset with automatic tracing and evaluation. 2408 2409 This method executes a task function on each item in the provided dataset, 2410 automatically traces all executions with Langfuse for observability, runs 2411 item-level and run-level evaluators on the outputs, and returns comprehensive 2412 results with evaluation metrics. 
2413 2414 The experiment system provides: 2415 - Automatic tracing of all task executions 2416 - Concurrent processing with configurable limits 2417 - Comprehensive error handling that isolates failures 2418 - Integration with Langfuse datasets for experiment tracking 2419 - Flexible evaluation framework supporting both sync and async evaluators 2420 2421 Args: 2422 name: Human-readable name for the experiment. Used for identification 2423 in the Langfuse UI. 2424 run_name: Optional exact name for the experiment run. If provided, this will be 2425 used as the exact dataset run name if the `data` contains Langfuse dataset items. 2426 If not provided, this will default to the experiment name appended with an ISO timestamp. 2427 description: Optional description explaining the experiment's purpose, 2428 methodology, or expected outcomes. 2429 data: Array of data items to process. Can be either: 2430 - List of dict-like items with 'input', 'expected_output', 'metadata' keys 2431 - List of Langfuse DatasetItem objects from dataset.items 2432 task: Function that processes each data item and returns output. 2433 Must accept 'item' as keyword argument and can return sync or async results. 2434 The task function signature should be: task(*, item, **kwargs) -> Any 2435 evaluators: List of functions to evaluate each item's output individually. 2436 Each evaluator receives input, output, expected_output, and metadata. 2437 Can return single Evaluation dict or list of Evaluation dicts. 2438 composite_evaluator: Optional function that creates composite scores from item-level evaluations. 2439 Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) 2440 plus the list of evaluations from item-level evaluators. Useful for weighted averages, 2441 pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics. 2442 run_evaluators: List of functions to evaluate the entire experiment run. 
2443 Each run evaluator receives all item_results and can compute aggregate metrics. 2444 Useful for calculating averages, distributions, or cross-item comparisons. 2445 max_concurrency: Maximum number of concurrent task executions (default: 50). 2446 Controls the number of items processed simultaneously. Adjust based on 2447 API rate limits and system resources. 2448 metadata: Optional metadata dictionary to attach to all experiment traces. 2449 This metadata will be included in every trace created during the experiment. 2450 If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too. 2451 2452 Returns: 2453 ExperimentResult containing: 2454 - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset. 2455 - item_results: List of results for each processed item with outputs and evaluations 2456 - run_evaluations: List of aggregate evaluation results for the entire run 2457 - experiment_id: Stable identifier for the experiment run across all items 2458 - dataset_run_id: ID of the dataset run (if using Langfuse datasets) 2459 - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable) 2460 2461 Raises: 2462 ValueError: If required parameters are missing or invalid 2463 Exception: If experiment setup fails (individual item failures are handled gracefully) 2464 2465 Examples: 2466 Basic experiment with local data: 2467 ```python 2468 def summarize_text(*, item, **kwargs): 2469 return f"Summary: {item['input'][:50]}..." 
2470 2471 def length_evaluator(*, input, output, expected_output=None, **kwargs): 2472 return { 2473 "name": "output_length", 2474 "value": len(output), 2475 "comment": f"Output contains {len(output)} characters" 2476 } 2477 2478 result = langfuse.run_experiment( 2479 name="Text Summarization Test", 2480 description="Evaluate summarization quality and length", 2481 data=[ 2482 {"input": "Long article text...", "expected_output": "Expected summary"}, 2483 {"input": "Another article...", "expected_output": "Another summary"} 2484 ], 2485 task=summarize_text, 2486 evaluators=[length_evaluator] 2487 ) 2488 2489 print(f"Processed {len(result.item_results)} items") 2490 for item_result in result.item_results: 2491 print(f"Input: {item_result.item['input']}") 2492 print(f"Output: {item_result.output}") 2493 print(f"Evaluations: {item_result.evaluations}") 2494 ``` 2495 2496 Advanced experiment with async task and multiple evaluators: 2497 ```python 2498 async def llm_task(*, item, **kwargs): 2499 # Simulate async LLM call 2500 response = await openai_client.chat.completions.create( 2501 model="gpt-4", 2502 messages=[{"role": "user", "content": item["input"]}] 2503 ) 2504 return response.choices[0].message.content 2505 2506 def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): 2507 if expected_output and expected_output.lower() in output.lower(): 2508 return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} 2509 return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} 2510 2511 def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): 2512 # Simulate toxicity check 2513 toxicity_score = check_toxicity(output) # Your toxicity checker 2514 return { 2515 "name": "toxicity", 2516 "value": toxicity_score, 2517 "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" 2518 } 2519 2520 def average_accuracy(*, item_results, **kwargs): 2521 accuracies = [ 2522 eval.value for result in item_results 
2523 for eval in result.evaluations 2524 if eval.name == "accuracy" 2525 ] 2526 return { 2527 "name": "average_accuracy", 2528 "value": sum(accuracies) / len(accuracies) if accuracies else 0, 2529 "comment": f"Average accuracy across {len(accuracies)} items" 2530 } 2531 2532 result = langfuse.run_experiment( 2533 name="LLM Safety and Accuracy Test", 2534 description="Evaluate model accuracy and safety across diverse prompts", 2535 data=test_dataset, # Your dataset items 2536 task=llm_task, 2537 evaluators=[accuracy_evaluator, toxicity_evaluator], 2538 run_evaluators=[average_accuracy], 2539 max_concurrency=5, # Limit concurrent API calls 2540 metadata={"model": "gpt-4", "temperature": 0.7} 2541 ) 2542 ``` 2543 2544 Using with Langfuse datasets: 2545 ```python 2546 # Get dataset from Langfuse 2547 dataset = langfuse.get_dataset("my-eval-dataset") 2548 2549 result = dataset.run_experiment( 2550 name="Production Model Evaluation", 2551 description="Monthly evaluation of production model performance", 2552 task=my_production_task, 2553 evaluators=[accuracy_evaluator, latency_evaluator] 2554 ) 2555 2556 # Results automatically linked to dataset in Langfuse UI 2557 print(f"View results: {result['dataset_run_url']}") 2558 ``` 2559 2560 Note: 2561 - Task and evaluator functions can be either synchronous or asynchronous 2562 - Individual item failures are logged but don't stop the experiment 2563 - All executions are automatically traced and visible in Langfuse UI 2564 - When using Langfuse datasets, results are automatically linked for easy comparison 2565 - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) 
2566 - Async execution is handled automatically with smart event loop detection 2567 """ 2568 return cast( 2569 ExperimentResult, 2570 run_async_safely( 2571 self._run_experiment_async( 2572 name=name, 2573 run_name=self._create_experiment_run_name( 2574 name=name, run_name=run_name 2575 ), 2576 description=description, 2577 data=data, 2578 task=task, 2579 evaluators=evaluators or [], 2580 composite_evaluator=composite_evaluator, 2581 run_evaluators=run_evaluators or [], 2582 max_concurrency=max_concurrency, 2583 metadata=metadata, 2584 dataset_version=_dataset_version, 2585 ), 2586 ), 2587 ) 2588 2589 async def _run_experiment_async( 2590 self, 2591 *, 2592 name: str, 2593 run_name: str, 2594 description: Optional[str], 2595 data: ExperimentData, 2596 task: TaskFunction, 2597 evaluators: List[EvaluatorFunction], 2598 composite_evaluator: Optional[CompositeEvaluatorFunction], 2599 run_evaluators: List[RunEvaluatorFunction], 2600 max_concurrency: int, 2601 metadata: Optional[Dict[str, Any]] = None, 2602 dataset_version: Optional[datetime] = None, 2603 ) -> ExperimentResult: 2604 langfuse_logger.debug( 2605 f"Starting experiment '{name}' run '{run_name}' with {len(data)} items" 2606 ) 2607 2608 shared_fallback_experiment_id = self._create_observation_id() 2609 2610 # Set up concurrency control 2611 semaphore = asyncio.Semaphore(max_concurrency) 2612 2613 # Process all items 2614 async def process_item(item: ExperimentItem) -> ExperimentItemResult: 2615 async with semaphore: 2616 return await self._process_experiment_item( 2617 item, 2618 task, 2619 evaluators, 2620 composite_evaluator, 2621 shared_fallback_experiment_id, 2622 name, 2623 run_name, 2624 description, 2625 metadata, 2626 dataset_version, 2627 ) 2628 2629 # Run all items concurrently 2630 tasks = [process_item(item) for item in data] 2631 item_results = await asyncio.gather(*tasks, return_exceptions=True) 2632 2633 # Filter out any exceptions and log errors 2634 valid_results: List[ExperimentItemResult] = 
[] 2635 for i, result in enumerate(item_results): 2636 if isinstance(result, Exception): 2637 langfuse_logger.error(f"Item {i} failed: {result}") 2638 elif isinstance(result, ExperimentItemResult): 2639 valid_results.append(result) # type: ignore 2640 2641 # Run experiment-level evaluators 2642 run_evaluations: List[Evaluation] = [] 2643 for run_evaluator in run_evaluators: 2644 try: 2645 evaluations = await _run_evaluator( 2646 run_evaluator, item_results=valid_results 2647 ) 2648 run_evaluations.extend(evaluations) 2649 except Exception as e: 2650 langfuse_logger.error(f"Run evaluator failed: {e}") 2651 2652 # Generate dataset run URL if applicable 2653 dataset_run_id = next( 2654 ( 2655 result.dataset_run_id 2656 for result in valid_results 2657 if result.dataset_run_id 2658 ), 2659 None, 2660 ) 2661 dataset_run_url = None 2662 if dataset_run_id and data: 2663 try: 2664 # Check if the first item has dataset_id (for DatasetItem objects) 2665 first_item = data[0] 2666 dataset_id = None 2667 2668 if hasattr(first_item, "dataset_id"): 2669 dataset_id = getattr(first_item, "dataset_id", None) 2670 2671 if dataset_id: 2672 project_id = self._get_project_id() 2673 2674 if project_id: 2675 dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}" 2676 2677 except Exception: 2678 pass # URL generation is optional 2679 2680 # Store run-level evaluations as scores 2681 for evaluation in run_evaluations: 2682 try: 2683 if dataset_run_id: 2684 self.create_score( 2685 dataset_run_id=dataset_run_id, 2686 name=evaluation.name or "<unknown>", 2687 value=evaluation.value, # type: ignore 2688 comment=evaluation.comment, 2689 metadata=evaluation.metadata, 2690 data_type=evaluation.data_type, # type: ignore 2691 config_id=evaluation.config_id, 2692 ) 2693 2694 except Exception as e: 2695 langfuse_logger.error(f"Failed to store run evaluation: {e}") 2696 2697 # Flush scores and traces 2698 self.flush() 2699 2700 return ExperimentResult( 
2701 name=name, 2702 run_name=run_name, 2703 description=description, 2704 item_results=valid_results, 2705 run_evaluations=run_evaluations, 2706 experiment_id=dataset_run_id or shared_fallback_experiment_id, 2707 dataset_run_id=dataset_run_id, 2708 dataset_run_url=dataset_run_url, 2709 ) 2710 2711 async def _process_experiment_item( 2712 self, 2713 item: ExperimentItem, 2714 task: Callable, 2715 evaluators: List[Callable], 2716 composite_evaluator: Optional[CompositeEvaluatorFunction], 2717 fallback_experiment_id: str, 2718 experiment_name: str, 2719 experiment_run_name: str, 2720 experiment_description: Optional[str], 2721 experiment_metadata: Optional[Dict[str, Any]] = None, 2722 dataset_version: Optional[datetime] = None, 2723 ) -> ExperimentItemResult: 2724 span_name = "experiment-item-run" 2725 2726 with self.start_as_current_observation(name=span_name) as span: 2727 try: 2728 input_data = ( 2729 item.get("input") 2730 if isinstance(item, dict) 2731 else getattr(item, "input", None) 2732 ) 2733 2734 if input_data is None: 2735 raise ValueError("Experiment Item is missing input. 
Skipping item.") 2736 2737 expected_output = ( 2738 item.get("expected_output") 2739 if isinstance(item, dict) 2740 else getattr(item, "expected_output", None) 2741 ) 2742 2743 item_metadata = ( 2744 item.get("metadata") 2745 if isinstance(item, dict) 2746 else getattr(item, "metadata", None) 2747 ) 2748 2749 final_observation_metadata = { 2750 "experiment_name": experiment_name, 2751 "experiment_run_name": experiment_run_name, 2752 **(experiment_metadata or {}), 2753 } 2754 2755 trace_id = span.trace_id 2756 dataset_id = None 2757 dataset_item_id = None 2758 dataset_run_id = None 2759 2760 # Link to dataset run if this is a dataset item 2761 if hasattr(item, "id") and hasattr(item, "dataset_id"): 2762 try: 2763 # Use sync API to avoid event loop issues when run_async_safely 2764 # creates multiple event loops across different threads 2765 dataset_run_item = await asyncio.to_thread( 2766 self.api.dataset_run_items.create, 2767 run_name=experiment_run_name, 2768 run_description=experiment_description, 2769 metadata=experiment_metadata, 2770 dataset_item_id=item.id, # type: ignore 2771 trace_id=trace_id, 2772 observation_id=span.id, 2773 dataset_version=dataset_version, 2774 ) 2775 2776 dataset_run_id = dataset_run_item.dataset_run_id 2777 2778 except Exception as e: 2779 langfuse_logger.error(f"Failed to create dataset run item: {e}") 2780 2781 if ( 2782 not isinstance(item, dict) 2783 and hasattr(item, "dataset_id") 2784 and hasattr(item, "id") 2785 ): 2786 dataset_id = item.dataset_id 2787 dataset_item_id = item.id 2788 2789 final_observation_metadata.update( 2790 {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id} 2791 ) 2792 2793 if isinstance(item_metadata, dict): 2794 final_observation_metadata.update(item_metadata) 2795 2796 experiment_id = dataset_run_id or fallback_experiment_id 2797 experiment_item_id = ( 2798 dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16] 2799 ) 2800 span._otel_span.set_attributes( 2801 { 2802 k: v 2803 for 
k, v in { 2804 LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT, 2805 LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description, 2806 LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize( 2807 expected_output 2808 ), 2809 }.items() 2810 if v is not None 2811 } 2812 ) 2813 2814 propagated_experiment_attributes = PropagatedExperimentAttributes( 2815 experiment_id=experiment_id, 2816 experiment_name=experiment_run_name, 2817 experiment_metadata=_flatten_and_serialize_metadata_values( 2818 experiment_metadata 2819 ), 2820 experiment_dataset_id=dataset_id, 2821 experiment_item_id=experiment_item_id, 2822 experiment_item_metadata=_flatten_and_serialize_metadata_values( 2823 item_metadata if isinstance(item_metadata, dict) else None 2824 ), 2825 experiment_item_root_observation_id=span.id, 2826 ) 2827 2828 with _propagate_attributes(experiment=propagated_experiment_attributes): 2829 output = await _run_task(task, item) 2830 2831 span.update( 2832 input=input_data, 2833 output=output, 2834 metadata=final_observation_metadata, 2835 ) 2836 2837 except Exception as e: 2838 span.update( 2839 output=f"Error: {str(e)}", level="ERROR", status_message=str(e) 2840 ) 2841 raise e 2842 2843 # Run evaluators 2844 evaluations = [] 2845 2846 for evaluator in evaluators: 2847 try: 2848 eval_metadata: Optional[Dict[str, Any]] = None 2849 2850 if isinstance(item, dict): 2851 eval_metadata = item.get("metadata") 2852 elif hasattr(item, "metadata"): 2853 eval_metadata = item.metadata 2854 2855 with _propagate_attributes( 2856 experiment=propagated_experiment_attributes 2857 ): 2858 eval_results = await _run_evaluator( 2859 evaluator, 2860 input=input_data, 2861 output=output, 2862 expected_output=expected_output, 2863 metadata=eval_metadata, 2864 ) 2865 evaluations.extend(eval_results) 2866 2867 # Store evaluations as scores 2868 for evaluation in eval_results: 2869 self.create_score( 2870 trace_id=trace_id, 2871 
observation_id=span.id, 2872 name=evaluation.name, 2873 value=evaluation.value, # type: ignore 2874 comment=evaluation.comment, 2875 metadata=evaluation.metadata, 2876 config_id=evaluation.config_id, 2877 data_type=evaluation.data_type, # type: ignore 2878 ) 2879 2880 except Exception as e: 2881 langfuse_logger.error(f"Evaluator failed: {e}") 2882 2883 # Run composite evaluator if provided and we have evaluations 2884 if composite_evaluator and evaluations: 2885 try: 2886 composite_eval_metadata: Optional[Dict[str, Any]] = None 2887 if isinstance(item, dict): 2888 composite_eval_metadata = item.get("metadata") 2889 elif hasattr(item, "metadata"): 2890 composite_eval_metadata = item.metadata 2891 2892 with _propagate_attributes( 2893 experiment=propagated_experiment_attributes 2894 ): 2895 result = composite_evaluator( 2896 input=input_data, 2897 output=output, 2898 expected_output=expected_output, 2899 metadata=composite_eval_metadata, 2900 evaluations=evaluations, 2901 ) 2902 2903 # Handle async composite evaluators 2904 if asyncio.iscoroutine(result): 2905 result = await result 2906 2907 # Normalize to list 2908 composite_evals: List[Evaluation] = [] 2909 if isinstance(result, (dict, Evaluation)): 2910 composite_evals = [result] # type: ignore 2911 elif isinstance(result, list): 2912 composite_evals = result # type: ignore 2913 2914 # Store composite evaluations as scores and add to evaluations list 2915 for composite_evaluation in composite_evals: 2916 self.create_score( 2917 trace_id=trace_id, 2918 observation_id=span.id, 2919 name=composite_evaluation.name, 2920 value=composite_evaluation.value, # type: ignore 2921 comment=composite_evaluation.comment, 2922 metadata=composite_evaluation.metadata, 2923 config_id=composite_evaluation.config_id, 2924 data_type=composite_evaluation.data_type, # type: ignore 2925 ) 2926 evaluations.append(composite_evaluation) 2927 2928 except Exception as e: 2929 langfuse_logger.error(f"Composite evaluator failed: {e}") 2930 2931 
# Tail of a method whose beginning lies above this chunk; the statement is kept unchanged.
# NOTE(review): indentation was reconstructed from a flattened listing — confirm against the original file.
            return ExperimentItemResult(
                item=item,
                output=output,
                evaluations=evaluations,
                trace_id=trace_id,
                dataset_run_id=dataset_run_id,
            )

    def _create_experiment_run_name(
        self, *, name: Optional[str] = None, run_name: Optional[str] = None
    ) -> str:
        """Return the run name to use for an experiment run.

        Args:
            name: Experiment name used as the prefix of a generated run name.
            run_name: Explicit run name. Returned unchanged when truthy.

        Returns:
            `run_name` if it is truthy, otherwise "<name> - <timestamp>" where the
            timestamp is `_get_timestamp().isoformat()` with "+00:00" replaced by "Z".
        """
        if run_name:
            return run_name

        iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z")

        # NOTE(review): if name is None the result starts with the literal "None".
        return f"{name} - {iso_timestamp}"

    def run_batched_evaluation(
        self,
        *,
        scope: Literal["traces", "observations"],
        mapper: MapperFunction,
        filter: Optional[str] = None,
        fetch_batch_size: int = 50,
        fetch_trace_fields: Optional[str] = None,
        max_items: Optional[int] = None,
        max_retries: int = 3,
        evaluators: List[EvaluatorFunction],
        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
        max_concurrency: int = 5,
        metadata: Optional[Dict[str, Any]] = None,
        _add_observation_scores_to_trace: bool = False,
        _additional_trace_tags: Optional[List[str]] = None,
        resume_from: Optional[BatchEvaluationResumeToken] = None,
        verbose: bool = False,
    ) -> BatchEvaluationResult:
        """Fetch traces or observations and run evaluations on each item.

        This method provides a powerful way to evaluate existing data in Langfuse at scale.
        It fetches items based on filters, transforms them using a mapper function, runs
        evaluators on each item, and creates scores that are linked back to the original
        entities. This is ideal for:

        - Running evaluations on production traces after deployment
        - Backtesting new evaluation metrics on historical data
        - Batch scoring of observations for quality monitoring
        - Periodic evaluation runs on recent data

        The method uses a streaming/pipeline approach to process items in batches, making
        it memory-efficient for large datasets. It includes comprehensive error handling,
        retry logic, and resume capability for long-running evaluations.

        Args:
            scope: The type of items to evaluate. Must be one of:
                - "traces": Evaluate complete traces with all their observations
                - "observations": Evaluate individual observations (spans, generations, events)
            mapper: Function that transforms API response objects into evaluator inputs.
                Receives a trace/observation object and returns an EvaluatorInputs
                instance with input, output, expected_output, and metadata fields.
                Can be sync or async.
            evaluators: List of evaluation functions to run on each item. Each evaluator
                receives the mapped inputs and returns Evaluation object(s). Evaluator
                failures are logged but don't stop the batch evaluation.
            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
                - '{"tags": ["production"]}'
                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
                Default: None (fetches all items).
            fetch_batch_size: Number of items to fetch per API call and hold in memory.
                Larger values may be faster but use more memory. Default: 50.
            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
            max_items: Maximum total number of items to process. If None, processes all
                items matching the filter. Useful for testing or limiting evaluation runs.
                Default: None (process all).
            max_concurrency: Maximum number of items to evaluate concurrently. Controls
                parallelism and resource usage. Default: 5.
            composite_evaluator: Optional function that creates a composite score from
                item-level evaluations. Receives the original item and its evaluations,
                returns a single Evaluation. Useful for weighted averages or combined metrics.
                Default: None.
            metadata: Optional metadata dict to add to all created scores. Useful for
                tracking evaluation runs, versions, or other context. Default: None.
            max_retries: Maximum number of retry attempts for failed batch fetches.
                Uses exponential backoff (1s, 2s, 4s). Default: 3.
            verbose: If True, logs progress information to console. Useful for monitoring
                long-running evaluations. Default: False.
            resume_from: Optional resume token from a previous incomplete run. Allows
                continuing evaluation after interruption or failure. Default: None.

        Returns:
            BatchEvaluationResult containing:
                - total_items_fetched: Number of items fetched from API
                - total_items_processed: Number of items successfully evaluated
                - total_items_failed: Number of items that failed evaluation
                - total_scores_created: Scores created by item-level evaluators
                - total_composite_scores_created: Scores created by composite evaluator
                - total_evaluations_failed: Individual evaluator failures
                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
                - resume_token: Token for resuming if incomplete (None if completed)
                - completed: True if all items processed
                - duration_seconds: Total execution time
                - failed_item_ids: IDs of items that failed
                - error_summary: Error types and counts
                - has_more_items: True if max_items reached but more exist

        Raises:
            ValueError: If invalid scope is provided.

        Examples:
            Basic trace evaluation:
            ```python
            from langfuse import Langfuse, EvaluatorInputs, Evaluation

            client = Langfuse()

            # Define mapper to extract fields from traces
            def trace_mapper(trace):
                return EvaluatorInputs(
                    input=trace.input,
                    output=trace.output,
                    expected_output=None,
                    metadata={"trace_id": trace.id}
                )

            # Define evaluator
            def length_evaluator(*, input, output, expected_output, metadata):
                return Evaluation(
                    name="output_length",
                    value=len(output) if output else 0
                )

            # Run batch evaluation
            result = client.run_batched_evaluation(
                scope="traces",
                mapper=trace_mapper,
                evaluators=[length_evaluator],
                filter='{"tags": ["production"]}',
                max_items=1000,
                verbose=True
            )

            print(f"Processed {result.total_items_processed} traces")
            print(f"Created {result.total_scores_created} scores")
            ```

            Evaluation with composite scorer:
            ```python
            def accuracy_evaluator(*, input, output, expected_output, metadata):
                # ... evaluation logic
                return Evaluation(name="accuracy", value=0.85)

            def relevance_evaluator(*, input, output, expected_output, metadata):
                # ... evaluation logic
                return Evaluation(name="relevance", value=0.92)

            def composite_evaluator(*, item, evaluations):
                # Weighted average of evaluations
                weights = {"accuracy": 0.6, "relevance": 0.4}
                total = sum(
                    e.value * weights.get(e.name, 0)
                    for e in evaluations
                    if isinstance(e.value, (int, float))
                )
                return Evaluation(
                    name="composite_score",
                    value=total,
                    comment=f"Weighted average of {len(evaluations)} metrics"
                )

            result = client.run_batched_evaluation(
                scope="traces",
                mapper=trace_mapper,
                evaluators=[accuracy_evaluator, relevance_evaluator],
                composite_evaluator=composite_evaluator,
                filter='{"user_id": "important_user"}',
                verbose=True
            )
            ```

            Handling incomplete runs with resume:
            ```python
            # Initial run that may fail or timeout
            result = client.run_batched_evaluation(
                scope="observations",
                mapper=obs_mapper,
                evaluators=[my_evaluator],
                max_items=10000,
                verbose=True
            )

            # Check if incomplete
            if not result.completed and result.resume_token:
                print(f"Processed {result.resume_token.items_processed} items before interruption")

                # Resume from where it left off
                result = client.run_batched_evaluation(
                    scope="observations",
                    mapper=obs_mapper,
                    evaluators=[my_evaluator],
                    resume_from=result.resume_token,
                    verbose=True
                )

            print(f"Total items processed: {result.total_items_processed}")
            ```

            Monitoring evaluator performance:
            ```python
            result = client.run_batched_evaluation(...)

            for stats in result.evaluator_stats:
                success_rate = stats.successful_runs / stats.total_runs
                print(f"{stats.name}:")
                print(f"  Success rate: {success_rate:.1%}")
                print(f"  Scores created: {stats.total_scores_created}")

                if stats.failed_runs > 0:
                    print(f"  ⚠️  Failed {stats.failed_runs} times")
            ```

        Note:
            - Evaluator failures are logged but don't stop the batch evaluation
            - Individual item failures are tracked but don't stop processing
            - Fetch failures are retried with exponential backoff
            - All scores are automatically flushed to Langfuse at the end
            - The resume mechanism uses timestamp-based filtering to avoid duplicates
        """
        # All work is delegated to BatchEvaluationRunner.run_async; this method only
        # forwards its arguments and drives the coroutine via run_async_safely.
        runner = BatchEvaluationRunner(self)

        return cast(
            BatchEvaluationResult,
            run_async_safely(
                runner.run_async(
                    scope=scope,
                    mapper=mapper,
                    evaluators=evaluators,
                    filter=filter,
                    fetch_batch_size=fetch_batch_size,
                    fetch_trace_fields=fetch_trace_fields,
                    max_items=max_items,
                    max_concurrency=max_concurrency,
                    composite_evaluator=composite_evaluator,
                    metadata=metadata,
                    _add_observation_scores_to_trace=_add_observation_scores_to_trace,
                    _additional_trace_tags=_additional_trace_tags,
                    max_retries=max_retries,
                    verbose=verbose,
                    resume_from=resume_from,
                )
            ),
        )

    def auth_check(self) -> bool:
        """Check if the provided credentials (public and secret key) are valid.

        Returns:
            bool: True if at least one project is returned for the credentials.
                False if an AttributeError occurs during the check (logged as a warning).

        Raises:
            Exception: If no projects were found for the provided credentials.
            Error: API errors are passed to handle_fern_exception and re-raised.

        Note:
            This method is blocking. It is discouraged to use it in production code.
        """
        try:
            projects = self.api.projects.get()
            # Logged before the emptiness check below, so this message also appears
            # when zero projects are found.
            langfuse_logger.debug(
                f"Auth check successful, found {len(projects.data)} projects"
            )
            if len(projects.data) == 0:
                raise Exception(
                    "Auth check failed, no project found for the keys provided."
                )
            return True

        except AttributeError as e:
            langfuse_logger.warning(
                f"Auth check failed: Client not properly initialized. Error: {e}"
            )
            return False

        except Error as e:
            handle_fern_exception(e)
            raise e

    def create_dataset(
        self,
        *,
        name: str,
        description: Optional[str] = None,
        metadata: Optional[Any] = None,
        input_schema: Optional[Any] = None,
        expected_output_schema: Optional[Any] = None,
    ) -> Dataset:
        """Create a dataset with the given name on Langfuse.

        Args:
            name: Name of the dataset to create.
            description: Description of the dataset. Defaults to None.
            metadata: Additional metadata. Defaults to None.
            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.

        Returns:
            Dataset: The created dataset as returned by the Langfuse API.

        Raises:
            Error: API errors are passed to handle_fern_exception and re-raised.
        """
        try:
            langfuse_logger.debug(f"Creating datasets {name}")

            result = self.api.datasets.create(
                name=name,
                description=description,
                metadata=metadata,
                input_schema=input_schema,
                expected_output_schema=expected_output_schema,
            )

            return cast(Dataset, result)

        except Error as e:
            handle_fern_exception(e)
            raise e

    def create_dataset_item(
        self,
        *,
        dataset_name: str,
        input: Optional[Any] = None,
        expected_output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        source_trace_id: Optional[str] = None,
        source_observation_id: Optional[str] = None,
        status: Optional[DatasetStatus] = None,
        id: Optional[str] = None,
    ) -> DatasetItem:
        """Create a dataset item.

        Upserts if an item with id already exists.

        Args:
            dataset_name: Name of the dataset in which the dataset item should be created.
            input: Input data. Defaults to None. Can contain any dict, list or scalar.
            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
            source_trace_id: Id of the source trace. Defaults to None.
            source_observation_id: Id of the source observation. Defaults to None.
            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.

        Returns:
            DatasetItem: The created dataset item as returned by the Langfuse API.

        Raises:
            Error: API errors are passed to handle_fern_exception and re-raised.

        Example:
            ```python
            from langfuse import Langfuse

            langfuse = Langfuse()

            # Uploading items to the Langfuse dataset named "capital_cities"
            langfuse.create_dataset_item(
                dataset_name="capital_cities",
                input={"input": {"country": "Italy"}},
                expected_output={"expected_output": "Rome"},
                metadata={"foo": "bar"}
            )
            ```
        """
        try:
            langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")

            result = self.api.dataset_items.create(
                dataset_name=dataset_name,
                input=input,
                expected_output=expected_output,
                metadata=metadata,
                source_trace_id=source_trace_id,
                source_observation_id=source_observation_id,
                status=status,
                id=id,
            )

            return cast(DatasetItem, result)
        except Error as e:
            handle_fern_exception(e)
            raise e

    def resolve_media_references(
        self,
        *,
        obj: Any,
        resolve_with: Literal["base64_data_uri"],
        max_depth: int = 10,
        content_fetch_timeout_seconds: int = 5,
    ) -> Any:
        """Replace media reference strings in an object with base64 data URIs.

        This method recursively traverses an object (up to max_depth) looking for media reference strings
        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
        the provided Langfuse client and replaces the reference string with a base64 data URI.

        If fetching media content fails for a reference string, a warning is logged and the reference
        string is left unchanged.

        Args:
            obj: The object to process. Can be a primitive value, array, or nested object.
                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
            resolve_with: The representation of the media content to replace the media reference string with.
                Currently only "base64_data_uri" is supported.
            max_depth: int: The maximum depth to traverse the object. Default is 10.
            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.

        Returns:
            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

        Example:
            obj = {
                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
                "nested": {
                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
                }
            }

            result = langfuse_client.resolve_media_references(
                obj=obj, resolve_with="base64_data_uri"
            )

            # Result:
            # {
            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
            #     "nested": {
            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
            #     }
            # }
        """
        # Thin wrapper: forwards to LangfuseMedia with this client instance.
        return LangfuseMedia.resolve_media_references(
            langfuse_client=self,
            obj=obj,
            resolve_with=resolve_with,
            max_depth=max_depth,
            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
        )

    @overload
    def get_prompt(
        self,
        name: str,
        *,
        version: Optional[int] = None,
        label: Optional[str] = None,
        type: Literal["chat"],
        cache_ttl_seconds: Optional[int] = None,
        fallback: Optional[List[ChatMessageDict]] = None,
        max_retries: Optional[int] = None,
        fetch_timeout_seconds: Optional[int] = None,
    ) -> ChatPromptClient: ...

    @overload
    def get_prompt(
        self,
        name: str,
        *,
        version: Optional[int] = None,
        label: Optional[str] = None,
        type: Literal["text"] = "text",
        cache_ttl_seconds: Optional[int] = None,
        fallback: Optional[str] = None,
        max_retries: Optional[int] = None,
        fetch_timeout_seconds: Optional[int] = None,
    ) -> TextPromptClient: ...

    def get_prompt(
        self,
        name: str,
        *,
        version: Optional[int] = None,
        label: Optional[str] = None,
        type: Literal["chat", "text"] = "text",
        cache_ttl_seconds: Optional[int] = None,
        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
        max_retries: Optional[int] = None,
        fetch_timeout_seconds: Optional[int] = None,
    ) -> PromptClient:
        """Get a prompt.

        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
        return the expired prompt as a fallback.

        Args:
            name (str): The name of the prompt to retrieve.

        Keyword Args:
            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
            fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.

        Returns:
            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
            - TextPromptClient, if type argument is 'text'.
            - ChatPromptClient, if type argument is 'chat'.

        Raises:
            Error: If the SDK resources were not initialized.
            ValueError: If both version and label are given, or if name is empty.
            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
        """
        if self._resources is None:
            raise Error(
                "SDK is not correctly initialized. Check the init logs for more details."
            )
        if version is not None and label is not None:
            raise ValueError("Cannot specify both version and label at the same time.")

        if not name:
            raise ValueError("Prompt name cannot be empty.")

        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
        # Clamp the caller's retry count to [0, 4]; None falls back to 2.
        bounded_max_retries = self._get_bounded_max_retries(
            max_retries, default_max_retries=2, max_retries_upper_bound=4
        )

        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
        cached_prompt = self._resources.prompt_cache.get(cache_key)

        # Cache miss, or caching explicitly disabled: fetch synchronously.
        if cached_prompt is None or cache_ttl_seconds == 0:
            langfuse_logger.debug(
                f"Prompt '{cache_key}' not found in cache or caching disabled."
            )
            try:
                return self._fetch_prompt_and_update_cache(
                    name,
                    version=version,
                    label=label,
                    ttl_seconds=cache_ttl_seconds,
                    max_retries=bounded_max_retries,
                    fetch_timeout_seconds=fetch_timeout_seconds,
                )
            except Exception as e:
                # Only a truthy fallback is used; an empty string or empty list
                # falls through to the re-raise below.
                if fallback:
                    langfuse_logger.warning(
                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
                    )

                    fallback_client_args: Dict[str, Any] = {
                        "name": name,
                        "prompt": fallback,
                        "type": type,
                        "version": version or 0,
                        "config": {},
                        "labels": [label] if label else [],
                        "tags": [],
                    }

                    if type == "text":
                        return TextPromptClient(
                            prompt=Prompt_Text(**fallback_client_args),
                            is_fallback=True,
                        )

                    if type == "chat":
                        return ChatPromptClient(
                            prompt=Prompt_Chat(**fallback_client_args),
                            is_fallback=True,
                        )

                raise e

        if cached_prompt.is_expired():
            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
            try:
                # refresh prompt in background thread, refresh_prompt deduplicates tasks
                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")

                def refresh_task() -> None:
                    self._fetch_prompt_and_update_cache(
                        name,
                        version=version,
                        label=label,
                        ttl_seconds=cache_ttl_seconds,
                        max_retries=bounded_max_retries,
                        fetch_timeout_seconds=fetch_timeout_seconds,
                    )

                self._resources.prompt_cache.add_refresh_prompt_task_if_current(
                    cache_key,
                    cached_prompt,
                    refresh_task,
                )
                langfuse_logger.debug(
                    f"Returning stale prompt '{cache_key}' from cache."
                )
                # return stale prompt
                return cached_prompt.value

            except Exception as e:
                langfuse_logger.warning(
                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
                )
                # creation of refresh prompt task failed, return stale prompt
                return cached_prompt.value

        # Fresh cache hit.
        return cached_prompt.value

    def _fetch_prompt_and_update_cache(
        self,
        name: str,
        *,
        version: Optional[int] = None,
        label: Optional[str] = None,
        ttl_seconds: Optional[int] = None,
        max_retries: int,
        fetch_timeout_seconds: Optional[int],
    ) -> PromptClient:
        """Fetch a prompt from the API, wrap it in a client and store it in the cache.

        Args:
            name: Prompt name; URL-encoded before being sent.
            version: Prompt version to request.
            label: Prompt label to request.
            ttl_seconds: TTL passed to the prompt cache when storing the result.
            max_retries: Number of retries; the fetch is attempted up to max_retries + 1 times.
            fetch_timeout_seconds: Per-request timeout in seconds; when None, no
                request options are passed.

        Returns:
            ChatPromptClient if the response type is "chat", otherwise TextPromptClient.

        Raises:
            NotFoundError: Re-raised after evicting the cache key.
            Exception: Any other fetch error is logged and re-raised.
        """
        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
        langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...")

        try:

            # Retries on any Exception raised by the request.
            # NOTE(review): get_prompt's docstring describes exponential backoff with
            # a 10 second cap, but backoff.constant is used here — confirm which is intended.
            @backoff.on_exception(
                backoff.constant, Exception, max_tries=max_retries + 1, logger=None
            )
            def fetch_prompts() -> Any:
                return self.api.prompts.get(
                    self._url_encode(name),
                    version=version,
                    label=label,
                    request_options={
                        "timeout_in_seconds": fetch_timeout_seconds,
                    }
                    if fetch_timeout_seconds is not None
                    else None,
                )

            prompt_response = fetch_prompts()

            prompt: PromptClient
            if prompt_response.type == "chat":
                prompt = ChatPromptClient(prompt_response)
            else:
                prompt = TextPromptClient(prompt_response)

            if self._resources is not None:
                self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds)

            return prompt

        except NotFoundError as not_found_error:
            langfuse_logger.warning(
                f"Prompt '{cache_key}' not found during refresh, evicting from cache."
            )
            if self._resources is not None:
                self._resources.prompt_cache.delete(cache_key)
            raise not_found_error

        except Exception as e:
            langfuse_logger.error(
                f"Error while fetching prompt '{cache_key}': {str(e)}"
            )
            raise e

    def _get_bounded_max_retries(
        self,
        max_retries: Optional[int],
        *,
        default_max_retries: int = 2,
        max_retries_upper_bound: int = 4,
    ) -> int:
        """Clamp a retry count to the range [0, max_retries_upper_bound].

        Args:
            max_retries: Requested retry count, or None.
            default_max_retries: Value returned when max_retries is None.
            max_retries_upper_bound: Largest value that can be returned for a
                non-None max_retries.

        Returns:
            default_max_retries if max_retries is None, otherwise max_retries
            clamped to [0, max_retries_upper_bound].
        """
        if max_retries is None:
            return default_max_retries

        bounded_max_retries = min(
            max(max_retries, 0),
            max_retries_upper_bound,
        )

        return bounded_max_retries

    @overload
    def create_prompt(
        self,
        *,
        name: str,
        prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]],
        labels: List[str] = [],
        tags: Optional[List[str]] = None,
        type: Optional[Literal["chat"]],
        config: Optional[Any] = None,
        commit_message: Optional[str] = None,
    ) -> ChatPromptClient: ...

    @overload
    def create_prompt(
        self,
        *,
        name: str,
        prompt: str,
        labels: List[str] = [],
        tags: Optional[List[str]] = None,
        type: Optional[Literal["text"]] = "text",
        config: Optional[Any] = None,
        commit_message: Optional[str] = None,
    ) -> TextPromptClient: ...

    def create_prompt(
        self,
        *,
        name: str,
        prompt: Union[
            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
        ],
        labels: List[str] = [],
        tags: Optional[List[str]] = None,
        type: Optional[Literal["chat", "text"]] = "text",
        config: Optional[Any] = None,
        commit_message: Optional[str] = None,
    ) -> PromptClient:
        """Create a new prompt in Langfuse.

        The SDK prompt cache is invalidated for the given name after a successful create.

        Keyword Args:
            name : The name of the prompt to be created.
            prompt : The content of the prompt to be created. Must be a list of chat messages for
                type "chat" and a string otherwise.
            labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
            config: Additional structured data to be saved with the prompt. Defaults to None, which is sent as an empty dict.
            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
                Any value other than "chat" is handled as a text prompt.
            commit_message: Optional string describing the change.

        Returns:
            TextPromptClient: The prompt if type argument is 'text'.
            ChatPromptClient: The prompt if type argument is 'chat'.

        Raises:
            ValueError: If the prompt value does not match the requested type.
            Error: API errors are passed to handle_fern_exception and re-raised.
        """
        try:
            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")

            if type == "chat":
                if not isinstance(prompt, list):
                    raise ValueError(
                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
                    )
                request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = (
                    CreateChatPromptRequest(
                        name=name,
                        prompt=cast(Any, prompt),
                        labels=labels,
                        tags=tags,
                        config=config or {},
                        commit_message=commit_message,
                        type=CreateChatPromptType.CHAT,
                    )
                )
                server_prompt = self.api.prompts.create(request=request)

                # Drop cached entries for this name so later reads do not serve stale data.
                if self._resources is not None:
                    self._resources.prompt_cache.invalidate(name)

                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))

            if not isinstance(prompt, str):
                raise ValueError("For 'text' type, 'prompt' must be a string.")

            request = CreateTextPromptRequest(
                name=name,
                prompt=prompt,
                labels=labels,
                tags=tags,
                config=config or {},
                commit_message=commit_message,
            )

            server_prompt = self.api.prompts.create(request=request)

            if self._resources is not None:
                self._resources.prompt_cache.invalidate(name)

            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))

        except Error as e:
            handle_fern_exception(e)
            raise e

    def update_prompt(
        self,
        *,
        name: str,
        version: int,
        new_labels: List[str] = [],
    ) -> Any:
        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.

        Args:
            name (str): The name of the prompt to update.
            version (int): The version number of the prompt to update.
            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].

        Returns:
            Prompt: The updated prompt from the Langfuse API.

        """
        # NOTE(review): unlike the other API wrappers in this class, errors from this
        # call are not routed through handle_fern_exception — confirm this is intended.
        updated_prompt = self.api.prompt_version.update(
            name=self._url_encode(name),
            version=version,
            new_labels=new_labels,
        )

        if self._resources is not None:
            self._resources.prompt_cache.invalidate(name)

        return updated_prompt

    def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str:
        """Percent-encode a value for use in a request URL.

        Args:
            url: The raw value to encode.
            is_url_param: When truthy and the installed httpx is >= 0.28.0, the value
                is returned unchanged.

        Returns:
            The value unchanged in the case above, otherwise
            `urllib.parse.quote(url, safe="")`.
        """
        # httpx ≥ 0.28 does its own WHATWG-compliant quoting (eg. encodes bare
        # “%”, “?”, “#”, “|”, … in query/path parts).  Re-quoting here would
        # double-encode, so we skip when the value is about to be sent straight
        # to httpx (`is_url_param=True`) and the installed version is ≥ 0.28.
        if is_url_param and Version(httpx.__version__) >= Version("0.28.0"):
            return url

        # urllib.parse.quote does not escape slashes "/" by default; safe="" forces
        # escaping of slashes as well. This is necessary for prompts in prompt folders.
        return urllib.parse.quote(url, safe="")

    def clear_prompt_cache(self) -> None:
        """Clear the entire prompt cache, removing all cached prompts.

        This method is useful when you want to force a complete refresh of all
        cached prompts, for example after major updates or when you need to
        ensure the latest versions are fetched from the server.

        Does nothing when the SDK resources are not initialized.
        """
        if self._resources is not None:
            self._resources.prompt_cache.clear()
Main client for Langfuse tracing and platform features.
This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.
The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.
Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.
Attributes:
- api: Synchronous API client for Langfuse backend communication
- async_api: Asynchronous API client for Langfuse backend communication
- _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
- public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
- secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
- base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
- host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
- timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
- httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
- debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
- tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
- flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
- flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
- environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
- release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
- media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
- sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
- mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
- blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:

  ```python
  from langfuse.span_filter import is_default_export_span

  blocked = {"sqlite", "requests"}
  should_export_span = lambda span: (
      is_default_export_span(span)
      and (
          span.instrumentation_scope is None
          or span.instrumentation_scope.name not in blocked
      )
  )
  ```
- should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
- additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If `span_exporter` is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
- tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
- span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire `base_url`, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include `x-langfuse-ingestion-version=4` on the exporter to enable real time processing of exported spans.
Example:
```python
from langfuse import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    base_url="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_observation(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
```
def __init__(
    self,
    *,
    public_key: Optional[str] = None,
    secret_key: Optional[str] = None,
    base_url: Optional[str] = None,
    host: Optional[str] = None,
    timeout: Optional[int] = None,
    httpx_client: Optional[httpx.Client] = None,
    debug: bool = False,
    tracing_enabled: Optional[bool] = True,
    flush_at: Optional[int] = None,
    flush_interval: Optional[float] = None,
    environment: Optional[str] = None,
    release: Optional[str] = None,
    media_upload_thread_count: Optional[int] = None,
    sample_rate: Optional[float] = None,
    mask: Optional[MaskFunction] = None,
    blocked_instrumentation_scopes: Optional[List[str]] = None,
    should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
    additional_headers: Optional[Dict[str, str]] = None,
    tracer_provider: Optional[TracerProvider] = None,
    span_exporter: Optional[SpanExporter] = None,
):
    """Configure the client from explicit arguments and environment variables.

    Explicit arguments generally take precedence over their environment
    variable counterparts. The base URL is resolved in this order:
    ``base_url`` argument, ``LANGFUSE_BASE_URL``, ``host`` argument,
    ``LANGFUSE_HOST``, and finally ``"https://cloud.langfuse.com"``.

    When no public key or no secret key can be resolved, a warning is
    logged, a no-op tracer is installed and initialization stops early:
    no resource manager or API clients are created by this method.

    See the class docstring for a description of each parameter.

    Raises:
        ValueError: If the resolved sample rate is outside ``[0.0, 1.0]``
            (or if the sample-rate / timeout environment variables cannot
            be converted to numbers).
    """
    self._base_url = (
        base_url
        or os.environ.get(LANGFUSE_BASE_URL)
        or host
        or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
    )
    self._environment = environment or cast(
        str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
    )
    self._release = (
        release
        or os.environ.get(LANGFUSE_RELEASE, None)
        or get_common_release_envs()
    )
    self._project_id: Optional[str] = None

    # Only fall back to the environment when the argument was omitted.
    # A plain `sample_rate or ...` would discard an explicit 0.0 ("sample
    # nothing") because 0.0 is falsy, silently re-enabling full sampling.
    if sample_rate is None:
        sample_rate = float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
    if not 0.0 <= sample_rate <= 1.0:
        raise ValueError(
            f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
        )

    timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))

    # Tracing is on only if neither the argument nor the env var disables it.
    self._tracing_enabled = (
        tracing_enabled
        and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
    )
    if not self._tracing_enabled:
        langfuse_logger.info(
            "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
        )

    debug = (
        debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
    )
    if debug:
        logging.basicConfig(
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )
        langfuse_logger.setLevel(logging.DEBUG)

    # Without credentials the client degrades to a no-op tracer instead of raising.
    public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
    if public_key is None:
        langfuse_logger.warning(
            "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
            "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
        )
        self._otel_tracer = otel_trace_api.NoOpTracer()
        return

    secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
    if secret_key is None:
        langfuse_logger.warning(
            "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
            "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
        )
        self._otel_tracer = otel_trace_api.NoOpTracer()
        return

    if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
        langfuse_logger.warning(
            "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
        )

    if blocked_instrumentation_scopes is not None:
        # stacklevel=2 attributes the warning to the caller constructing the client.
        warnings.warn(
            "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
            "Use `should_export_span` instead. Example: "
            "from langfuse.span_filter import is_default_export_span; "
            'blocked={"scope"}; should_export_span=lambda span: '
            "is_default_export_span(span) and (span.instrumentation_scope is None or "
            "span.instrumentation_scope.name not in blocked).",
            DeprecationWarning,
            stacklevel=2,
        )

    # Initialize api and tracer if requirements are met
    # NOTE(review): the raw `release` argument is forwarded here rather than
    # the env-resolved `self._release` — confirm this is intended.
    self._resources = LangfuseResourceManager(
        public_key=public_key,
        secret_key=secret_key,
        base_url=self._base_url,
        timeout=timeout,
        environment=self._environment,
        release=release,
        flush_at=flush_at,
        flush_interval=flush_interval,
        httpx_client=httpx_client,
        media_upload_thread_count=media_upload_thread_count,
        sample_rate=sample_rate,
        mask=mask,
        tracing_enabled=self._tracing_enabled,
        blocked_instrumentation_scopes=blocked_instrumentation_scopes,
        should_export_span=should_export_span,
        additional_headers=additional_headers,
        tracer_provider=tracer_provider,
        span_exporter=span_exporter,
    )
    self._mask = self._resources.mask

    # Fall back to a no-op tracer when tracing is off or no tracer was created.
    self._otel_tracer = (
        self._resources.tracer
        if self._tracing_enabled and self._resources.tracer is not None
        else otel_trace_api.NoOpTracer()
    )
    self.api = self._resources.api
    self.async_api = self._resources.async_api
def start_observation(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    as_type: ObservationTypeLiteralNoEvent = "span",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> Union[
    LangfuseSpan,
    LangfuseGeneration,
    LangfuseAgent,
    LangfuseTool,
    LangfuseChain,
    LangfuseRetriever,
    LangfuseEvaluator,
    LangfuseEmbedding,
    LangfuseGuardrail,
]:
    """Start an observation without making it the current span.

    The observation is started on the client's tracer and wrapped in the
    class matching ``as_type``. It is not activated in the context; use
    start_as_current_observation() when a context-managed span is needed.

    When ``trace_context`` carries a ``trace_id``, the new span is started
    underneath a remote parent built from that context and is flagged as a
    root observation.

    Args:
        trace_context: Optional context for attaching to an existing trace
        name: Name of the observation
        as_type: Kind of observation to create (defaults to "span")
        input: Input data for the operation
        output: Output data from the operation
        metadata: Additional metadata to attach to the observation
        version: Version identifier for the code or component
        level: Importance level of the observation
        status_message: Optional status message for the observation
        completion_start_time: When the model started generating (for generation types)
        model: Name/identifier of the AI model used (for generation types)
        model_parameters: Parameters used for the model (for generation types)
        usage_details: Token usage information (for generation types)
        cost_details: Cost information (for generation types)
        prompt: Associated prompt template (for generation types)

    Returns:
        An observation object of the appropriate type that must be ended with .end()
    """
    # Everything except the OTel span itself is identical on both paths.
    observation_fields: Dict[str, Any] = {
        "as_type": as_type,
        "input": input,
        "output": output,
        "metadata": metadata,
        "version": version,
        "level": level,
        "status_message": status_message,
        "completion_start_time": completion_start_time,
        "model": model,
        "model_parameters": model_parameters,
        "usage_details": usage_details,
        "cost_details": cost_details,
        "prompt": prompt,
    }

    if trace_context:
        remote_trace_id = trace_context.get("trace_id", None)
        remote_parent_id = trace_context.get("parent_span_id", None)

        if remote_trace_id:
            remote_parent = self._create_remote_parent_span(
                trace_id=remote_trace_id, parent_span_id=remote_parent_id
            )

            # The wrapper is built while the remote parent is still the
            # active span, so the whole creation happens under that parent.
            with otel_trace_api.use_span(
                cast(otel_trace_api.Span, remote_parent)
            ):
                rooted_span = self._otel_tracer.start_span(name=name)
                rooted_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                return self._create_observation_from_otel_span(
                    otel_span=rooted_span, **observation_fields
                )

    # No usable trace context: start the span in whatever context is current.
    plain_span = self._otel_tracer.start_span(name=name)

    return self._create_observation_from_otel_span(
        otel_span=plain_span, **observation_fields
    )
Create a new observation of the specified type.
This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation
- status_message: Optional status message for the observation
- completion_start_time: When the model started generating (for generation types)
- model: Name/identifier of the AI model used (for generation types)
- model_parameters: Parameters used for the model (for generation types)
- usage_details: Token usage information (for generation types)
- cost_details: Cost information (for generation types)
- prompt: Associated prompt template (for generation types)
Returns:
An observation object of the appropriate type that must be ended with .end()
def start_as_current_observation(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    as_type: ObservationTypeLiteralNoEvent = "span",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
    end_on_exit: Optional[bool] = None,
) -> Union[
    _AgnosticContextManager[LangfuseGeneration],
    _AgnosticContextManager[LangfuseSpan],
    _AgnosticContextManager[LangfuseAgent],
    _AgnosticContextManager[LangfuseTool],
    _AgnosticContextManager[LangfuseChain],
    _AgnosticContextManager[LangfuseRetriever],
    _AgnosticContextManager[LangfuseEvaluator],
    _AgnosticContextManager[LangfuseEmbedding],
    _AgnosticContextManager[LangfuseGuardrail],
]:
    """Open an observation as the current span and return its context manager.

    Intended for use in a ``with`` statement: the observation of the
    requested type becomes the current span for the duration of the block.
    When ``trace_context`` carries a ``trace_id``, the observation is
    created under a remote parent built from that context; otherwise it is
    started in the currently active context.

    Args:
        trace_context: Optional context for attaching to an existing trace
        name: Name of the observation (e.g., function or operation name)
        as_type: Kind of observation to create (defaults to "span")
        input: Input data for the operation (can be any JSON-serializable object)
        output: Output data from the operation (can be any JSON-serializable object)
        metadata: Additional metadata to attach to the observation
        version: Version identifier for the code or component
        level: Importance level of the observation (info, warning, error)
        status_message: Optional status message for the observation
        end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.

        The following parameters are only forwarded for generation-like types
        ("generation" or "embedding"):
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management

    Returns:
        A context manager that yields the appropriate observation type based on as_type

    Example:
        ```python
        # Create a span
        with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
            # Do work
            result = process_data()
            span.update(output=result)

            # Create a child span automatically
            with span.start_as_current_observation(name="sub-operation") as child_span:
                # Do sub-operation work
                child_span.update(output="sub-result")

        # Create a tool observation
        with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
            # Do tool work
            results = search_web(query)
            tool.update(output=results)

        # Create a generation observation
        with langfuse.start_as_current_observation(
            name="answer-generation",
            as_type="generation",
            model="gpt-4"
        ) as generation:
            # Generate answer
            response = llm.generate(...)
            generation.update(output=response)
        ```
    """
    # Fields accepted by every observation type.
    span_fields: Dict[str, Any] = {
        "input": input,
        "output": output,
        "metadata": metadata,
        "version": version,
        "level": level,
        "status_message": status_message,
    }

    # Step 1: decide which set of fields this observation type receives.
    if as_type in get_observation_types_list(ObservationTypeGenerationLike):
        observation_fields: Dict[str, Any] = {
            **span_fields,
            "completion_start_time": completion_start_time,
            "model": model,
            "model_parameters": model_parameters,
            "usage_details": usage_details,
            "cost_details": cost_details,
            "prompt": prompt,
        }
    elif as_type in get_observation_types_list(ObservationTypeSpanLike):
        observation_fields = span_fields
    else:
        # This should never be reached since all valid types are handled above
        langfuse_logger.warning(
            f"Unknown observation type: {as_type}, falling back to span"
        )
        # The fallback ignores trace_context and always opens a plain span.
        return self._start_as_current_otel_span_with_processed_media(
            as_type="span",
            name=name,
            end_on_exit=end_on_exit,
            **span_fields,
        )

    # Step 2: attach to a remote parent when the trace context names a trace.
    manager: Any
    if trace_context:
        remote_trace_id = trace_context.get("trace_id", None)
        remote_parent_id = trace_context.get("parent_span_id", None)

        if remote_trace_id:
            remote_parent_span = self._create_remote_parent_span(
                trace_id=remote_trace_id, parent_span_id=remote_parent_id
            )
            manager = self._create_span_with_parent_context(
                as_type=as_type,
                name=name,
                remote_parent_span=remote_parent_span,
                parent=None,
                end_on_exit=end_on_exit,
                **observation_fields,
            )
            return cast(
                Union[
                    _AgnosticContextManager[LangfuseGeneration],
                    _AgnosticContextManager[LangfuseSpan],
                    _AgnosticContextManager[LangfuseAgent],
                    _AgnosticContextManager[LangfuseTool],
                    _AgnosticContextManager[LangfuseChain],
                    _AgnosticContextManager[LangfuseRetriever],
                    _AgnosticContextManager[LangfuseEvaluator],
                    _AgnosticContextManager[LangfuseEmbedding],
                    _AgnosticContextManager[LangfuseGuardrail],
                ],
                manager,
            )

    # Step 3: no usable trace context — open the span in the current context.
    manager = self._start_as_current_otel_span_with_processed_media(
        as_type=as_type,
        name=name,
        end_on_exit=end_on_exit,
        **observation_fields,
    )
    return cast(
        Union[
            _AgnosticContextManager[LangfuseGeneration],
            _AgnosticContextManager[LangfuseSpan],
            _AgnosticContextManager[LangfuseAgent],
            _AgnosticContextManager[LangfuseTool],
            _AgnosticContextManager[LangfuseChain],
            _AgnosticContextManager[LangfuseRetriever],
            _AgnosticContextManager[LangfuseEvaluator],
            _AgnosticContextManager[LangfuseEmbedding],
            _AgnosticContextManager[LangfuseGuardrail],
        ],
        manager,
    )
Create a new observation and set it as the current span in a context manager.
This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.
The created observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation (e.g., function or operation name)
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation (info, warning, error)
- status_message: Optional status message for the observation
- end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
The following parameters apply only when as_type is "generation" or "embedding":
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Returns:
A context manager that yields the appropriate observation type based on as_type
Example:
```python
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_observation(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
```
def update_current_generation(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> None:
    """Apply new details to the generation that is currently active.

    Looks up the current span in the active context, wraps it as a
    generation and forwards the given fields to it. Nothing happens when
    tracing is disabled or when there is no current span. Handy for
    recording output, usage or cost once the model call has finished.

    Args:
        name: The generation name
        input: Updated input data for the model
        output: Output from the model (e.g., completions)
        metadata: Additional metadata to attach to the generation
        version: Version identifier for the model or component
        level: Importance level of the generation (info, warning, error)
        status_message: Optional status message for the generation
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management

    Example:
        ```python
        with langfuse.start_as_current_generation(name="answer-query") as generation:
            # Initial setup and API call
            response = llm.generate(...)

            # Update with results that weren't available at creation time
            langfuse.update_current_generation(
                output=response.text,
                usage_details={
                    "prompt_tokens": response.usage.prompt_tokens,
                    "completion_tokens": response.usage.completion_tokens
                }
            )
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None:
        # Nothing is active in this context, so there is nothing to update.
        return

    # NOTE(review): unlike update_current_span, no environment/release is
    # passed to the wrapper here — confirm whether that difference is intended.
    wrapper = LangfuseGeneration(otel_span=active_span, langfuse_client=self)

    # Renaming goes through the OTel span; the wrapper update has no name field.
    if name:
        active_span.update_name(name)

    wrapper.update(
        input=input,
        output=output,
        metadata=metadata,
        version=version,
        level=level,
        status_message=status_message,
        completion_start_time=completion_start_time,
        model=model,
        model_parameters=model_parameters,
        usage_details=usage_details,
        cost_details=cost_details,
        prompt=prompt,
    )
Update the current active generation span with new information.
This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.
Arguments:
- name: The generation name
- input: Updated input data for the model
- output: Output from the model (e.g., completions)
- metadata: Additional metadata to associate with the generation
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Example:
```python
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
```
def update_current_span(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> None:
    """Apply new details to the span that is currently active.

    Looks up the current span in the active context, wraps it as a
    Langfuse span and forwards the given fields to it. Nothing happens
    when tracing is disabled or when there is no current span. Handy for
    attaching outputs or metadata that only become known mid-execution.

    Args:
        name: The span name
        input: Updated input data for the operation
        output: Output data from the operation
        metadata: Additional metadata to attach to the span
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-data") as span:
            # Initial processing
            result = process_first_part()

            # Update with intermediate results
            langfuse.update_current_span(metadata={"intermediate_result": result})

            # Continue processing
            final_result = process_second_part(result)

            # Final update
            langfuse.update_current_span(output=final_result)
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None:
        # Nothing is active in this context, so there is nothing to update.
        return

    wrapper = LangfuseSpan(
        otel_span=active_span,
        langfuse_client=self,
        environment=self._environment,
        release=self._release,
    )

    # Renaming goes through the OTel span; the wrapper update has no name field.
    if name:
        active_span.update_name(name)

    wrapper.update(
        input=input,
        output=output,
        metadata=metadata,
        version=version,
        level=level,
        status_message=status_message,
    )
Update the current active span with new information.
This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.
Arguments:
- name: The span name
- input: Updated input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Example:
```python
with langfuse.start_as_current_observation(name="process-data") as span:
    # Initial processing
    result = process_first_part()

    # Update with intermediate results
    langfuse.update_current_span(metadata={"intermediate_result": result})

    # Continue processing
    final_result = process_second_part(result)

    # Final update
    langfuse.update_current_span(output=final_result)
```
@deprecated(
    "Trace-level input/output is deprecated. "
    "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
    "This method will be removed in a future major version."
)
def set_current_trace_io(
    self,
    *,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
) -> None:
    """Record input and output on the trace that owns the current span.

    .. deprecated::
        Kept only for backward compatibility with Langfuse platform features
        that still read trace-level input/output (e.g., legacy LLM-as-a-judge
        evaluators). It will be removed in a future major version.

        To set other trace attributes (user_id, session_id, metadata, tags,
        version), use :meth:`propagate_attributes` instead.

    Nothing happens when tracing is disabled, when there is no current span,
    or when the current span is not recording.

    Args:
        input: Input data to associate with the trace.
        output: Output data to associate with the trace.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None or not active_span.is_recording():
        return

    # We need to preserve the class to keep the correct observation type,
    # so the wrapper class is chosen from the type already stored on the span.
    observation_type = active_span.attributes.get(  # type: ignore[attr-defined]
        LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
    )
    wrapper_class = self._get_span_class(observation_type)
    wrapper = wrapper_class(
        otel_span=active_span,
        langfuse_client=self,
        environment=self._environment,
        release=self._release,
    )

    wrapper.set_trace_io(
        input=input,
        output=output,
    )
Set trace-level input and output for the current span's trace.
Deprecated: This is a legacy method for backward compatibility with Langfuse platform features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future major version.
For setting other trace attributes (user_id, session_id, metadata, tags, version), use propagate_attributes() instead.
Arguments:
- input: Input data to associate with the trace.
- output: Output data to associate with the trace.
def set_current_trace_as_public(self) -> None:
    """Publish the trace of the currently active span so it is viewable via its URL.

    Once a trace is published, anyone holding the trace link can view the whole
    trace without being logged in to Langfuse. This cannot be undone
    programmatically.

    This is a convenience wrapper for code running inside a traced function
    that has no direct handle on the span object. The call is a no-op when
    tracing is disabled, when there is no current span, or when the current
    span is not recording.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None or not active_span.is_recording():
        return

    # Wrap the span with the class matching its recorded observation type
    # (falling back to "span") so the observation type is preserved.
    observation_type = active_span.attributes.get(  # type: ignore[attr-defined]
        LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
    )
    wrapper_cls = self._get_span_class(observation_type)
    # NOTE(review): `release` is not forwarded here, unlike in
    # set_current_trace_io - confirm this is intentional.
    wrapper = wrapper_cls(
        otel_span=active_span,
        langfuse_client=self,
        environment=self._environment,
    )
    wrapper.set_trace_as_public()
Make the current trace publicly accessible via its URL.
When a trace is published, anyone with the trace link can view the full trace without needing to be logged in to Langfuse. This action cannot be undone programmatically - once published, the entire trace becomes public.
This is a convenience method that publishes the trace from the currently active span context. Use this when you want to make a trace public from within a traced function without needing direct access to the span object.
def create_event(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> LangfuseEvent:
    """Create a Langfuse observation of type 'EVENT'.

    The event is started and ended with the same timestamp. Without a usable
    ``trace_context`` the event becomes a child of the current span in the
    context. When ``trace_context`` carries a ``trace_id``, the event is
    instead started under a remote parent built from that trace ID (and the
    optional ``parent_span_id``) and flagged with the ``AS_ROOT`` attribute.

    Args:
        trace_context: Optional context for connecting to an existing trace
        name: Name of the event (e.g., function or operation name)
        input: Input data for the operation (can be any JSON-serializable object)
        output: Output data from the operation (can be any JSON-serializable object)
        metadata: Additional metadata to associate with the event
        version: Version identifier for the code or component
        level: Importance level of the event (info, warning, error)
        status_message: Optional status message for the event

    Returns:
        The Langfuse Event object

    Example:
        ```python
        event = langfuse.create_event(name="process-event")
        ```
    """
    timestamp = time_ns()

    def _wrap_and_end(otel_span: otel_trace_api.Span) -> LangfuseEvent:
        # Build the event wrapper and end it immediately at the start timestamp.
        event = LangfuseEvent(
            otel_span=otel_span,
            langfuse_client=self,
            environment=self._environment,
            release=self._release,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
        )
        return cast(LangfuseEvent, event.end(end_time=timestamp))

    if trace_context:
        remote_trace_id = trace_context.get("trace_id", None)
        remote_parent_id = trace_context.get("parent_span_id", None)

        if remote_trace_id:
            remote_parent = self._create_remote_parent_span(
                trace_id=remote_trace_id, parent_span_id=remote_parent_id
            )

            # The wrapper is built while the remote parent is still the
            # current span, exactly as the span itself is started.
            with otel_trace_api.use_span(
                cast(otel_trace_api.Span, remote_parent)
            ):
                linked_span = self._otel_tracer.start_span(
                    name=name, start_time=timestamp
                )
                linked_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                return _wrap_and_end(linked_span)

    return _wrap_and_end(
        self._otel_tracer.start_span(name=name, start_time=timestamp)
    )
Create a new Langfuse observation of type 'EVENT'.
The created Langfuse Event observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the span (e.g., function or operation name)
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Returns:
The Langfuse Event object
Example:
event = langfuse.create_event(name="process-event")
@staticmethod
def create_trace_id(*, seed: Optional[str] = None) -> str:
    """Return a trace ID usable with Langfuse, random or derived from a seed.

    Trace IDs must be 32 lowercase hexadecimal characters (16 bytes). To
    correlate an external identifier with a Langfuse trace, pass that
    identifier as ``seed``: the same seed always yields the same ID.

    Args:
        seed: Optional string used for deterministic ID generation. When it is
            ``None`` or an empty string, a random ID is generated instead.

    Returns:
        A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

    Example:
        ```python
        # Random trace ID
        trace_id = langfuse.create_trace_id()

        # Deterministic ID correlated with an external identifier
        correlated_trace_id = langfuse.create_trace_id(seed="external-system-123456")

        # Use the ID with a trace context
        with langfuse.start_as_current_observation(
            name="process-request",
            trace_context={"trace_id": trace_id}
        ) as span:
            pass
        ```
    """
    if seed:
        # The first 16 bytes of the SHA-256 digest give the 32 hex characters.
        digest = sha256(seed.encode("utf-8")).digest()
        return digest[:16].hex()

    random_trace_id = RandomIdGenerator().generate_trace_id()
    return Langfuse._format_otel_trace_id(random_trace_id)
Create a unique trace ID for use with Langfuse.
This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.
Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.
Arguments:
- seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:
A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
Example:
# Generate a random trace ID trace_id = langfuse.create_trace_id() # Generate a deterministic ID based on a seed session_trace_id = langfuse.create_trace_id(seed="session-456") # Correlate an external ID with a Langfuse trace ID external_id = "external-system-123456" correlated_trace_id = langfuse.create_trace_id(seed=external_id) # Use the ID with trace context with langfuse.start_as_current_observation( name="process-request", trace_context={"trace_id": trace_id} ) as span: # Operation will be part of the specific trace pass
def create_score(
    self,
    *,
    name: str,
    value: Union[float, str],
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
) -> None:
    """Record a score for a trace, observation, session, or dataset run.

    Scores can capture quality metrics, user feedback, or automated
    evaluations. The score is tagged with the client's environment and handed
    to the client's resource manager as a ``score-create`` event. Nothing is
    recorded when tracing is disabled. Any exception raised while building or
    enqueueing the event is logged rather than propagated.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
        session_id: ID of the Langfuse session to associate the score with
        dataset_run_id: ID of the Langfuse dataset run to associate the score with
        trace_id: ID of the Langfuse trace to associate the score with
        observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score
        timestamp: Optional timestamp for the score (defaults to current UTC time)

    Example:
        ```python
        # Numeric score on a trace
        langfuse.create_score(
            name="accuracy",
            value=0.92,
            trace_id="abcdef1234567890abcdef1234567890",
            data_type="NUMERIC",
            comment="High accuracy with minor irrelevant details"
        )

        # Categorical score on a specific observation
        langfuse.create_score(
            name="sentiment",
            value="positive",
            trace_id="abcdef1234567890abcdef1234567890",
            observation_id="abcdef1234567890",
            data_type="CATEGORICAL"
        )
        ```
    """
    if not self._tracing_enabled:
        return

    score_id = score_id or self._create_observation_id()

    try:
        score_body = ScoreBody(
            id=score_id,
            sessionId=session_id,
            datasetRunId=dataset_run_id,
            traceId=trace_id,
            observationId=observation_id,
            name=name,
            value=value,
            dataType=data_type,  # type: ignore
            comment=comment,
            configId=config_id,
            environment=self._environment,
            metadata=metadata,
        )

        score_event = {
            "id": self.create_trace_id(),
            "type": "score-create",
            "timestamp": timestamp or _get_timestamp(),
            "body": score_body,
        }

        resources = self._resources
        if resources is None:
            return

        # Force the score into the sample when there is no trace ID or when
        # the trace ID is a legacy one, i.e. not a valid 32-hexchar ID.
        force_sample = (not trace_id) or (not self._is_valid_trace_id(trace_id))

        resources.add_score_task(score_event, force_sample=force_sample)

    except Exception as e:
        langfuse_logger.exception(
            f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
        )
Create a score for a specific trace or observation.
This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
- session_id: ID of the Langfuse session to associate the score with
- dataset_run_id: ID of the Langfuse dataset run to associate the score with
- trace_id: ID of the Langfuse trace to associate the score with
- observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
- timestamp: Optional timestamp for the score (defaults to current UTC time)
Example:
# Create a numeric score for accuracy langfuse.create_score( name="accuracy", value=0.92, trace_id="abcdef1234567890abcdef1234567890", data_type="NUMERIC", comment="High accuracy with minor irrelevant details" ) # Create a categorical score for sentiment langfuse.create_score( name="sentiment", value="positive", trace_id="abcdef1234567890abcdef1234567890", observation_id="abcdef1234567890", data_type="CATEGORICAL" )
def score_current_span(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Score the span that is currently active in the context.

    The trace ID and observation ID are read from the current span, so the
    caller does not need to know them. Nothing happens when there is no
    current span.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_generation(name="answer-query") as generation:
            response = generate_answer(...)
            generation.update(output=response)

            langfuse.score_current_span(
                name="relevance",
                value=0.85,
                data_type="NUMERIC",
                comment="Mostly relevant but contains some tangential information",
                metadata={"model": "gpt-4", "prompt_version": "v2"}
            )
        ```
    """
    active_span = self._get_current_otel_span()
    if active_span is None:
        return

    trace_id = self._get_otel_trace_id(active_span)
    observation_id = self._get_otel_span_id(active_span)

    langfuse_logger.info(
        f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
    )

    # The casts only satisfy the type checker; values are forwarded unchanged.
    typed_value = cast(str, value)
    typed_data_type = cast(Literal["CATEGORICAL", "TEXT"], data_type)

    self.create_score(
        trace_id=trace_id,
        observation_id=observation_id,
        name=name,
        value=typed_value,
        score_id=score_id,
        data_type=typed_data_type,
        comment=comment,
        config_id=config_id,
        metadata=metadata,
    )
Create a score for the current active span.
This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation: # Generate answer response = generate_answer(...) generation.update(output=response) # Score the generation langfuse.score_current_span( name="relevance", value=0.85, data_type="NUMERIC", comment="Mostly relevant but contains some tangential information", metadata={"model": "gpt-4", "prompt_version": "v2"} )
def score_current_trace(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Score the whole trace that the currently active span belongs to.

    Unlike ``score_current_span``, no observation ID is attached, so the score
    applies to the entire trace. This is useful for rating the overall quality
    of an operation. Nothing happens when there is no current span.

    Args:
        name: Name of the score (e.g., "user_satisfaction", "overall_quality")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-user-request") as span:
            result = process_complete_request()
            span.update(output=result)

            langfuse.score_current_trace(
                name="overall_quality",
                value=0.95,
                data_type="NUMERIC",
                comment="High quality end-to-end response",
                metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
            )
        ```
    """
    active_span = self._get_current_otel_span()
    if active_span is None:
        return

    trace_id = self._get_otel_trace_id(active_span)

    langfuse_logger.info(
        f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
    )

    # The casts only satisfy the type checker; values are forwarded unchanged.
    typed_value = cast(str, value)
    typed_data_type = cast(Literal["CATEGORICAL", "TEXT"], data_type)

    self.create_score(
        trace_id=trace_id,
        name=name,
        value=typed_value,
        score_id=score_id,
        data_type=typed_data_type,
        comment=comment,
        config_id=config_id,
        metadata=metadata,
    )
Create a score for the current trace.
This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation.
Arguments:
- name: Name of the score (e.g., "user_satisfaction", "overall_quality")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_observation(name="process-user-request") as span: # Process request result = process_complete_request() span.update(output=result) # Score the overall trace langfuse.score_current_trace( name="overall_quality", value=0.95, data_type="NUMERIC", comment="High quality end-to-end response", metadata={"evaluator": "gpt-4", "criteria": "comprehensive"} )
def flush(self) -> None:
    """Force all pending spans, scores, and other events to be sent to the Langfuse API.

    Use this when data must be delivered before the program proceeds, instead
    of waiting for the automatic flush interval. The call does nothing when
    the client has no resource manager.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="operation") as span:
            pass  # Do work...

        # Ensure all data is sent to Langfuse before proceeding
        langfuse.flush()
        ```
    """
    resources = self._resources
    if resources is None:
        return

    resources.flush()
Force flush all pending spans and events to the Langfuse API.
This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.
Example:
# Record some spans and scores with langfuse.start_as_current_observation(name="operation") as span: # Do work... pass # Ensure all data is sent to Langfuse before proceeding langfuse.flush() # Continue with other work
def shutdown(self) -> None:
    """Shut down the Langfuse client, flushing pending data first.

    Call this when the application is exiting so that pending data is sent to
    the API and background threads are terminated, preventing data loss and
    resource leaks. For most applications, using the client as a context
    manager or relying on the automatic shutdown via atexit is sufficient.
    The call does nothing when the client has no resource manager.

    Example:
        ```python
        langfuse = Langfuse(public_key="...", secret_key="...")

        # ... use Langfuse throughout your application ...

        # When the application is shutting down
        langfuse.shutdown()
        ```
    """
    resources = self._resources
    if resources is None:
        return

    resources.shutdown()
Shut down the Langfuse client and flush all pending data.
This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.
It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.
Example:
# Initialize Langfuse langfuse = Langfuse(public_key="...", secret_key="...") # Use Langfuse throughout your application # ... # When application is shutting down langfuse.shutdown()
def get_current_trace_id(self) -> Optional[str]:
    """Return the trace ID of the span that is currently active in the context.

    The ID can be used for referencing the trace in logs or external systems,
    or for creating related operations.

    Returns:
        The current trace ID as a 32-character lowercase hexadecimal string,
        or None if tracing is disabled or there is no active span.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-request") as span:
            trace_id = langfuse.get_current_trace_id()
            log.info(f"Processing request with trace_id: {trace_id}")
            external_system.process(data, trace_id=trace_id)
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
        )
        return None

    active_span = self._get_current_otel_span()
    if not active_span:
        return None

    return self._get_otel_trace_id(active_span)
Get the trace ID of the current active span.
This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.
Returns:
The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.
Example:
with langfuse.start_as_current_observation(name="process-request") as span: # Get the current trace ID for reference trace_id = langfuse.get_current_trace_id() # Use it for external correlation log.info(f"Processing request with trace_id: {trace_id}") # Or pass to another system external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
    """Return the observation ID (span ID) of the currently active span.

    The ID can be used for referencing the observation in logs or external
    systems, or for creating scores and other related operations.

    Returns:
        The current observation ID as a 16-character lowercase hexadecimal string,
        or None if tracing is disabled or there is no active span.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-user-query") as span:
            observation_id = langfuse.get_current_observation_id()
            cache.set(f"query_{query_id}_observation", observation_id)
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
        )
        return None

    active_span = self._get_current_otel_span()
    if not active_span:
        return None

    return self._get_otel_span_id(active_span)
Get the observation ID (span ID) of the current active span.
This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.
Returns:
The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.
Example:
with langfuse.start_as_current_observation(name="process-user-query") as span: # Get the current observation ID observation_id = langfuse.get_current_observation_id() # Store it for later reference cache.set(f"query_{query_id}_observation", observation_id) # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
    """Build the URL for viewing a trace in the Langfuse UI.

    The URL is useful for links in logs, notifications, or debugging tools.

    Args:
        trace_id: Optional trace ID to generate a URL for. If not provided,
            the trace ID of the current active span will be used.

    Returns:
        A URL string pointing to the trace in the Langfuse UI,
        or None if the project ID couldn't be retrieved or no trace ID is available.

    Example:
        ```python
        # URL for the current trace
        with langfuse.start_as_current_observation(name="process-request") as span:
            trace_url = langfuse.get_trace_url()
            log.info(f"Processing trace: {trace_url}")

        # URL for a specific trace
        specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
        ```
    """
    final_trace_id = trace_id or self.get_current_trace_id()
    if not final_trace_id:
        # The project ID is not looked up when there is no trace to link to.
        return None

    project_id = self._get_project_id()
    if not project_id:
        return None

    return f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
Get the URL to view a trace in the Langfuse UI.
This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.
Arguments:
- trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:
A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.
Example:
# Get URL for the current trace with langfuse.start_as_current_observation(name="process-request") as span: trace_url = langfuse.get_trace_url() log.info(f"Processing trace: {trace_url}") # Get URL for a specific trace specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef") send_notification(f"Review needed for trace: {specific_trace_url}")
def get_dataset(
    self,
    name: str,
    *,
    fetch_items_page_size: Optional[int] = 50,
    version: Optional[datetime] = None,
) -> "DatasetClient":
    """Fetch a dataset and all of its items by dataset name.

    The dataset itself is fetched first; its items are then fetched page by
    page, starting at page 1, until the reported total page count is reached.

    Args:
        name (str): The name of the dataset to fetch.
        fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
        version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC).
            If provided, returns the state of items at the specified UTC timestamp.
            If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.

    Returns:
        DatasetClient: The dataset with the given name.

    Raises:
        Error: Re-raised after being passed to ``handle_fern_exception`` when an
            API call fails.
    """
    try:
        langfuse_logger.debug(f"Getting datasets {name}")
        dataset = self.api.datasets.get(dataset_name=self._url_encode(name))

        collected_items = []
        page = 1
        has_more_pages = True

        while has_more_pages:
            batch = self.api.dataset_items.list(
                dataset_name=self._url_encode(name, is_url_param=True),
                page=page,
                limit=fetch_items_page_size,
                version=version,
            )
            collected_items.extend(batch.data)

            # Stop once the page just fetched is the last one reported.
            has_more_pages = page < batch.meta.total_pages
            page += 1

        return DatasetClient(
            dataset=dataset,
            items=collected_items,
            version=version,
            langfuse_client=self,
        )

    except Error as e:
        handle_fern_exception(e)
        raise e
Fetch a dataset by its name.
Arguments:
- name (str): The name of the dataset to fetch.
- fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
- version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC). If provided, returns the state of items at the specified UTC timestamp. If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
Returns:
DatasetClient: The dataset with the given name.
def get_dataset_run(
    self, *, dataset_name: str, run_name: str
) -> DatasetRunWithItems:
    """Fetch a single dataset run, identified by dataset name and run name.

    Both names are URL-encoded before being sent to the API.

    Args:
        dataset_name (str): The name of the dataset.
        run_name (str): The name of the run.

    Returns:
        DatasetRunWithItems: The dataset run with its items.

    Raises:
        Error: Re-raised after being passed to ``handle_fern_exception`` when the
            API call fails.
    """
    try:
        run = self.api.datasets.get_run(
            dataset_name=self._url_encode(dataset_name),
            run_name=self._url_encode(run_name),
            request_options=None,
        )
        return cast(DatasetRunWithItems, run)
    except Error as e:
        handle_fern_exception(e)
        raise e
Fetch a dataset run by dataset name and run name.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DatasetRunWithItems: The dataset run with its items.
def get_dataset_runs(
    self,
    *,
    dataset_name: str,
    page: Optional[int] = None,
    limit: Optional[int] = None,
) -> PaginatedDatasetRuns:
    """Fetch one page of the runs belonging to a dataset.

    The dataset name is URL-encoded before being sent to the API.

    Args:
        dataset_name (str): The name of the dataset.
        page (Optional[int]): Page number, starts at 1.
        limit (Optional[int]): Limit of items per page.

    Returns:
        PaginatedDatasetRuns: Paginated list of dataset runs.

    Raises:
        Error: Re-raised after being passed to ``handle_fern_exception`` when the
            API call fails.
    """
    try:
        runs_page = self.api.datasets.get_runs(
            dataset_name=self._url_encode(dataset_name),
            page=page,
            limit=limit,
            request_options=None,
        )
        return cast(PaginatedDatasetRuns, runs_page)
    except Error as e:
        handle_fern_exception(e)
        raise e
Fetch all runs for a dataset.
Arguments:
- dataset_name (str): The name of the dataset.
- page (Optional[int]): Page number, starts at 1.
- limit (Optional[int]): Limit of items per page.
Returns:
PaginatedDatasetRuns: Paginated list of dataset runs.
def delete_dataset_run(
    self, *, dataset_name: str, run_name: str
) -> DeleteDatasetRunResponse:
    """Permanently remove a dataset run together with all of its run items.

    This action is irreversible. Both names are passed through
    ``self._url_encode`` before being handed to the API client.

    Args:
        dataset_name (str): The name of the dataset.
        run_name (str): The name of the run.

    Returns:
        DeleteDatasetRunResponse: Confirmation of deletion.

    Raises:
        Error: Re-raised after being reported via ``handle_fern_exception``.
    """
    try:
        confirmation = self.api.datasets.delete_run(
            dataset_name=self._url_encode(dataset_name),
            run_name=self._url_encode(run_name),
            request_options=None,
        )
    except Error as api_error:
        # Report the API failure first, then let the caller see it.
        handle_fern_exception(api_error)
        raise api_error

    return cast(DeleteDatasetRunResponse, confirmation)
Delete a dataset run and all its run items. This action is irreversible.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DeleteDatasetRunResponse: Confirmation of deletion.
def run_experiment(
    self,
    *,
    name: str,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    data: ExperimentData,
    task: TaskFunction,
    evaluators: Optional[List[EvaluatorFunction]] = None,
    composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
    run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
    max_concurrency: int = 50,
    metadata: Optional[Dict[str, str]] = None,
    _dataset_version: Optional[datetime] = None,
) -> ExperimentResult:
    """Run an experiment on a dataset with automatic tracing and evaluation.

    This method executes a task function on each item in the provided dataset,
    automatically traces all executions with Langfuse for observability, runs
    item-level and run-level evaluators on the outputs, and returns comprehensive
    results with evaluation metrics.

    The experiment system provides:
    - Automatic tracing of all task executions
    - Concurrent processing with configurable limits
    - Comprehensive error handling that isolates failures
    - Integration with Langfuse datasets for experiment tracking
    - Flexible evaluation framework supporting both sync and async evaluators

    Args:
        name: Human-readable name for the experiment. Used for identification
            in the Langfuse UI.
        run_name: Optional exact name for the experiment run. If provided, this will be
            used as the exact dataset run name if the `data` contains Langfuse dataset items.
            If not provided, this will default to the experiment name appended with an ISO timestamp.
        description: Optional description explaining the experiment's purpose,
            methodology, or expected outcomes.
        data: Array of data items to process. Can be either:
            - List of dict-like items with 'input', 'expected_output', 'metadata' keys
            - List of Langfuse DatasetItem objects from dataset.items
        task: Function that processes each data item and returns output.
            Must accept 'item' as keyword argument and can return sync or async results.
            The task function signature should be: task(*, item, **kwargs) -> Any
        evaluators: List of functions to evaluate each item's output individually.
            Each evaluator receives input, output, expected_output, and metadata.
            Can return single Evaluation dict or list of Evaluation dicts.
            Defaults to None, which is treated as an empty list (no item-level evaluators).
        composite_evaluator: Optional function that creates composite scores from item-level evaluations.
            Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
            plus the list of evaluations from item-level evaluators. Useful for weighted averages,
            pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
        run_evaluators: List of functions to evaluate the entire experiment run.
            Each run evaluator receives all item_results and can compute aggregate metrics.
            Useful for calculating averages, distributions, or cross-item comparisons.
            Defaults to None, which is treated as an empty list (no run-level evaluators).
        max_concurrency: Maximum number of concurrent task executions (default: 50).
            Controls the number of items processed simultaneously. Adjust based on
            API rate limits and system resources.
        metadata: Optional metadata dictionary to attach to all experiment traces.
            This metadata will be included in every trace created during the experiment.
            If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
        _dataset_version: Internal parameter. Forwarded unchanged to the async
            experiment runner as `dataset_version`.

    Returns:
        ExperimentResult containing:
        - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
        - item_results: List of results for each processed item with outputs and evaluations
        - run_evaluations: List of aggregate evaluation results for the entire run
        - experiment_id: Stable identifier for the experiment run across all items
        - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
        - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)

    Raises:
        ValueError: If required parameters are missing or invalid
        Exception: If experiment setup fails (individual item failures are handled gracefully)

    Examples:
        Basic experiment with local data:
        ```python
        def summarize_text(*, item, **kwargs):
            return f"Summary: {item['input'][:50]}..."

        def length_evaluator(*, input, output, expected_output=None, **kwargs):
            return {
                "name": "output_length",
                "value": len(output),
                "comment": f"Output contains {len(output)} characters"
            }

        result = langfuse.run_experiment(
            name="Text Summarization Test",
            description="Evaluate summarization quality and length",
            data=[
                {"input": "Long article text...", "expected_output": "Expected summary"},
                {"input": "Another article...", "expected_output": "Another summary"}
            ],
            task=summarize_text,
            evaluators=[length_evaluator]
        )

        print(f"Processed {len(result.item_results)} items")
        for item_result in result.item_results:
            print(f"Input: {item_result.item['input']}")
            print(f"Output: {item_result.output}")
            print(f"Evaluations: {item_result.evaluations}")
        ```

        Advanced experiment with async task and multiple evaluators:
        ```python
        async def llm_task(*, item, **kwargs):
            # Simulate async LLM call
            response = await openai_client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": item["input"]}]
            )
            return response.choices[0].message.content

        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
            if expected_output and expected_output.lower() in output.lower():
                return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
            return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

        def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
            # Simulate toxicity check
            toxicity_score = check_toxicity(output)  # Your toxicity checker
            return {
                "name": "toxicity",
                "value": toxicity_score,
                "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
            }

        def average_accuracy(*, item_results, **kwargs):
            accuracies = [
                evaluation.value for result in item_results
                for evaluation in result.evaluations
                if evaluation.name == "accuracy"
            ]
            return {
                "name": "average_accuracy",
                "value": sum(accuracies) / len(accuracies) if accuracies else 0,
                "comment": f"Average accuracy across {len(accuracies)} items"
            }

        result = langfuse.run_experiment(
            name="LLM Safety and Accuracy Test",
            description="Evaluate model accuracy and safety across diverse prompts",
            data=test_dataset,  # Your dataset items
            task=llm_task,
            evaluators=[accuracy_evaluator, toxicity_evaluator],
            run_evaluators=[average_accuracy],
            max_concurrency=5,  # Limit concurrent API calls
            metadata={"model": "gpt-4", "temperature": 0.7}
        )
        ```

        Using with Langfuse datasets:
        ```python
        # Get dataset from Langfuse
        dataset = langfuse.get_dataset("my-eval-dataset")

        result = dataset.run_experiment(
            name="Production Model Evaluation",
            description="Monthly evaluation of production model performance",
            task=my_production_task,
            evaluators=[accuracy_evaluator, latency_evaluator]
        )

        # Results automatically linked to dataset in Langfuse UI
        print(f"View results: {result.dataset_run_url}")
        ```

    Note:
        - Task and evaluator functions can be either synchronous or asynchronous
        - Individual item failures are logged but don't stop the experiment
        - All executions are automatically traced and visible in Langfuse UI
        - When using Langfuse datasets, results are automatically linked for easy comparison
        - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
        - Async execution is handled automatically with smart event loop detection
    """
    return cast(
        ExperimentResult,
        run_async_safely(
            self._run_experiment_async(
                name=name,
                run_name=self._create_experiment_run_name(
                    name=name, run_name=run_name
                ),
                description=description,
                data=data,
                task=task,
                # None (the default) and an empty list both mean "no evaluators";
                # a fresh list is passed on so no default object is ever shared.
                evaluators=evaluators or [],
                composite_evaluator=composite_evaluator,
                run_evaluators=run_evaluators or [],
                max_concurrency=max_concurrency,
                metadata=metadata,
                dataset_version=_dataset_version,
            ),
        ),
    )
Run an experiment on a dataset with automatic tracing and evaluation.
This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.
The experiment system provides:
- Automatic tracing of all task executions
- Concurrent processing with configurable limits
- Comprehensive error handling that isolates failures
- Integration with Langfuse datasets for experiment tracking
- Flexible evaluation framework supporting both sync and async evaluators
Arguments:
- name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
- run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the `data` contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
- description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
- data: Array of data items to process. Can be either:
- List of dict-like items with 'input', 'expected_output', 'metadata' keys
- List of Langfuse DatasetItem objects from dataset.items
- task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
- evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
- composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
- run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
- max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
- metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:
ExperimentResult containing:
- run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
- item_results: List of results for each processed item with outputs and evaluations
- run_evaluations: List of aggregate evaluation results for the entire run
- experiment_id: Stable identifier for the experiment run across all items
- dataset_run_id: ID of the dataset run (if using Langfuse datasets)
- dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
- ValueError: If required parameters are missing or invalid
- Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:
Basic experiment with local data:
def summarize_text(*, item, **kwargs): return f"Summary: {item['input'][:50]}..." def length_evaluator(*, input, output, expected_output=None, **kwargs): return { "name": "output_length", "value": len(output), "comment": f"Output contains {len(output)} characters" } result = langfuse.run_experiment( name="Text Summarization Test", description="Evaluate summarization quality and length", data=[ {"input": "Long article text...", "expected_output": "Expected summary"}, {"input": "Another article...", "expected_output": "Another summary"} ], task=summarize_text, evaluators=[length_evaluator] ) print(f"Processed {len(result.item_results)} items") for item_result in result.item_results: print(f"Input: {item_result.item['input']}") print(f"Output: {item_result.output}") print(f"Evaluations: {item_result.evaluations}")Advanced experiment with async task and multiple evaluators:
async def llm_task(*, item, **kwargs): # Simulate async LLM call response = await openai_client.chat.completions.create( model="gpt-4", messages=[{"role": "user", "content": item["input"]}] ) return response.choices[0].message.content def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): if expected_output and expected_output.lower() in output.lower(): return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): # Simulate toxicity check toxicity_score = check_toxicity(output) # Your toxicity checker return { "name": "toxicity", "value": toxicity_score, "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" } def average_accuracy(*, item_results, **kwargs): accuracies = [ eval.value for result in item_results for eval in result.evaluations if eval.name == "accuracy" ] return { "name": "average_accuracy", "value": sum(accuracies) / len(accuracies) if accuracies else 0, "comment": f"Average accuracy across {len(accuracies)} items" } result = langfuse.run_experiment( name="LLM Safety and Accuracy Test", description="Evaluate model accuracy and safety across diverse prompts", data=test_dataset, # Your dataset items task=llm_task, evaluators=[accuracy_evaluator, toxicity_evaluator], run_evaluators=[average_accuracy], max_concurrency=5, # Limit concurrent API calls metadata={"model": "gpt-4", "temperature": 0.7} )Using with Langfuse datasets:
```python
# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result['dataset_run_url']}")
```
Note:
- Task and evaluator functions can be either synchronous or asynchronous
- Individual item failures are logged but don't stop the experiment
- All executions are automatically traced and visible in Langfuse UI
- When using Langfuse datasets, results are automatically linked for easy comparison
- This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
- Async execution is handled automatically with smart event loop detection
def run_batched_evaluation(
    self,
    *,
    scope: Literal["traces", "observations"],
    mapper: MapperFunction,
    filter: Optional[str] = None,
    fetch_batch_size: int = 50,
    fetch_trace_fields: Optional[str] = None,
    max_items: Optional[int] = None,
    max_retries: int = 3,
    evaluators: List[EvaluatorFunction],
    composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
    max_concurrency: int = 5,
    metadata: Optional[Dict[str, Any]] = None,
    _add_observation_scores_to_trace: bool = False,
    _additional_trace_tags: Optional[List[str]] = None,
    resume_from: Optional[BatchEvaluationResumeToken] = None,
    verbose: bool = False,
) -> BatchEvaluationResult:
    """Evaluate existing traces or observations in bulk and score them.

    Items are fetched from Langfuse according to `filter`, converted into
    evaluator inputs by `mapper`, run through every evaluator, and the
    resulting scores are linked back to the original entities. Typical uses:

    - Running evaluations on production traces after deployment
    - Backtesting new evaluation metrics on historical data
    - Batch scoring of observations for quality monitoring
    - Periodic evaluation runs on recent data

    Processing follows a streaming/pipeline approach that works through items
    in batches, which keeps memory usage low for large datasets. Error
    handling, retry logic, and a resume capability are included for
    long-running evaluations.

    This method is a synchronous entry point: it creates a
    `BatchEvaluationRunner` for this client and drives its `run_async`
    coroutine through `run_async_safely`.

    Args:
        scope: The type of items to evaluate. Must be one of:
            - "traces": Evaluate complete traces with all their observations
            - "observations": Evaluate individual observations (spans, generations, events)
        mapper: Function that transforms API response objects into evaluator inputs.
            Receives a trace/observation object and returns an EvaluatorInputs
            instance with input, output, expected_output, and metadata fields.
            Can be sync or async.
        filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
            - '{"tags": ["production"]}'
            - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
            Default: None (fetches all items).
        fetch_batch_size: Number of items to fetch per API call and hold in memory.
            Larger values may be faster but use more memory. Default: 50.
        fetch_trace_fields: Comma-separated list of fields to include when fetching traces.
            Available field groups: 'core' (always included), 'io' (input, output, metadata),
            'scores', 'observations', 'metrics'. If not specified, all fields are returned.
            Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields
            return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'.
            Only relevant if scope is 'traces'.
        max_items: Maximum total number of items to process. If None, processes all
            items matching the filter. Useful for testing or limiting evaluation runs.
            Default: None (process all).
        max_retries: Maximum number of retry attempts for failed batch fetches.
            Uses exponential backoff (1s, 2s, 4s). Default: 3.
        evaluators: List of evaluation functions to run on each item. Each evaluator
            receives the mapped inputs and returns Evaluation object(s). Evaluator
            failures are logged but don't stop the batch evaluation.
        composite_evaluator: Optional function that creates a composite score from
            item-level evaluations. Receives the original item and its evaluations,
            returns a single Evaluation. Useful for weighted averages or combined metrics.
            Default: None.
        max_concurrency: Maximum number of items to evaluate concurrently. Controls
            parallelism and resource usage. Default: 5.
        metadata: Optional metadata dict to add to all created scores. Useful for
            tracking evaluation runs, versions, or other context. Default: None.
        _add_observation_scores_to_trace: Internal parameter, forwarded unchanged
            to the runner. Default: False.
        _additional_trace_tags: Internal parameter, forwarded unchanged to the
            runner. Default: None.
        resume_from: Optional resume token from a previous incomplete run. Allows
            continuing evaluation after interruption or failure. Default: None.
        verbose: If True, logs progress information to console. Useful for monitoring
            long-running evaluations. Default: False.

    Returns:
        BatchEvaluationResult containing:
        - total_items_fetched: Number of items fetched from API
        - total_items_processed: Number of items successfully evaluated
        - total_items_failed: Number of items that failed evaluation
        - total_scores_created: Scores created by item-level evaluators
        - total_composite_scores_created: Scores created by composite evaluator
        - total_evaluations_failed: Individual evaluator failures
        - evaluator_stats: Per-evaluator statistics (success rate, scores created)
        - resume_token: Token for resuming if incomplete (None if completed)
        - completed: True if all items processed
        - duration_seconds: Total execution time
        - failed_item_ids: IDs of items that failed
        - error_summary: Error types and counts
        - has_more_items: True if max_items reached but more exist

    Raises:
        ValueError: If invalid scope is provided.

    Examples:
        Basic trace evaluation:
        ```python
        from langfuse import Langfuse, EvaluatorInputs, Evaluation

        client = Langfuse()

        # Define mapper to extract fields from traces
        def trace_mapper(trace):
            return EvaluatorInputs(
                input=trace.input,
                output=trace.output,
                expected_output=None,
                metadata={"trace_id": trace.id}
            )

        # Define evaluator
        def length_evaluator(*, input, output, expected_output, metadata):
            return Evaluation(
                name="output_length",
                value=len(output) if output else 0
            )

        # Run batch evaluation
        result = client.run_batched_evaluation(
            scope="traces",
            mapper=trace_mapper,
            evaluators=[length_evaluator],
            filter='{"tags": ["production"]}',
            max_items=1000,
            verbose=True
        )

        print(f"Processed {result.total_items_processed} traces")
        print(f"Created {result.total_scores_created} scores")
        ```

        Evaluation with composite scorer:
        ```python
        def accuracy_evaluator(*, input, output, expected_output, metadata):
            # ... evaluation logic
            return Evaluation(name="accuracy", value=0.85)

        def relevance_evaluator(*, input, output, expected_output, metadata):
            # ... evaluation logic
            return Evaluation(name="relevance", value=0.92)

        def composite_evaluator(*, item, evaluations):
            # Weighted average of evaluations
            weights = {"accuracy": 0.6, "relevance": 0.4}
            total = sum(
                e.value * weights.get(e.name, 0)
                for e in evaluations
                if isinstance(e.value, (int, float))
            )
            return Evaluation(
                name="composite_score",
                value=total,
                comment=f"Weighted average of {len(evaluations)} metrics"
            )

        result = client.run_batched_evaluation(
            scope="traces",
            mapper=trace_mapper,
            evaluators=[accuracy_evaluator, relevance_evaluator],
            composite_evaluator=composite_evaluator,
            filter='{"user_id": "important_user"}',
            verbose=True
        )
        ```

        Handling incomplete runs with resume:
        ```python
        # Initial run that may fail or timeout
        result = client.run_batched_evaluation(
            scope="observations",
            mapper=obs_mapper,
            evaluators=[my_evaluator],
            max_items=10000,
            verbose=True
        )

        # Check if incomplete
        if not result.completed and result.resume_token:
            print(f"Processed {result.resume_token.items_processed} items before interruption")

            # Resume from where it left off
            result = client.run_batched_evaluation(
                scope="observations",
                mapper=obs_mapper,
                evaluators=[my_evaluator],
                resume_from=result.resume_token,
                verbose=True
            )

        print(f"Total items processed: {result.total_items_processed}")
        ```

        Monitoring evaluator performance:
        ```python
        result = client.run_batched_evaluation(...)

        for stats in result.evaluator_stats:
            success_rate = stats.successful_runs / stats.total_runs
            print(f"{stats.name}:")
            print(f"  Success rate: {success_rate:.1%}")
            print(f"  Scores created: {stats.total_scores_created}")

            if stats.failed_runs > 0:
                print(f"  ⚠️  Failed {stats.failed_runs} times")
        ```

    Note:
        - Evaluator failures are logged but don't stop the batch evaluation
        - Individual item failures are tracked but don't stop processing
        - Fetch failures are retried with exponential backoff
        - All scores are automatically flushed to Langfuse at the end
        - The resume mechanism uses timestamp-based filtering to avoid duplicates
    """
    # Build the coroutine first; it only runs once handed to run_async_safely.
    evaluation_run = BatchEvaluationRunner(self).run_async(
        scope=scope,
        mapper=mapper,
        evaluators=evaluators,
        filter=filter,
        fetch_batch_size=fetch_batch_size,
        fetch_trace_fields=fetch_trace_fields,
        max_items=max_items,
        max_concurrency=max_concurrency,
        composite_evaluator=composite_evaluator,
        metadata=metadata,
        _add_observation_scores_to_trace=_add_observation_scores_to_trace,
        _additional_trace_tags=_additional_trace_tags,
        max_retries=max_retries,
        verbose=verbose,
        resume_from=resume_from,
    )
    outcome = run_async_safely(evaluation_run)

    return cast(BatchEvaluationResult, outcome)
Fetch traces or observations and run evaluations on each item.
This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:
- Running evaluations on production traces after deployment
- Backtesting new evaluation metrics on historical data
- Batch scoring of observations for quality monitoring
- Periodic evaluation runs on recent data
The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.
Arguments:
- scope: The type of items to evaluate. Must be one of:
- "traces": Evaluate complete traces with all their observations
- "observations": Evaluate individual observations (spans, generations, events)
- mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
- evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
- filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
- '{"tags": ["production"]}'
- '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
Default: None (fetches all items).
- fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
- fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
- max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
- max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 5.
- composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
- metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
- max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
- verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
- resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:
BatchEvaluationResult containing:
- total_items_fetched: Number of items fetched from API
- total_items_processed: Number of items successfully evaluated
- total_items_failed: Number of items that failed evaluation
- total_scores_created: Scores created by item-level evaluators
- total_composite_scores_created: Scores created by composite evaluator
- total_evaluations_failed: Individual evaluator failures
- evaluator_stats: Per-evaluator statistics (success rate, scores created)
- resume_token: Token for resuming if incomplete (None if completed)
- completed: True if all items processed
- duration_seconds: Total execution time
- failed_item_ids: IDs of items that failed
- error_summary: Error types and counts
- has_more_items: True if max_items reached but more exist
Raises:
- ValueError: If invalid scope is provided.
Examples:
Basic trace evaluation:
from langfuse import Langfuse, EvaluatorInputs, Evaluation client = Langfuse() # Define mapper to extract fields from traces def trace_mapper(trace): return EvaluatorInputs( input=trace.input, output=trace.output, expected_output=None, metadata={"trace_id": trace.id} ) # Define evaluator def length_evaluator(*, input, output, expected_output, metadata): return Evaluation( name="output_length", value=len(output) if output else 0 ) # Run batch evaluation result = client.run_batched_evaluation( scope="traces", mapper=trace_mapper, evaluators=[length_evaluator], filter='{"tags": ["production"]}', max_items=1000, verbose=True ) print(f"Processed {result.total_items_processed} traces") print(f"Created {result.total_scores_created} scores")Evaluation with composite scorer:
def accuracy_evaluator(*, input, output, expected_output, metadata): # ... evaluation logic return Evaluation(name="accuracy", value=0.85) def relevance_evaluator(*, input, output, expected_output, metadata): # ... evaluation logic return Evaluation(name="relevance", value=0.92) def composite_evaluator(*, item, evaluations): # Weighted average of evaluations weights = {"accuracy": 0.6, "relevance": 0.4} total = sum( e.value * weights.get(e.name, 0) for e in evaluations if isinstance(e.value, (int, float)) ) return Evaluation( name="composite_score", value=total, comment=f"Weighted average of {len(evaluations)} metrics" ) result = client.run_batched_evaluation( scope="traces", mapper=trace_mapper, evaluators=[accuracy_evaluator, relevance_evaluator], composite_evaluator=composite_evaluator, filter='{"user_id": "important_user"}', verbose=True )Handling incomplete runs with resume:
# Initial run that may fail or timeout result = client.run_batched_evaluation( scope="observations", mapper=obs_mapper, evaluators=[my_evaluator], max_items=10000, verbose=True ) # Check if incomplete if not result.completed and result.resume_token: print(f"Processed {result.resume_token.items_processed} items before interruption") # Resume from where it left off result = client.run_batched_evaluation( scope="observations", mapper=obs_mapper, evaluators=[my_evaluator], resume_from=result.resume_token, verbose=True ) print(f"Total items processed: {result.total_items_processed}")Monitoring evaluator performance:
result = client.run_batched_evaluation(...) for stats in result.evaluator_stats: success_rate = stats.successful_runs / stats.total_runs print(f"{stats.name}:") print(f" Success rate: {success_rate:.1%}") print(f" Scores created: {stats.total_scores_created}") if stats.failed_runs > 0: print(f" ⚠️ Failed {stats.failed_runs} times")
Note:
- Evaluator failures are logged but don't stop the batch evaluation
- Individual item failures are tracked but don't stop processing
- Fetch failures are retried with exponential backoff
- All scores are automatically flushed to Langfuse at the end
- The resume mechanism uses timestamp-based filtering to avoid duplicates
def auth_check(self) -> bool:
    """Verify that the configured public/secret key pair can see a project.

    The projects endpoint is queried through ``self.api``; the credentials
    are considered valid when at least one project is returned.

    Returns:
        bool: True when at least one project was found. False when an
            AttributeError occurs during the check, which is logged as the
            client not being properly initialized.

    Raises:
        Exception: If the request succeeds but no project is returned.
        Error: Re-raised after being passed to ``handle_fern_exception``.

    Note:
        This method is blocking. It is discouraged to use it in production code.
    """
    try:
        response = self.api.projects.get()
        project_count = len(response.data)
        langfuse_logger.debug(
            f"Auth check successful, found {project_count} projects"
        )

        if project_count:
            return True

        # The request worked, but the keys are not attached to any project.
        raise Exception("Auth check failed, no project found for the keys provided.")

    except AttributeError as init_error:
        langfuse_logger.warning(
            f"Auth check failed: Client not properly initialized. Error: {init_error}"
        )
        return False

    except Error as api_error:
        handle_fern_exception(api_error)
        raise api_error
Check if the provided credentials (public and secret key) are valid.
Raises:
- Exception: If no projects were found for the provided credentials.
Note:
This method is blocking. It is discouraged to use it in production code.
def create_dataset(
    self,
    *,
    name: str,
    description: Optional[str] = None,
    metadata: Optional[Any] = None,
    input_schema: Optional[Any] = None,
    expected_output_schema: Optional[Any] = None,
) -> Dataset:
    """Create a dataset with the given name on Langfuse.

    Args:
        name: Name of the dataset to create.
        description: Description of the dataset. Defaults to None.
        metadata: Additional metadata. Defaults to None.
        input_schema: JSON Schema for validating dataset item inputs. When set,
            all new items will be validated against this schema.
        expected_output_schema: JSON Schema for validating dataset item expected
            outputs. When set, all new items will be validated against this schema.

    Returns:
        Dataset: The created dataset as returned by the Langfuse API.

    Raises:
        Error: Re-raised after being passed to ``handle_fern_exception``.
    """
    try:
        langfuse_logger.debug(f"Creating datasets {name}")

        # All arguments are forwarded unchanged to the datasets endpoint.
        payload = {
            "name": name,
            "description": description,
            "metadata": metadata,
            "input_schema": input_schema,
            "expected_output_schema": expected_output_schema,
        }
        created = self.api.datasets.create(**payload)
        return cast(Dataset, created)

    except Error as api_error:
        handle_fern_exception(api_error)
        raise api_error
Create a dataset with the given name on Langfuse.
Arguments:
- name: Name of the dataset to create.
- description: Description of the dataset. Defaults to None.
- metadata: Additional metadata. Defaults to None.
- input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
- expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:
Dataset: The created dataset as returned by the Langfuse API.
def create_dataset_item(
    self,
    *,
    dataset_name: str,
    input: Optional[Any] = None,
    expected_output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    source_trace_id: Optional[str] = None,
    source_observation_id: Optional[str] = None,
    status: Optional[DatasetStatus] = None,
    id: Optional[str] = None,
) -> DatasetItem:
    """Create a dataset item.

    Upserts if an item with id already exists.

    Args:
        dataset_name: Name of the dataset in which the dataset item should be created.
        input: Input data. Defaults to None. Can contain any dict, list or scalar.
        expected_output: Expected output data. Defaults to None. Can contain any
            dict, list or scalar.
        metadata: Additional metadata. Defaults to None. Can contain any dict,
            list or scalar.
        source_trace_id: Id of the source trace. Defaults to None.
        source_observation_id: Id of the source observation. Defaults to None.
        status: Status of the dataset item. Defaults to ACTIVE for newly created items.
        id: Id of the dataset item. Defaults to None. Provide your own id if you
            want to dedupe dataset items. Id needs to be globally unique and
            cannot be reused across datasets.

    Returns:
        DatasetItem: The created dataset item as returned by the Langfuse API.

    Raises:
        Error: Re-raised after being passed to ``handle_fern_exception``.

    Example:
        ```python
        from langfuse import Langfuse

        langfuse = Langfuse()

        # Uploading items to the Langfuse dataset named "capital_cities"
        langfuse.create_dataset_item(
            dataset_name="capital_cities",
            input={"input": {"country": "Italy"}},
            expected_output={"expected_output": "Rome"},
            metadata={"foo": "bar"}
        )
        ```
    """
    try:
        langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")

        # All arguments are forwarded unchanged to the dataset-items endpoint.
        payload = {
            "dataset_name": dataset_name,
            "input": input,
            "expected_output": expected_output,
            "metadata": metadata,
            "source_trace_id": source_trace_id,
            "source_observation_id": source_observation_id,
            "status": status,
            "id": id,
        }
        created_item = self.api.dataset_items.create(**payload)
        return cast(DatasetItem, created_item)

    except Error as api_error:
        handle_fern_exception(api_error)
        raise api_error
Create a dataset item.
Upserts if an item with id already exists.
Arguments:
- dataset_name: Name of the dataset in which the dataset item should be created.
- input: Input data. Defaults to None. Can contain any dict, list or scalar.
- expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
- metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
- source_trace_id: Id of the source trace. Defaults to None.
- source_observation_id: Id of the source observation. Defaults to None.
- status: Status of the dataset item. Defaults to ACTIVE for newly created items.
- id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:
DatasetItem: The created dataset item as returned by the Langfuse API.
Example:
from langfuse import Langfuse langfuse = Langfuse() # Uploading items to the Langfuse dataset named "capital_cities" langfuse.create_dataset_item( dataset_name="capital_cities", input={"input": {"country": "Italy"}}, expected_output={"expected_output": "Rome"}, metadata={"foo": "bar"} )
def resolve_media_references(
    self,
    *,
    obj: Any,
    resolve_with: Literal["base64_data_uri"],
    max_depth: int = 10,
    content_fetch_timeout_seconds: int = 5,
) -> Any:
    """Replace media reference strings in an object with base64 data URIs.

    Delegates to ``LangfuseMedia.resolve_media_references``, passing this
    client as ``langfuse_client``. The object is traversed recursively (up to
    max_depth) looking for media reference strings in the format
    "@@@langfuseMedia:...@@@". When one is found, the media content is fetched
    synchronously and the reference string is replaced with a base64 data URI.

    If fetching media content fails for a reference string, a warning is logged
    and the reference string is left unchanged.

    Args:
        obj: The object to process. Can be a primitive value, array, or nested
            object. If the object has a __dict__ attribute, a dict will be
            returned instead of the original object type.
        resolve_with: The representation of the media content to replace the
            media reference string with. Currently only "base64_data_uri" is
            supported.
        max_depth: The maximum depth to traverse the object. Default is 10.
        content_fetch_timeout_seconds: The timeout in seconds for fetching
            media content. Default is 5.

    Returns:
        A deep copy of the input object with all media references replaced with
        base64 data URIs where possible. If the input object has a __dict__
        attribute, a dict will be returned instead of the original object type.

    Example:
        obj = {
            "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
            "nested": {
                "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
            }
        }

        result = langfuse.resolve_media_references(
            obj=obj, resolve_with="base64_data_uri"
        )

        # Result:
        # {
        #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
        #     "nested": {
        #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
        #     }
        # }
    """
    resolved = LangfuseMedia.resolve_media_references(
        langfuse_client=self,
        obj=obj,
        resolve_with=resolve_with,
        max_depth=max_depth,
        content_fetch_timeout_seconds=content_fetch_timeout_seconds,
    )
    return resolved
Replace media reference strings in an object with base64 data URIs.
This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.
If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.
Arguments:
- obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
- resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
- max_depth: int: The maximum depth to traverse the object. Default is 10.
- content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:
A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
Example:
obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }
result = langfuse.resolve_media_references(obj=obj, resolve_with="base64_data_uri")
Result:
{
"image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
"nested": {
"pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
}
}
def get_prompt(
    self,
    name: str,
    *,
    version: Optional[int] = None,
    label: Optional[str] = None,
    type: Literal["chat", "text"] = "text",
    cache_ttl_seconds: Optional[int] = None,
    fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
    max_retries: Optional[int] = None,
    fetch_timeout_seconds: Optional[int] = None,
) -> PromptClient:
    """Get a prompt, preferring the local prompt cache.

    The lookup proceeds in three steps:

    1. If nothing is cached for the requested name/version/label, or caching
       is disabled with ``cache_ttl_seconds=0``, the prompt is fetched and the
       cache is updated. If that fetch fails and a ``fallback`` was given, a
       fallback prompt client is returned instead; otherwise the exception
       is re-raised.
    2. If a cached prompt exists but is expired, a refresh task is registered
       with the prompt cache and the stale prompt is returned immediately.
       If registering the refresh task fails, a warning is logged and the
       stale prompt is still returned.
    3. Otherwise the cached prompt is returned as is.

    Args:
        name (str): The name of the prompt to retrieve. Must not be empty.

    Keyword Args:
        version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
        label (Optional[str]): The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
        cache_ttl_seconds (Optional[int]): Time-to-live in seconds for caching the prompt. Must be specified as a
            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
        type (Literal["chat", "text"]): The type of the prompt to retrieve. Defaults to "text". Also selects which
            client class wraps the fallback prompt.
        fallback (Union[Optional[List[ChatMessageDict]], Optional[str]]): The prompt to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
        max_retries (Optional[int]): The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
        fetch_timeout_seconds (Optional[int]): The timeout for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
            NOTE(review): this was previously documented as milliseconds, which contradicts the parameter name; presumably seconds - confirm.

    Returns:
        The prompt object retrieved from the cache or directly fetched if not cached or expired of type
        - TextPromptClient, if type argument is 'text'.
        - ChatPromptClient, if type argument is 'chat'.

    Raises:
        Error: If the SDK resources are not initialized.
        ValueError: If both version and label are given, or if name is empty.
        Exception: Propagates any exception raised while fetching a prompt that is not cached (or when
            caching is disabled), unless a fallback is available for the requested type.
    """
    if self._resources is None:
        raise Error(
            "SDK is not correctly initialized. Check the init logs for more details."
        )
    if version is not None and label is not None:
        raise ValueError("Cannot specify both version and label at the same time.")

    if not name:
        raise ValueError("Prompt name cannot be empty.")

    cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
    # Default of 2 retries, upper bound of 4.
    bounded_max_retries = self._get_bounded_max_retries(
        max_retries, default_max_retries=2, max_retries_upper_bound=4
    )

    langfuse_logger.debug(f"Getting prompt '{cache_key}'")
    cached_prompt = self._resources.prompt_cache.get(cache_key)

    # Cache miss, or caching explicitly disabled: fetch synchronously.
    if cached_prompt is None or cache_ttl_seconds == 0:
        langfuse_logger.debug(
            f"Prompt '{cache_key}' not found in cache or caching disabled."
        )
        try:
            return self._fetch_prompt_and_update_cache(
                name,
                version=version,
                label=label,
                ttl_seconds=cache_ttl_seconds,
                max_retries=bounded_max_retries,
                fetch_timeout_seconds=fetch_timeout_seconds,
            )
        except Exception as e:
            if fallback:
                langfuse_logger.warning(
                    f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
                )

                # Build a prompt payload from the fallback content; version 0
                # is used when no explicit version was requested.
                fallback_client_args: Dict[str, Any] = {
                    "name": name,
                    "prompt": fallback,
                    "type": type,
                    "version": version or 0,
                    "config": {},
                    "labels": [label] if label else [],
                    "tags": [],
                }

                if type == "text":
                    return TextPromptClient(
                        prompt=Prompt_Text(**fallback_client_args),
                        is_fallback=True,
                    )

                if type == "chat":
                    return ChatPromptClient(
                        prompt=Prompt_Chat(**fallback_client_args),
                        is_fallback=True,
                    )

            # No fallback (or an unrecognized type): surface the fetch error.
            raise e

    if cached_prompt.is_expired():
        langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
        try:
            # refresh prompt in background thread, refresh_prompt deduplicates tasks
            langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")

            def refresh_task() -> None:
                self._fetch_prompt_and_update_cache(
                    name,
                    version=version,
                    label=label,
                    ttl_seconds=cache_ttl_seconds,
                    max_retries=bounded_max_retries,
                    fetch_timeout_seconds=fetch_timeout_seconds,
                )

            self._resources.prompt_cache.add_refresh_prompt_task_if_current(
                cache_key,
                cached_prompt,
                refresh_task,
            )
            langfuse_logger.debug(
                f"Returning stale prompt '{cache_key}' from cache."
            )
            # return stale prompt
            return cached_prompt.value

        except Exception as e:
            langfuse_logger.warning(
                f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
            )
            # creation of refresh prompt task failed, return stale prompt
            return cached_prompt.value

    # Fresh cache hit.
    return cached_prompt.value
Get a prompt.
This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.
Arguments:
- name (str): The name of the prompt to retrieve.
Keyword Args:
- version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
- label (Optional[str]): The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
- cache_ttl_seconds (Optional[int]): Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
- type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
- fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
- max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
- fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
Returns:
The prompt object retrieved from the cache or directly fetched if not cached or expired of type
- TextPromptClient, if type argument is 'text'.
- ChatPromptClient, if type argument is 'chat'.
Raises:
- Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
def create_prompt(
    self,
    *,
    name: str,
    prompt: Union[
        str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
    ],
    labels: List[str] = [],
    tags: Optional[List[str]] = None,
    type: Optional[Literal["chat", "text"]] = "text",
    config: Optional[Any] = None,
    commit_message: Optional[str] = None,
) -> PromptClient:
    """Create a new prompt in Langfuse.

    After the prompt is created, the SDK prompt cache entries for ``name`` are
    invalidated (when the SDK resources are initialized).

    Keyword Args:
        name: The name of the prompt to be created.
        prompt: The content of the prompt to be created. Must be a list of chat
            messages when type is "chat", and a string otherwise.
        labels: The labels of the prompt. Defaults to an empty list. To create a
            default-served prompt, add the 'production' label.
        tags: The tags of the prompt. Defaults to None. Will be applied to all
            versions of the prompt.
        type: The type of the prompt to be created. "chat" vs. "text". Defaults
            to "text". Any value other than "chat" takes the text path.
        config: Additional structured data to be saved with the prompt. Defaults
            to None, in which case an empty dict is sent.
        commit_message: Optional string describing the change.

    Returns:
        TextPromptClient: The prompt if type argument is 'text'.
        ChatPromptClient: The prompt if type argument is 'chat'.

    Raises:
        ValueError: If ``prompt`` does not match the shape required by ``type``.
        Error: Re-raised after being passed to ``handle_fern_exception``.
    """
    try:
        langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")

        is_chat = type == "chat"

        # Validate the prompt shape before anything is sent to the API.
        if is_chat and not isinstance(prompt, list):
            raise ValueError(
                "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
            )
        if not is_chat and not isinstance(prompt, str):
            raise ValueError("For 'text' type, 'prompt' must be a string.")

        if is_chat:
            chat_request = CreateChatPromptRequest(
                name=name,
                prompt=cast(Any, prompt),
                labels=labels,
                tags=tags,
                config=config or {},
                commit_message=commit_message,
                type=CreateChatPromptType.CHAT,
            )
            created = self.api.prompts.create(request=chat_request)
        else:
            text_request = CreateTextPromptRequest(
                name=name,
                prompt=prompt,
                labels=labels,
                tags=tags,
                config=config or {},
                commit_message=commit_message,
            )
            created = self.api.prompts.create(request=text_request)

        # Drop cached versions of this prompt so later reads see the new one.
        if self._resources is not None:
            self._resources.prompt_cache.invalidate(name)

        if is_chat:
            return ChatPromptClient(prompt=cast(Prompt_Chat, created))
        return TextPromptClient(prompt=cast(Prompt_Text, created))

    except Error as api_error:
        handle_fern_exception(api_error)
        raise api_error
Create a new prompt in Langfuse.
Keyword Args:
- name : The name of the prompt to be created.
- prompt : The content of the prompt to be created.
- is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
- labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
- tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
- config: Additional structured data to be saved with the prompt. Defaults to None.
- type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
- commit_message: Optional string describing the change.
Returns:
TextPromptClient: The prompt if type argument is 'text'. ChatPromptClient: The prompt if type argument is 'chat'.
def update_prompt(
    self,
    *,
    name: str,
    version: int,
    new_labels: List[str] = [],
) -> Any:
    """Update an existing prompt version in Langfuse.

    The Langfuse SDK prompt cache is invalidated for all prompts with the
    specified name (when the SDK resources are initialized).

    Args:
        name (str): The name of the prompt to update. It is URL-encoded before
            being sent to the API; the cache is invalidated using the raw name.
        version (int): The version number of the prompt to update.
        new_labels (List[str], optional): New labels to assign to the prompt
            version. Labels are unique across versions. The "latest" label is
            reserved and managed by Langfuse. Defaults to [].

    Returns:
        Prompt: The updated prompt from the Langfuse API.
    """
    update_version = self.api.prompt_version.update
    result = update_version(
        name=self._url_encode(name),
        version=version,
        new_labels=new_labels,
    )

    resources = self._resources
    if resources is not None:
        resources.prompt_cache.invalidate(name)

    return result
Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
Arguments:
- name (str): The name of the prompt to update.
- version (int): The version number of the prompt to update.
- new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:
Prompt: The updated prompt from the Langfuse API.
def clear_prompt_cache(self) -> None:
    """Clear the entire prompt cache, removing all cached prompts.

    This is useful to force a complete refresh of all cached prompts, for
    example after major updates or when the latest versions must be fetched
    from the server. Does nothing when the SDK resources are not initialized.
    """
    resources = self._resources
    if resources is None:
        return

    resources.prompt_cache.clear()
Clear the entire prompt cache, removing all cached prompts.
This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.
def get_client(*, public_key: Optional[str] = None) -> Langfuse:
    """Get or create a Langfuse client instance.

    Returns an existing Langfuse client or creates a new one if none exists. In
    multi-project setups, providing a public_key is required. Multi-project
    support is experimental - see Langfuse docs.

    Resolution order (performed while holding ``LangfuseResourceManager._lock``):
        1. Use the explicit ``public_key``; if none is given, use the key bound
           to the current execution context, if any.
        2. With a key: return a client for the matching registered instance, or
           a disabled client (with a warning) when no such instance exists.
        3. Without a key: create a default client when no instance exists,
           reuse the only instance when exactly one exists, and return a
           disabled client (with a warning) when several exist.

    Args:
        public_key (Optional[str]): Project identifier
            - With key: Returns client for that project
            - Without key: Returns single client or disabled client if multiple exist

    Returns:
        Langfuse: Client instance in one of three states:
            1. Client for specified public_key
            2. Default client for single-project setup
            3. Disabled client when multiple projects exist without key

    Security:
        Disables tracing when multiple projects exist without explicit key to
        prevent cross-project data leakage. Multi-project setups are experimental.

    Example:
        ```python
        # Single project
        client = get_client()  # Default client

        # In multi-project usage:
        client_a = get_client(public_key="project_a_key")  # Returns project A's client
        client_b = get_client(public_key="project_b_key")  # Returns project B's client

        # Without specific key in multi-project setup:
        client = get_client()  # Returns disabled client for safety
        ```
    """
    with LangfuseResourceManager._lock:
        active_instances = LangfuseResourceManager._instances

        # An explicit key wins; otherwise fall back to the execution context.
        public_key = public_key or _current_public_key.get(None)

        if public_key:
            target_instance = active_instances.get(public_key, None)
            if target_instance is not None:
                return _create_client_from_instance(target_instance, public_key)

            # Unknown key: the client was never initialized for this project.
            langfuse_logger.warning(
                f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function."
            )
            return Langfuse(
                tracing_enabled=False, public_key="fake", secret_key="fake"
            )

        instance_count = len(active_instances)

        if instance_count == 0:
            # Nothing initialized yet, so create the default instance.
            return Langfuse()

        if instance_count == 1:
            # Reuse the credentials bound to the single existing instance; this
            # matters when it was instantiated via constructor arguments.
            only_instance = next(iter(active_instances.values()))
            return _create_client_from_instance(only_instance)

        # Several clients and no key: disable tracing rather than guess.
        langfuse_logger.warning(
            "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
        )
        return Langfuse(tracing_enabled=False, public_key="fake", secret_key="fake")
Get or create a Langfuse client instance.
Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
Behavior:
- Single project: Returns existing client or creates new one
- Multi-project: Requires public_key to return specific client
- No public_key in multi-project: Returns disabled client to prevent data leakage
The function uses a singleton pattern per public_key to conserve resources and maintain state.
Arguments:
- public_key (Optional[str]): Project identifier
- With key: Returns client for that project
- Without key: Returns single client or disabled client if multiple exist
Returns:
Langfuse: Client instance in one of three states: 1. Client for specified public_key 2. Default client for single-project setup 3. Disabled client when multiple projects exist without key
Security:
Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.
Example:
# Single project client = get_client() # Default client # In multi-project usage: client_a = get_client(public_key="project_a_key") # Returns project A's client client_b = get_client(public_key="project_b_key") # Returns project B's client # Without specific key in multi-project setup: client = get_client() # Returns disabled client for safety
def observe(
    self,
    func: Optional[F] = None,
    *,
    name: Optional[str] = None,
    as_type: Optional[ObservationTypeLiteralNoEvent] = None,
    capture_input: Optional[bool] = None,
    capture_output: Optional[bool] = None,
    transform_to_string: Optional[Callable[[Iterable], str]] = None,
) -> Union[F, Callable[[F], F]]:
    """Decorate a sync or async function so that its execution is traced by Langfuse.

    The decorator can be applied bare (``@observe``) or called with options
    (``@observe(...)``). Coroutine functions, as detected by
    ``asyncio.iscoroutinefunction``, are wrapped via ``self._async_observe``;
    every other callable is wrapped via ``self._sync_observe``.

    Args:
        func (Optional[Callable]): The function to decorate. ``None`` when the
            decorator is used with parentheses; in that case a decorator is
            returned instead of a wrapped function.
        name (Optional[str]): Custom name for the created trace or span. If not
            provided, the function name is used.
        as_type (Optional[Literal]): Observation type. Supported values:
            "generation", "span", "agent", "tool", "chain", "retriever",
            "embedding", "evaluator", "guardrail". An unsupported value is
            reported with a warning and replaced by "span".
        capture_input (Optional[bool]): Whether to capture the function input.
            When ``None``, the default read from the environment variable named
            by ``LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED`` applies
            (enabled unless that variable is "false" or "0", case-insensitive).
        capture_output (Optional[bool]): Whether to capture the function
            output. Same fallback as ``capture_input``.
        transform_to_string (Optional[Callable[[Iterable], str]]): Forwarded
            unchanged to the underlying wrapper.
            NOTE(review): presumably used to render iterable/streamed outputs
            as a single string - confirm against the wrapper implementations.

    Returns:
        Callable: The wrapped function when ``func`` is given, otherwise a
        decorator that produces the wrapped function.

    Raises:
        Exception: Propagates any exceptions from the wrapped function after
            logging them in the trace.

    Example:
        ```python
        @observe()
        def process_user_request(user_id, query):
            # Traced under the name "process_user_request"
            return get_response(query)

        @observe(name="answer-generation", as_type="generation")
        async def generate_answer(query):
            ...
        ```

    Notes:
        - Nested decorated functions are connected as parent and child spans.
        - Special keyword arguments can be passed at call time to control tracing:
            - langfuse_trace_id: Explicitly set the trace ID for this function call
            - langfuse_parent_observation_id: Explicitly set the parent span ID
            - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
        - Async functions get an async wrapper; sync functions get a sync wrapper.
    """
    # Fall back to a plain span instead of failing on an unknown observation type.
    valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
    if as_type is not None and as_type not in valid_types:
        logger.warning(
            f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'."
        )
        as_type = "span"

    # Environment-wide default for I/O capture; explicit arguments take precedence.
    env_setting = os.environ.get(LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True")
    io_capture_default = env_setting.lower() not in ("false", "0")

    should_capture_input = (
        io_capture_default if capture_input is None else capture_input
    )
    should_capture_output = (
        io_capture_default if capture_output is None else capture_output
    )

    def decorator(func: F) -> F:
        # Pick the wrapper matching the callable's flavour, then apply it once.
        wrap = (
            self._async_observe
            if asyncio.iscoroutinefunction(func)
            else self._sync_observe
        )
        return wrap(
            func,
            name=name,
            as_type=as_type,
            capture_input=should_capture_input,
            capture_output=should_capture_output,
            transform_to_string=transform_to_string,
        )

    # Support both decorator spellings:
    #   @observe    - Python passes the function directly, so wrap it right away.
    #   @observe()  - func is None, so hand back the decorator for Python to apply
    #                 to the function in the next step.
    return decorator if func is None else decorator(func)
Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
Arguments:
- func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
- name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
- as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:
Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
Example:
For general function tracing with automatic naming:
```python
@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)
```

For language model generation tracking:

```python
@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content
```

For trace context propagation between functions:

```python
@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
```
Raises:
- Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
- The decorator preserves the original function's signature, docstring, and return type.
- Proper parent-child relationships between spans are automatically maintained.
- Special keyword arguments can be passed to control tracing:
- langfuse_trace_id: Explicitly set the trace ID for this function call
- langfuse_parent_observation_id: Explicitly set the parent span ID
- langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
- For async functions, the decorator returns an async function wrapper.
- For sync functions, the decorator returns a synchronous wrapper.
def propagate_attributes(
    *,
    user_id: Optional[str] = None,
    session_id: Optional[str] = None,
    metadata: Optional[Dict[str, str]] = None,
    version: Optional[str] = None,
    tags: Optional[List[str]] = None,
    trace_name: Optional[str] = None,
    as_baggage: bool = False,
) -> _AgnosticContextManager[Any]:
    """Propagate trace-level attributes to all spans created within this context.

    The returned context manager sets the attributes on the currently active
    span and on every new child span created while the context is open. It is
    the recommended way to attach trace-level dimensions such as user_id,
    session_id and metadata consistently across a trace.

    **IMPORTANT**: Enter this context as early as possible. Only the currently
    active span and spans created afterwards receive the attributes;
    pre-existing spans are NOT updated retroactively. Langfuse aggregation
    queries (e.g., total cost by user_id, filtering by session_id) only include
    observations that carry the attribute, so late propagation leaves earlier
    spans out of those aggregations.

    Args:
        user_id: User identifier to associate with all spans in this context.
            Must be US-ASCII string, ≤200 characters.
        session_id: Session identifier to associate with all spans in this
            context. Must be US-ASCII string, ≤200 characters. Use it to group
            related traces (e.g., a conversation thread).
        metadata: Additional key-value metadata to propagate to all spans.
            - Keys and values must be US-ASCII strings
            - All values must be ≤200 characters
            - Use for dimensions like internal correlating identifiers
            - AVOID: large payloads, sensitive data, non-string values
              (will be dropped with warning)
        version: Version identifier for parts of your application that are
            independently versioned, e.g. agents.
        tags: List of tags to categorize the group of observations.
        trace_name: Name to assign to the trace. Must be US-ASCII string,
            ≤200 characters.
        as_baggage: If True, propagates attributes using OpenTelemetry baggage
            for cross-process/service propagation. **Security warning**: When
            enabled, attribute values are added to HTTP headers on ALL outbound
            requests. Only enable if the values are safe to transmit via HTTP
            headers and you need cross-service tracing. Default: False.

    Returns:
        Context manager that propagates attributes to all child spans.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="user_workflow") as span:
            with langfuse.propagate_attributes(
                user_id="user_123",
                session_id="session_abc",
                metadata={"experiment": "variant_a", "environment": "production"}
            ):
                # Spans created here carry user_id, session_id and metadata
                with langfuse.start_observation(name="llm_call") as llm_span:
                    ...
        ```

    Note:
        - **Validation**: Invalid attribute values are dropped with a warning
          logged; no exceptions are raised.
        - **OpenTelemetry**: This uses OpenTelemetry context propagation under
          the hood, making it compatible with other OTel-instrumented libraries.
    """
    # Public entry point: forward every argument unchanged to the internal
    # implementation.
    forwarded = dict(
        user_id=user_id,
        session_id=session_id,
        metadata=metadata,
        version=version,
        tags=tags,
        trace_name=trace_name,
        as_baggage=as_baggage,
    )
    return _propagate_attributes(**forwarded)
Propagate trace-level attributes to all spans created within this context.
This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, and metadata dimensions that should be consistently applied across all observations in a trace.
IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.
Why this matters: Langfuse aggregation queries (e.g., total cost by user_id,
filtering by session_id) only include observations that have the attribute set.
If you call propagate_attributes late in your workflow, earlier spans won't be
included in aggregations for that attribute.
Arguments:
- user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
- session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
- metadata: Additional key-value metadata to propagate to all spans.
- Keys and values must be US-ASCII strings
- All values must be ≤200 characters
- Use for dimensions like internal correlating identifiers
- AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
- version: Version identifier for parts of your application that are independently versioned, e.g. agents
- tags: List of tags to categorize the group of observations
- trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
- as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:
Context manager that propagates attributes to all child spans.
Example:
Basic usage with user and session tracking:
```python
from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_observation(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        metadata={"experiment": "variant_a", "environment": "production"}
    ):
        # All spans created here will have user_id, session_id, and metadata
        with langfuse.start_observation(name="llm_call") as llm_span:
            # This span inherits: user_id, session_id, experiment, environment
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...
```

Late propagation (anti-pattern):

```python
with langfuse.start_as_current_observation(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_observation(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_observation(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span
```

Cross-service propagation with baggage (advanced):

```python
# Service A - originating service
with langfuse.start_as_current_observation(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id and session_id are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate to spans in Service B
```
Note:
- Validation: All attribute values (user_id, session_id, metadata values) must be strings ≤200 characters. Invalid values will be dropped with a warning logged. Ensure values meet constraints before calling.
- OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
- No exceptions are raised. Invalid values are logged as warnings and dropped.
class LangfuseSpan(LangfuseObservationWrapper):
    """General-purpose span observation.

    Use this wrapper to trace any operation that has no more specific
    observation type. Construction delegates to ``LangfuseObservationWrapper``
    with the observation type fixed to ``"span"``. If possible, use a more
    specific type for better observability and insights.
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ):
        """Create a span wrapper around an OpenTelemetry span.

        Args:
            otel_span: The OpenTelemetry span to wrap
            langfuse_client: Reference to the parent Langfuse client
            input: Input data for the span (any JSON-serializable object)
            output: Output data from the span (any JSON-serializable object)
            metadata: Additional metadata to associate with the span
            environment: The tracing environment
            release: Release identifier for the application
            version: Version identifier for the code or component
            level: Importance level of the span (info, warning, error)
            status_message: Optional status message for the span
        """
        # The observation type is the only value this subclass pins; everything
        # else is handed to the base wrapper untouched.
        super().__init__(
            as_type="span",
            # wiring
            otel_span=otel_span,
            langfuse_client=langfuse_client,
            # payload
            input=input,
            output=output,
            metadata=metadata,
            # deployment context
            environment=environment,
            release=release,
            version=version,
            # status
            level=level,
            status_message=status_message,
        )
Standard span implementation for general operations in Langfuse.
This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.
def __init__(
    self,
    *,
    otel_span: otel_trace_api.Span,
    langfuse_client: "Langfuse",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    environment: Optional[str] = None,
    release: Optional[str] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
):
    """Create a span wrapper around an OpenTelemetry span.

    Args:
        otel_span: The OpenTelemetry span to wrap
        langfuse_client: Reference to the parent Langfuse client
        input: Input data for the span (any JSON-serializable object)
        output: Output data from the span (any JSON-serializable object)
        metadata: Additional metadata to associate with the span
        environment: The tracing environment
        release: Release identifier for the application
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span
    """
    # The observation type is the only value pinned here; everything else is
    # handed to the base wrapper untouched.
    super().__init__(
        as_type="span",
        # wiring
        otel_span=otel_span,
        langfuse_client=langfuse_client,
        # payload
        input=input,
        output=output,
        metadata=metadata,
        # deployment context
        environment=environment,
        release=release,
        version=version,
        # status
        level=level,
        status_message=status_message,
    )
Initialize a new LangfuseSpan.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the span (any JSON-serializable object)
- output: Output data from the span (any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
class LangfuseGeneration(LangfuseObservationWrapper):
    """Observation wrapper for AI model generations.

    A generation is a span dedicated to tracking AI/LLM operations. On top of
    the common observation fields it accepts model details, token usage and
    cost information. Construction delegates to ``LangfuseObservationWrapper``
    with the observation type fixed to ``"generation"``.
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ):
        """Create a generation wrapper around an OpenTelemetry span.

        Args:
            otel_span: The OpenTelemetry span to wrap
            langfuse_client: Reference to the parent Langfuse client
            input: Input data for the generation (e.g., prompts)
            output: Output from the generation (e.g., completions)
            metadata: Additional metadata to associate with the generation
            environment: The tracing environment
            release: Release identifier for the application
            version: Version identifier for the model or component
            level: Importance level of the generation (info, warning, error)
            status_message: Optional status message for the generation
            completion_start_time: When the model started generating the response
            model: Name/identifier of the AI model used (e.g., "gpt-4")
            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
            cost_details: Cost information for the model call
            prompt: Associated prompt template from Langfuse prompt management
        """
        # Only the observation type is pinned by this subclass; all remaining
        # arguments pass through to the base wrapper unchanged.
        super().__init__(
            as_type="generation",
            # wiring
            otel_span=otel_span,
            langfuse_client=langfuse_client,
            # payload
            input=input,
            output=output,
            metadata=metadata,
            # deployment context
            environment=environment,
            release=release,
            version=version,
            # status
            level=level,
            status_message=status_message,
            # generation-specific details
            completion_start_time=completion_start_time,
            model=model,
            model_parameters=model_parameters,
            usage_details=usage_details,
            cost_details=cost_details,
            prompt=prompt,
        )
Specialized span implementation for AI model generations in Langfuse.
This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.
def __init__(
    self,
    *,
    otel_span: otel_trace_api.Span,
    langfuse_client: "Langfuse",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    environment: Optional[str] = None,
    release: Optional[str] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
):
    """Create a generation wrapper around an OpenTelemetry span.

    Args:
        otel_span: The OpenTelemetry span to wrap
        langfuse_client: Reference to the parent Langfuse client
        input: Input data for the generation (e.g., prompts)
        output: Output from the generation (e.g., completions)
        metadata: Additional metadata to associate with the generation
        environment: The tracing environment
        release: Release identifier for the application
        version: Version identifier for the model or component
        level: Importance level of the generation (info, warning, error)
        status_message: Optional status message for the generation
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management
    """
    # Only the observation type is pinned here; all remaining arguments pass
    # through to the base wrapper unchanged.
    super().__init__(
        as_type="generation",
        # wiring
        otel_span=otel_span,
        langfuse_client=langfuse_client,
        # payload
        input=input,
        output=output,
        metadata=metadata,
        # deployment context
        environment=environment,
        release=release,
        version=version,
        # status
        level=level,
        status_message=status_message,
        # generation-specific details
        completion_start_time=completion_start_time,
        model=model,
        model_parameters=model_parameters,
        usage_details=usage_details,
        cost_details=cost_details,
        prompt=prompt,
    )
Initialize a new LangfuseGeneration span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the generation (e.g., prompts)
- output: Output from the generation (e.g., completions)
- metadata: Additional metadata to associate with the generation
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
class LangfuseEvent(LangfuseObservationWrapper):
    """Observation wrapper for Langfuse events.

    Construction delegates to ``LangfuseObservationWrapper`` with the
    observation type fixed to ``"event"``. Events cannot be updated after
    creation, so ``update`` is overridden to warn and do nothing.
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ):
        """Create an event wrapper around an OpenTelemetry span.

        Args:
            otel_span: The OpenTelemetry span to wrap
            langfuse_client: Reference to the parent Langfuse client
            input: Input data for the event
            output: Output from the event
            metadata: Additional metadata to associate with the event
            environment: The tracing environment
            release: Release identifier for the application
            version: Version identifier for the code or component
            level: Importance level of the event (info, warning, error)
            status_message: Optional status message for the event
        """
        # Only the observation type is pinned by this subclass; all remaining
        # arguments pass through to the base wrapper unchanged.
        super().__init__(
            as_type="event",
            # wiring
            otel_span=otel_span,
            langfuse_client=langfuse_client,
            # payload
            input=input,
            output=output,
            metadata=metadata,
            # deployment context
            environment=environment,
            release=release,
            version=version,
            # status
            level=level,
            status_message=status_message,
        )

    def update(
        self,
        *,
        name: Optional[str] = None,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        **kwargs: Any,
    ) -> "LangfuseEvent":
        """Refuse to update the event, because events cannot be updated.

        Every argument is accepted and ignored; a warning is logged instead of
        applying any change.
        NOTE(review): the signature presumably mirrors the base wrapper's
        ``update`` so generic callers keep working - confirm against the base class.

        Returns:
            self: The unchanged LangfuseEvent instance
        """
        warning_text = "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
        langfuse_logger.warning(warning_text)
        return self
Specialized span implementation for Langfuse Events.
def __init__(
    self,
    *,
    otel_span: otel_trace_api.Span,
    langfuse_client: "Langfuse",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    environment: Optional[str] = None,
    release: Optional[str] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
):
    """Create an event wrapper around an OpenTelemetry span.

    Args:
        otel_span: The OpenTelemetry span to wrap
        langfuse_client: Reference to the parent Langfuse client
        input: Input data for the event
        output: Output from the event
        metadata: Additional metadata to associate with the event
        environment: The tracing environment
        release: Release identifier for the application
        version: Version identifier for the code or component
        level: Importance level of the event (info, warning, error)
        status_message: Optional status message for the event
    """
    # Only the observation type is pinned here; all remaining arguments pass
    # through to the base wrapper unchanged.
    super().__init__(
        as_type="event",
        # wiring
        otel_span=otel_span,
        langfuse_client=langfuse_client,
        # payload
        input=input,
        output=output,
        metadata=metadata,
        # deployment context
        environment=environment,
        release=release,
        version=version,
        # status
        level=level,
        status_message=status_message,
    )
Initialize a new LangfuseEvent span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the event
- output: Output from the event
- metadata: Additional metadata to associate with the event
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the model or component
- level: Importance level of the event (info, warning, error)
- status_message: Optional status message for the event
def update(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
    **kwargs: Any,
) -> "LangfuseEvent":
    """Refuse to update the event, because events cannot be updated.

    Every argument is accepted and ignored; a warning is logged instead of
    applying any change.
    NOTE(review): the signature presumably mirrors the base wrapper's
    ``update`` so generic callers keep working - confirm against the base class.

    Returns:
        self: The unchanged LangfuseEvent instance
    """
    warning_text = "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
    langfuse_logger.warning(warning_text)
    return self
Update is not allowed for LangfuseEvent because events cannot be updated.
This method logs a warning and returns self without making changes.
Returns:
self: Returns the unchanged LangfuseEvent instance
class LangfuseOtelSpanAttributes:
    """Namespace of string constants used as Langfuse span attribute keys.

    The class holds no behavior; each attribute is a plain string, grouped
    below by the entity it describes.
    """

    # --- Trace-level keys ---
    TRACE_NAME = "langfuse.trace.name"
    TRACE_USER_ID = "user.id"
    TRACE_SESSION_ID = "session.id"
    TRACE_TAGS = "langfuse.trace.tags"
    TRACE_PUBLIC = "langfuse.trace.public"
    TRACE_METADATA = "langfuse.trace.metadata"
    TRACE_INPUT = "langfuse.trace.input"
    TRACE_OUTPUT = "langfuse.trace.output"

    # --- Keys shared by every observation ---
    OBSERVATION_TYPE = "langfuse.observation.type"
    OBSERVATION_METADATA = "langfuse.observation.metadata"
    OBSERVATION_LEVEL = "langfuse.observation.level"
    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
    OBSERVATION_INPUT = "langfuse.observation.input"
    OBSERVATION_OUTPUT = "langfuse.observation.output"

    # --- Keys specific to generation observations ---
    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
    OBSERVATION_MODEL = "langfuse.observation.model.name"
    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"

    # --- General keys ---
    ENVIRONMENT = "langfuse.environment"
    RELEASE = "langfuse.release"
    VERSION = "langfuse.version"

    # --- Internal keys ---
    AS_ROOT = "langfuse.internal.as_root"
    IS_APP_ROOT = "langfuse.internal.is_app_root"

    # --- Experiment keys ---
    EXPERIMENT_ID = "langfuse.experiment.id"
    EXPERIMENT_NAME = "langfuse.experiment.name"
    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
class LangfuseAgent(LangfuseObservationWrapper):
    """Observation wrapper for agents: reasoning blocks that act on tools using LLM guidance."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the wrapper, forcing the observation type to ``"agent"``.

        Any ``as_type`` supplied by the caller is overridden; all other
        keyword arguments are passed to the parent initializer untouched.
        """
        forwarded = {**kwargs, "as_type": "agent"}
        super().__init__(**forwarded)
Agent observation for reasoning blocks that act on tools using LLM guidance.
class LangfuseTool(LangfuseObservationWrapper):
    """Observation wrapper for external tool calls, e.g., calling a weather API."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the wrapper, forcing the observation type to ``"tool"``.

        Any ``as_type`` supplied by the caller is overridden; all other
        keyword arguments are passed to the parent initializer untouched.
        """
        forwarded = {**kwargs, "as_type": "tool"}
        super().__init__(**forwarded)
Tool observation representing external tool calls, e.g., calling a weather API.
class LangfuseChain(LangfuseObservationWrapper):
    """Observation wrapper for chains that connect LLM application steps, e.g. passing context from retriever to LLM."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the wrapper, forcing the observation type to ``"chain"``.

        Any ``as_type`` supplied by the caller is overridden; all other
        keyword arguments are passed to the parent initializer untouched.
        """
        forwarded = {**kwargs, "as_type": "chain"}
        super().__init__(**forwarded)
Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.
class LangfuseEmbedding(LangfuseObservationWrapper):
    """Observation wrapper for LLM embedding calls, typically used before retrieval."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the wrapper, forcing the observation type to ``"embedding"``.

        Any ``as_type`` supplied by the caller is overridden; all other
        keyword arguments are passed to the parent initializer untouched.
        """
        forwarded = {**kwargs, "as_type": "embedding"}
        super().__init__(**forwarded)
Embedding observation for LLM embedding calls, typically used before retrieval.
class LangfuseEvaluator(LangfuseObservationWrapper):
    """Observation wrapper for evaluators that assess relevance, correctness, or helpfulness of LLM outputs."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the wrapper, forcing the observation type to ``"evaluator"``.

        Any ``as_type`` supplied by the caller is overridden; all other
        keyword arguments are passed to the parent initializer untouched.
        """
        forwarded = {**kwargs, "as_type": "evaluator"}
        super().__init__(**forwarded)
Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.
class LangfuseRetriever(LangfuseObservationWrapper):
    """Observation wrapper for data retrieval steps, e.g. vector store or database queries."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the wrapper, forcing the observation type to ``"retriever"``.

        Any ``as_type`` supplied by the caller is overridden; all other
        keyword arguments are passed to the parent initializer untouched.
        """
        forwarded = {**kwargs, "as_type": "retriever"}
        super().__init__(**forwarded)
Retriever observation for data retrieval steps, e.g. vector store or database queries.
class LangfuseGuardrail(LangfuseObservationWrapper):
    """Observation wrapper for guardrails, e.g. protection against jailbreaks or offensive content."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the wrapper, forcing the observation type to ``"guardrail"``.

        Any ``as_type`` supplied by the caller is overridden; all other
        keyword arguments are passed to the parent initializer untouched.
        """
        forwarded = {**kwargs, "as_type": "guardrail"}
        super().__init__(**forwarded)
Guardrail observation for protection e.g. against jailbreaks or offensive content.
class Evaluation:
    """Represents an evaluation result for an experiment item or an entire experiment run.

    This class provides a strongly-typed way to create evaluation results in evaluator functions.
    Users must use keyword arguments when instantiating this class.

    Attributes:
        name: Unique identifier for the evaluation metric. Should be descriptive
            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
            Used for aggregation and comparison across experiment runs.
        value: The evaluation score or result. Can be:
            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
            - String: For categorical results like "positive", "negative", "neutral"
            - Boolean: For binary assessments like "passes_safety_check"
        comment: Optional human-readable explanation of the evaluation result.
            Useful for providing context, explaining scoring rationale, or noting
            special conditions. Displayed in Langfuse UI for interpretability.
        metadata: Optional structured metadata about the evaluation process.
            Can include confidence scores, intermediate calculations, model versions,
            or any other relevant technical details.
        data_type: Optional score data type. Required if value is not NUMERIC.
            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
        config_id: Optional Langfuse score config ID.

    Examples:
        Basic accuracy evaluation:
        ```python
        from langfuse import Evaluation

        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
            if not expected_output:
                return Evaluation(name="accuracy", value=0, comment="No expected output")

            is_correct = output.strip().lower() == expected_output.strip().lower()
            return Evaluation(
                name="accuracy",
                value=1.0 if is_correct else 0.0,
                comment="Correct answer" if is_correct else "Incorrect answer"
            )
        ```

        Multi-metric evaluator:
        ```python
        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
            return [
                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
                Evaluation(
                    name="has_greeting",
                    value="hello" in output.lower(),
                    comment="Contains greeting",
                    data_type="BOOLEAN"  # non-numeric value, so data_type is given explicitly
                ),
                Evaluation(
                    name="quality",
                    value=0.85,
                    comment="High quality response",
                    metadata={"confidence": 0.92, "model": "gpt-4"}
                )
            ]
        ```

        Categorical evaluation:
        ```python
        def sentiment_evaluator(*, input, output, **kwargs):
            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
            return Evaluation(
                name="sentiment",
                value=sentiment,
                comment=f"Response expresses {sentiment} sentiment",
                data_type="CATEGORICAL"
            )
        ```

        Failed evaluation with error handling:
        ```python
        def external_api_evaluator(*, input, output, **kwargs):
            try:
                score = external_api.evaluate(output)
                return Evaluation(name="external_score", value=score)
            except Exception as e:
                return Evaluation(
                    name="external_score",
                    value=0,
                    comment=f"API unavailable: {e}",
                    metadata={"error": str(e), "retry_count": 3}
                )
        ```

    Note:
        All arguments must be passed as keywords. Positional arguments are not allowed
        to ensure code clarity and prevent errors from argument reordering.
    """

    def __init__(
        self,
        *,
        name: str,
        value: Union[int, float, str, bool],
        comment: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        data_type: Optional[ExperimentScoreType] = None,
        config_id: Optional[str] = None,
    ):
        """Initialize an Evaluation with the provided data.

        The values are stored as-is on the instance; no validation or
        conversion is performed here.

        Args:
            name: Unique identifier for the evaluation metric.
            value: The evaluation score or result.
            comment: Optional human-readable explanation of the result.
            metadata: Optional structured metadata about the evaluation process.
            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
            config_id: Optional Langfuse score config ID.

        Note:
            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
        """
        self.name = name
        self.value = value
        self.comment = comment
        self.metadata = metadata
        self.data_type = data_type
        self.config_id = config_id
Represents an evaluation result for an experiment item or an entire experiment run.
This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.
Attributes:
- name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
- value: The evaluation score or result. Can be:
- Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
- String: For categorical results like "positive", "negative", "neutral"
- Boolean: For binary assessments like "passes_safety_check"
- comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
- metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
- data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
- config_id: Optional Langfuse score config ID.
Examples:
Basic accuracy evaluation:
from langfuse import Evaluation def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): if not expected_output: return Evaluation(name="accuracy", value=0, comment="No expected output") is_correct = output.strip().lower() == expected_output.strip().lower() return Evaluation( name="accuracy", value=1.0 if is_correct else 0.0, comment="Correct answer" if is_correct else "Incorrect answer" )Multi-metric evaluator:
def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs): return [ Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"), Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"), Evaluation( name="quality", value=0.85, comment="High quality response", metadata={"confidence": 0.92, "model": "gpt-4"} ) ]Categorical evaluation:
def sentiment_evaluator(*, input, output, **kwargs): sentiment = analyze_sentiment(output) # Returns "positive", "negative", or "neutral" return Evaluation( name="sentiment", value=sentiment, comment=f"Response expresses {sentiment} sentiment", data_type="CATEGORICAL" )Failed evaluation with error handling:
def external_api_evaluator(*, input, output, **kwargs): try: score = external_api.evaluate(output) return Evaluation(name="external_score", value=score) except Exception as e: return Evaluation( name="external_score", value=0, comment=f"API unavailable: {e}", metadata={"error": str(e), "retry_count": 3} )
Note:
All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.
def __init__(
    self,
    *,
    name: str,
    value: Union[int, float, str, bool],
    comment: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    data_type: Optional[ExperimentScoreType] = None,
    config_id: Optional[str] = None,
):
    """Store the evaluation fields on the new instance.

    Each argument is copied verbatim to the attribute of the same name;
    nothing is validated or converted.

    Args:
        name: Unique identifier for the evaluation metric.
        value: The evaluation score or result.
        comment: Optional human-readable explanation of the result.
        metadata: Optional structured metadata about the evaluation process.
        data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
        config_id: Optional Langfuse score config ID.

    Note:
        The signature is keyword-only, so positional arguments raise a TypeError.
    """
    # Paired assignments keep the original attribute creation order.
    self.name, self.value = name, value
    self.comment, self.metadata = comment, metadata
    self.data_type, self.config_id = data_type, config_id
Initialize an Evaluation with the provided data.
Arguments:
- name: Unique identifier for the evaluation metric.
- value: The evaluation score or result.
- comment: Optional human-readable explanation of the result.
- metadata: Optional structured metadata about the evaluation process.
- data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
- config_id: Optional Langfuse score config ID.
Note:
All arguments must be provided as keywords. Positional arguments will raise a TypeError.
class EvaluatorInputs:
    """Input data structure for evaluators, returned by mapper functions.

    This class provides a strongly-typed container for transforming API response
    objects (traces, observations) into the standardized format expected
    by evaluator functions. It ensures consistent access to input, output, expected
    output, and metadata regardless of the source entity type.

    Attributes:
        input: The input data that was provided to generate the output being evaluated.
            For traces, this might be the initial prompt or request. For observations,
            this could be the span's input. The exact meaning depends on your use case.
        output: The actual output that was produced and needs to be evaluated.
            For traces, this is typically the final response. For observations,
            this might be the generation output or span result.
        expected_output: Optional ground truth or expected result for comparison.
            Used by evaluators to assess correctness. May be None if no ground truth
            is available for the entity being evaluated.
        metadata: Optional structured metadata providing additional context for evaluation.
            Can include information about the entity, execution context, user attributes,
            or any other relevant data that evaluators might use.

    Examples:
        Simple mapper for traces (the entity arrives as the keyword argument ``item``):
        ```python
        from langfuse import EvaluatorInputs

        def trace_mapper(*, item, **kwargs):
            return EvaluatorInputs(
                input=item.input,
                output=item.output,
                expected_output=None,  # No ground truth available
                metadata={"user_id": item.user_id, "tags": item.tags}
            )
        ```

        Mapper for observations extracting specific fields:
        ```python
        def observation_mapper(*, item, **kwargs):
            # Extract input/output from observation's data
            input_data = item.input if hasattr(item, 'input') else None
            output_data = item.output if hasattr(item, 'output') else None

            return EvaluatorInputs(
                input=input_data,
                output=output_data,
                expected_output=None,
                metadata={
                    "observation_type": item.type,
                    "model": item.model,
                    "latency_ms": item.end_time - item.start_time
                }
            )
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        input: Any,
        output: Any,
        expected_output: Any = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """Initialize EvaluatorInputs with the provided data.

        The values are stored as-is on the instance; nothing is validated
        or copied.

        Args:
            input: The input data for evaluation.
            output: The output data to be evaluated.
            expected_output: Optional ground truth for comparison.
            metadata: Optional additional context for evaluation.

        Note:
            All arguments must be provided as keywords.
        """
        self.input = input
        self.output = output
        self.expected_output = expected_output
        self.metadata = metadata
Input data structure for evaluators, returned by mapper functions.
This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.
Attributes:
- input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
- output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
- expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
- metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:
Simple mapper for traces:
from langfuse import EvaluatorInputs def trace_mapper(trace): return EvaluatorInputs( input=trace.input, output=trace.output, expected_output=None, # No ground truth available metadata={"user_id": trace.user_id, "tags": trace.tags} )Mapper for observations extracting specific fields:
def observation_mapper(observation): # Extract input/output from observation's data input_data = observation.input if hasattr(observation, 'input') else None output_data = observation.output if hasattr(observation, 'output') else None return EvaluatorInputs( input=input_data, output=output_data, expected_output=None, metadata={ "observation_type": observation.type, "model": observation.model, "latency_ms": observation.end_time - observation.start_time } )
Note:
All arguments must be passed as keywords when instantiating this class.
98 def __init__( 99 self, 100 *, 101 input: Any, 102 output: Any, 103 expected_output: Any = None, 104 metadata: Optional[Dict[str, Any]] = None, 105 ): 106 """Initialize EvaluatorInputs with the provided data. 107 108 Args: 109 input: The input data for evaluation. 110 output: The output data to be evaluated. 111 expected_output: Optional ground truth for comparison. 112 metadata: Optional additional context for evaluation. 113 114 Note: 115 All arguments must be provided as keywords. 116 """ 117 self.input = input 118 self.output = output 119 self.expected_output = expected_output 120 self.metadata = metadata
Initialize EvaluatorInputs with the provided data.
Arguments:
- input: The input data for evaluation.
- output: The output data to be evaluated.
- expected_output: Optional ground truth for comparison.
- metadata: Optional additional context for evaluation.
Note:
All arguments must be provided as keywords.
class MapperFunction(Protocol):
    """Protocol defining the interface for mapper functions in batch evaluation.

    Mapper functions transform API response objects (traces or observations)
    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
    allows you to define how to extract and structure evaluation data from different
    entity types.

    Mapper functions:
        - Must accept the entity (trace or observation) as the keyword argument ``item``
        - Must return an EvaluatorInputs instance with input, output, expected_output, metadata
        - Can be either synchronous or asynchronous
        - Should handle missing or malformed data gracefully
    """

    def __call__(
        self,
        *,
        item: Union["TraceWithFullDetails", "ObservationsView"],
        **kwargs: Dict[str, Any],
    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
        """Transform an API response object into evaluator inputs.

        This method defines how to extract evaluation-relevant data from the raw
        API response object. The implementation should map entity-specific fields
        to the standardized input/output/expected_output/metadata structure.

        Args:
            item: The API response object to transform. Keyword-only. The type
                depends on the scope:
                - TraceWithFullDetails: When evaluating traces
                - ObservationsView: When evaluating observations
            **kwargs: Additional keyword arguments. Their contents are not
                specified by this protocol.

        Returns:
            EvaluatorInputs: A structured container with:
                - input: The input data that generated the output
                - output: The output to be evaluated
                - expected_output: Optional ground truth for comparison
                - metadata: Optional additional context

            Can return either a direct EvaluatorInputs instance or an awaitable
            (for async mappers that need to fetch additional data).

        Examples:
            Basic trace mapper:
            ```python
            def map_trace(*, item, **kwargs):
                return EvaluatorInputs(
                    input=item.input,
                    output=item.output,
                    expected_output=None,
                    metadata={"trace_id": item.id, "user": item.user_id}
                )
            ```

            Observation mapper with conditional logic:
            ```python
            def map_observation(*, item, **kwargs):
                # Extract fields based on observation type
                if item.type == "GENERATION":
                    input_data = item.input
                    output_data = item.output
                else:
                    # For other types, use different fields
                    input_data = item.metadata.get("input")
                    output_data = item.metadata.get("output")

                return EvaluatorInputs(
                    input=input_data,
                    output=output_data,
                    expected_output=None,
                    metadata={"obs_id": item.id, "type": item.type}
                )
            ```

            Async mapper (if additional processing needed):
            ```python
            async def map_trace_async(*, item, **kwargs):
                # Could do async processing here if needed
                processed_output = await some_async_transformation(item.output)

                return EvaluatorInputs(
                    input=item.input,
                    output=processed_output,
                    expected_output=None,
                    metadata={"trace_id": item.id}
                )
            ```
        """
        ...
Protocol defining the interface for mapper functions in batch evaluation.
Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.
Mapper functions must:
- Accept a single item parameter (trace, observation)
- Return an EvaluatorInputs instance with input, output, expected_output, metadata
- Can be either synchronous or asynchronous
- Should handle missing or malformed data gracefully
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` that rejects protocols or resolves the real initializer.

    Raises ``TypeError`` when the instance's class has a truthy
    ``_is_protocol``. Otherwise, if the class still has this function as its
    ``__init__``, the MRO is searched for the first different ``__init__``,
    which is installed on the class and then called with the given arguments.

    Raises:
        TypeError: If ``type(self)._is_protocol`` is truthy.
    """
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        # Look only at what each base defines itself; a missing entry is
        # treated the same as the placeholder.
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    # Run the initializer that was just installed on the class.
    cls.__init__(self, *args, **kwargs)
class CompositeEvaluatorFunction(Protocol):
    """Protocol defining the interface for composite evaluator functions.

    Composite evaluators create aggregate scores from multiple item-level evaluations.
    This is commonly used to compute weighted averages, combined metrics, or other
    composite assessments based on individual evaluation results.

    Composite evaluators:
        - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
          plus the list of evaluations
        - Return either a single Evaluation, a list of Evaluations, or a dict
        - Can be either synchronous or asynchronous
        - Have access to both raw item data and evaluation results
    """

    def __call__(
        self,
        *,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        expected_output: Optional[Any] = None,
        metadata: Optional[Dict[str, Any]] = None,
        evaluations: List[Evaluation],
        **kwargs: Dict[str, Any],
    ) -> Union[
        Evaluation,
        List[Evaluation],
        Dict[str, Any],
        Awaitable[Evaluation],
        Awaitable[List[Evaluation]],
        Awaitable[Dict[str, Any]],
    ]:
        r"""Create a composite evaluation from item-level evaluation results.

        This method combines multiple evaluation scores into a single composite metric.
        Common use cases include weighted averages, pass/fail decisions based on multiple
        criteria, or custom scoring logic that considers multiple dimensions.

        Args:
            input: The input data that was provided to the system being evaluated.
            output: The output generated by the system being evaluated.
            expected_output: The expected/reference output for comparison (if available).
            metadata: Additional metadata about the evaluation context.
            evaluations: List of evaluation results from item-level evaluators.
                Each evaluation contains name, value, comment, and metadata.
            **kwargs: Additional keyword arguments. Their contents are not
                specified by this protocol.

        Returns:
            Can return any of:
            - Evaluation: A single composite evaluation result
            - List[Evaluation]: Multiple composite evaluations
            - Dict: A dict that will be converted to an Evaluation
                - name: Identifier for the composite metric (e.g., "composite_score")
                - value: The computed composite value
                - comment: Optional explanation of how the score was computed
                - metadata: Optional details about the composition logic

            Can return either a direct Evaluation instance or an awaitable
            (for async composite evaluators).

        Examples:
            Simple weighted average:
            ```python
            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
                weights = {
                    "accuracy": 0.5,
                    "relevance": 0.3,
                    "safety": 0.2
                }

                total_score = 0.0
                total_weight = 0.0

                for evaluation in evaluations:
                    if evaluation.name in weights and isinstance(evaluation.value, (int, float)):
                        total_score += evaluation.value * weights[evaluation.name]
                        total_weight += weights[evaluation.name]

                final_score = total_score / total_weight if total_weight > 0 else 0.0

                return Evaluation(
                    name="composite_score",
                    value=final_score,
                    comment=f"Weighted average of {len(evaluations)} metrics"
                )
            ```

            Pass/fail composite based on thresholds:
            ```python
            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
                # Must pass all criteria
                thresholds = {
                    "accuracy": 0.7,
                    "safety": 0.9,
                    "relevance": 0.6
                }

                passes = True
                failing_metrics = []

                for metric, threshold in thresholds.items():
                    eval_result = next((e for e in evaluations if e.name == metric), None)
                    if eval_result and isinstance(eval_result.value, (int, float)):
                        if eval_result.value < threshold:
                            passes = False
                            failing_metrics.append(metric)

                return Evaluation(
                    name="passes_all_checks",
                    value=passes,
                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
                    data_type="BOOLEAN"
                )
            ```

            Async composite with external scoring:
            ```python
            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
                # Use LLM to synthesize multiple evaluation results
                eval_summary = "\n".join(
                    f"- {e.name}: {e.value}" for e in evaluations
                )

                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
                prompt += f"For the output: {output}\n"
                prompt += "Provide an overall quality score from 0-1."

                response = await openai.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}]
                )

                score = float(response.choices[0].message.content.strip())

                return Evaluation(
                    name="llm_composite_score",
                    value=score,
                    comment="LLM-synthesized composite score"
                )
            ```

            Context-aware composite:
            ```python
            def context_composite(*, input, output, expected_output, metadata, evaluations):
                # Adjust weighting based on metadata
                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}

                # If metadata indicates high importance, prioritize accuracy
                if metadata and metadata.get('importance') == 'high':
                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
                else:
                    weights = base_weights

                total = sum(
                    e.value * weights.get(e.name, 0)
                    for e in evaluations
                    if isinstance(e.value, (int, float))
                )

                return Evaluation(
                    name="weighted_composite",
                    value=total,
                    comment="Context-aware weighted composite"
                )
            ```
        """
        ...
Protocol defining the interface for composite evaluator functions.
Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.
Composite evaluators:
- Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
- Return either a single Evaluation, a list of Evaluations, or a dict
- Can be either synchronous or asynchronous
- Have access to both raw item data and evaluation results
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` that rejects protocols or resolves the real initializer.

    Raises ``TypeError`` when the instance's class has a truthy
    ``_is_protocol``. Otherwise, if the class still has this function as its
    ``__init__``, the MRO is searched for the first different ``__init__``,
    which is installed on the class and then called with the given arguments.

    Raises:
        TypeError: If ``type(self)._is_protocol`` is truthy.
    """
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        # Look only at what each base defines itself; a missing entry is
        # treated the same as the placeholder.
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    # Run the initializer that was just installed on the class.
    cls.__init__(self, *args, **kwargs)
class EvaluatorStats:
    """Statistics for a single evaluator's performance during batch evaluation.

    This class tracks detailed metrics about how a specific evaluator performed
    across all items in a batch evaluation run. It helps identify evaluator issues,
    understand reliability, and optimize evaluation pipelines.

    Attributes:
        name: The name of the evaluator function (extracted from __name__).
        total_runs: Total number of times the evaluator was invoked.
        successful_runs: Number of times the evaluator completed successfully.
        failed_runs: Number of times the evaluator raised an exception or failed.
        total_scores_created: Total number of evaluation scores created by this evaluator.
            Can be higher than successful_runs if the evaluator returns multiple scores.

    Examples:
        Accessing evaluator stats from batch evaluation result:
        ```python
        result = client.run_batched_evaluation(...)

        for stats in result.evaluator_stats:
            print(f"Evaluator: {stats.name}")
            if stats.total_runs:  # total_runs defaults to 0; avoid dividing by zero
                print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
            print(f"  Scores created: {stats.total_scores_created}")

            if stats.failed_runs > 0:
                print(f"  ⚠️  Failed {stats.failed_runs} times")
        ```

        Identifying problematic evaluators:
        ```python
        result = client.run_batched_evaluation(...)

        # Find evaluators with high failure rates
        for stats in result.evaluator_stats:
            if not stats.total_runs:
                continue  # evaluator never ran; no failure rate to compute
            failure_rate = stats.failed_runs / stats.total_runs
            if failure_rate > 0.1:  # More than 10% failures
                print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
                print(f"    Consider debugging or removing this evaluator")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        name: str,
        total_runs: int = 0,
        successful_runs: int = 0,
        failed_runs: int = 0,
        total_scores_created: int = 0,
    ):
        """Initialize EvaluatorStats with the provided metrics.

        The values are stored as-is; no consistency check between the
        counters is performed here.

        Args:
            name: The evaluator function name.
            total_runs: Total number of evaluator invocations.
            successful_runs: Number of successful completions.
            failed_runs: Number of failures.
            total_scores_created: Total scores created by this evaluator.

        Note:
            All arguments must be provided as keywords.
        """
        self.name = name
        self.total_runs = total_runs
        self.successful_runs = successful_runs
        self.failed_runs = failed_runs
        self.total_scores_created = total_scores_created
Statistics for a single evaluator's performance during batch evaluation.
This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.
Attributes:
- name: The name of the evaluator function (extracted from __name__).
- total_runs: Total number of times the evaluator was invoked.
- successful_runs: Number of times the evaluator completed successfully.
- failed_runs: Number of times the evaluator raised an exception or failed.
- total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:
Accessing evaluator stats from batch evaluation result:
```python
result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    print(f"Evaluator: {stats.name}")
    print(f" Success rate: {stats.successful_runs / stats.total_runs:.1%}")
    print(f" Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f" ⚠️ Failed {stats.failed_runs} times")
```
Identifying problematic evaluators:
```python
result = client.run_batched_evaluation(...)

# Find evaluators with high failure rates
for stats in result.evaluator_stats:
    failure_rate = stats.failed_runs / stats.total_runs
    if failure_rate > 0.1:  # More than 10% failures
        print(f"⚠️ {stats.name} has {failure_rate:.1%} failure rate")
        print(f" Consider debugging or removing this evaluator")
```
Note:
All arguments must be passed as keywords when instantiating this class.
427 def __init__( 428 self, 429 *, 430 name: str, 431 total_runs: int = 0, 432 successful_runs: int = 0, 433 failed_runs: int = 0, 434 total_scores_created: int = 0, 435 ): 436 """Initialize EvaluatorStats with the provided metrics. 437 438 Args: 439 name: The evaluator function name. 440 total_runs: Total number of evaluator invocations. 441 successful_runs: Number of successful completions. 442 failed_runs: Number of failures. 443 total_scores_created: Total scores created by this evaluator. 444 445 Note: 446 All arguments must be provided as keywords. 447 """ 448 self.name = name 449 self.total_runs = total_runs 450 self.successful_runs = successful_runs 451 self.failed_runs = failed_runs 452 self.total_scores_created = total_scores_created
Initialize EvaluatorStats with the provided metrics.
Arguments:
- name: The evaluator function name.
- total_runs: Total number of evaluator invocations.
- successful_runs: Number of successful completions.
- failed_runs: Number of failures.
- total_scores_created: Total scores created by this evaluator.
Note:
All arguments must be provided as keywords.
class BatchEvaluationResumeToken:
    """State needed to continue a batch evaluation that stopped early.

    The token records where an interrupted or failed run left off. Resumption
    is timestamp-based, which avoids re-processing items that were already
    evaluated even if the underlying dataset changed between runs.

    Attributes:
        scope: The type of items being evaluated ("traces", "observations").
        filter: The original JSON filter string used to query items.
        last_processed_timestamp: ISO 8601 timestamp of the last successfully
            processed item. Used to construct a filter that only fetches items
            after this timestamp.
        last_processed_id: The ID of the last successfully processed item,
            for reference.
        items_processed: Count of items successfully processed before
            interruption.

    Examples:
        Persisting a token and resuming later:
        ```python
        import json

        result = client.run_batched_evaluation(
            scope="traces",
            mapper=my_mapper,
            evaluators=[evaluator1, evaluator2],
            filter='{"tags": ["production"]}',
            max_items=10000
        )

        if not result.completed and result.resume_token:
            token = result.resume_token
            with open("resume_token.json", "w") as f:
                json.dump({
                    "scope": token.scope,
                    "filter": token.filter,
                    "last_timestamp": token.last_processed_timestamp,
                    "last_id": token.last_processed_id,
                    "items_done": token.items_processed
                }, f)

        # Later, resume from where it left off
        with open("resume_token.json") as f:
            token_data = json.load(f)

        resume_token = BatchEvaluationResumeToken(
            scope=token_data["scope"],
            filter=token_data["filter"],
            last_processed_timestamp=token_data["last_timestamp"],
            last_processed_id=token_data["last_id"],
            items_processed=token_data["items_done"]
        )

        result = client.run_batched_evaluation(
            scope="traces",
            mapper=my_mapper,
            evaluators=[evaluator1, evaluator2],
            resume_from=resume_token
        )
        print(f"Processed {result.total_items_processed} additional items")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
        The timestamp-based approach means that items created after the initial
        run but before the timestamp will be skipped. This is intentional to
        avoid duplicates and ensure consistent evaluation.
    """

    def __init__(
        self,
        *,
        scope: str,
        filter: Optional[str],
        last_processed_timestamp: str,
        last_processed_id: str,
        items_processed: int,
    ):
        """Store the resume state on the instance.

        Args:
            scope: The scope type ("traces", "observations").
            filter: The original JSON filter string.
            last_processed_timestamp: ISO 8601 timestamp of last processed item.
            last_processed_id: ID of last processed item.
            items_processed: Count of items processed before interruption.

        Note:
            All arguments must be provided as keywords.
        """
        # What was being evaluated.
        self.scope, self.filter = scope, filter
        # Where the run stopped.
        self.last_processed_timestamp, self.last_processed_id = (
            last_processed_timestamp,
            last_processed_id,
        )
        self.items_processed = items_processed
Token for resuming a failed batch evaluation run.
This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.
Attributes:
- scope: The type of items being evaluated ("traces", "observations").
- filter: The original JSON filter string used to query items.
- last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
- last_processed_id: The ID of the last successfully processed item, for reference.
- items_processed: Count of items successfully processed before interruption.
Examples:
Resuming a failed batch evaluation:
```python
# Initial run that fails partway through
try:
    result = client.run_batched_evaluation(
        scope="traces",
        mapper=my_mapper,
        evaluators=[evaluator1, evaluator2],
        filter='{"tags": ["production"]}',
        max_items=10000
    )
except Exception as e:
    print(f"Evaluation failed: {e}")

    # Save the resume token
    if result.resume_token:
        # Store resume token for later (e.g., in a file or database)
        import json
        with open("resume_token.json", "w") as f:
            json.dump({
                "scope": result.resume_token.scope,
                "filter": result.resume_token.filter,
                "last_timestamp": result.resume_token.last_processed_timestamp,
                "last_id": result.resume_token.last_processed_id,
                "items_done": result.resume_token.items_processed
            }, f)

# Later, resume from where it left off
with open("resume_token.json") as f:
    token_data = json.load(f)

resume_token = BatchEvaluationResumeToken(
    scope=token_data["scope"],
    filter=token_data["filter"],
    last_processed_timestamp=token_data["last_timestamp"],
    last_processed_id=token_data["last_id"],
    items_processed=token_data["items_done"]
)

# Resume the evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    resume_from=resume_token
)

print(f"Processed {result.total_items_processed} additional items")
```
Handling partial completion:
```python
result = client.run_batched_evaluation(...)

if not result.completed:
    print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
    print(f"Last item: {result.resume_token.last_processed_id}")
    print(f"Resume from: {result.resume_token.last_processed_timestamp}")

    # Optionally retry automatically
    if result.resume_token:
        print("Retrying...")
        result = client.run_batched_evaluation(
            scope=result.resume_token.scope,
            mapper=my_mapper,
            evaluators=my_evaluators,
            resume_from=result.resume_token
        )
```
Note:
All arguments must be passed as keywords when instantiating this class. The timestamp-based approach means that items created after the initial run but before the timestamp will be skipped. This is intentional to avoid duplicates and ensure consistent evaluation.
549 def __init__( 550 self, 551 *, 552 scope: str, 553 filter: Optional[str], 554 last_processed_timestamp: str, 555 last_processed_id: str, 556 items_processed: int, 557 ): 558 """Initialize BatchEvaluationResumeToken with the provided state. 559 560 Args: 561 scope: The scope type ("traces", "observations"). 562 filter: The original JSON filter string. 563 last_processed_timestamp: ISO 8601 timestamp of last processed item. 564 last_processed_id: ID of last processed item. 565 items_processed: Count of items processed before interruption. 566 567 Note: 568 All arguments must be provided as keywords. 569 """ 570 self.scope = scope 571 self.filter = filter 572 self.last_processed_timestamp = last_processed_timestamp 573 self.last_processed_id = last_processed_id 574 self.items_processed = items_processed
Initialize BatchEvaluationResumeToken with the provided state.
Arguments:
- scope: The scope type ("traces", "observations").
- filter: The original JSON filter string.
- last_processed_timestamp: ISO 8601 timestamp of last processed item.
- last_processed_id: ID of last processed item.
- items_processed: Count of items processed before interruption.
Note:
All arguments must be provided as keywords.
577class BatchEvaluationResult: 578 r"""Complete result structure for batch evaluation execution. 579 580 This class encapsulates comprehensive statistics and metadata about a batch 581 evaluation run, including counts, evaluator-specific metrics, timing information, 582 error details, and resume capability. 583 584 Attributes: 585 total_items_fetched: Total number of items fetched from the API. 586 total_items_processed: Number of items successfully evaluated. 587 total_items_failed: Number of items that failed during evaluation. 588 total_scores_created: Total scores created by all item-level evaluators. 589 total_composite_scores_created: Scores created by the composite evaluator. 590 total_evaluations_failed: Number of individual evaluator failures across all items. 591 evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created). 592 resume_token: Token for resuming if evaluation was interrupted (None if completed). 593 completed: True if all items were processed, False if stopped early or failed. 594 duration_seconds: Total time taken to execute the batch evaluation. 595 failed_item_ids: List of IDs for items that failed evaluation. 596 error_summary: Dictionary mapping error types to occurrence counts. 597 has_more_items: True if max_items limit was reached but more items exist. 598 item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite). 599 600 Examples: 601 Basic result inspection: 602 ```python 603 result = client.run_batched_evaluation(...) 604 605 print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}") 606 print(f"Scores created: {result.total_scores_created}") 607 print(f"Duration: {result.duration_seconds:.2f}s") 608 print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}") 609 ``` 610 611 Detailed analysis with evaluator stats: 612 ```python 613 result = client.run_batched_evaluation(...) 
614 615 print(f"\n📊 Batch Evaluation Results") 616 print(f"{'='*50}") 617 print(f"Items processed: {result.total_items_processed}") 618 print(f"Items failed: {result.total_items_failed}") 619 print(f"Scores created: {result.total_scores_created}") 620 621 if result.total_composite_scores_created > 0: 622 print(f"Composite scores: {result.total_composite_scores_created}") 623 624 print(f"\n📈 Evaluator Performance:") 625 for stats in result.evaluator_stats: 626 success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0 627 print(f"\n {stats.name}:") 628 print(f" Success rate: {success_rate:.1%}") 629 print(f" Scores created: {stats.total_scores_created}") 630 if stats.failed_runs > 0: 631 print(f" ⚠️ Failures: {stats.failed_runs}") 632 633 if result.error_summary: 634 print(f"\n⚠️ Errors encountered:") 635 for error_type, count in result.error_summary.items(): 636 print(f" {error_type}: {count}") 637 ``` 638 639 Handling incomplete runs: 640 ```python 641 result = client.run_batched_evaluation(...) 642 643 if not result.completed: 644 print("⚠️ Evaluation incomplete!") 645 646 if result.resume_token: 647 print(f"Processed {result.resume_token.items_processed} items before failure") 648 print(f"Use resume_from parameter to continue from:") 649 print(f" Timestamp: {result.resume_token.last_processed_timestamp}") 650 print(f" Last ID: {result.resume_token.last_processed_id}") 651 652 if result.has_more_items: 653 print(f"ℹ️ More items available beyond max_items limit") 654 ``` 655 656 Performance monitoring: 657 ```python 658 result = client.run_batched_evaluation(...) 
659 660 items_per_second = result.total_items_processed / result.duration_seconds 661 avg_scores_per_item = result.total_scores_created / result.total_items_processed 662 663 print(f"Performance metrics:") 664 print(f" Throughput: {items_per_second:.2f} items/second") 665 print(f" Avg scores/item: {avg_scores_per_item:.2f}") 666 print(f" Total duration: {result.duration_seconds:.2f}s") 667 668 if result.total_evaluations_failed > 0: 669 failure_rate = result.total_evaluations_failed / ( 670 result.total_items_processed * len(result.evaluator_stats) 671 ) 672 print(f" Evaluation failure rate: {failure_rate:.1%}") 673 ``` 674 675 Note: 676 All arguments must be passed as keywords when instantiating this class. 677 """ 678 679 def __init__( 680 self, 681 *, 682 total_items_fetched: int, 683 total_items_processed: int, 684 total_items_failed: int, 685 total_scores_created: int, 686 total_composite_scores_created: int, 687 total_evaluations_failed: int, 688 evaluator_stats: List[EvaluatorStats], 689 resume_token: Optional[BatchEvaluationResumeToken], 690 completed: bool, 691 duration_seconds: float, 692 failed_item_ids: List[str], 693 error_summary: Dict[str, int], 694 has_more_items: bool, 695 item_evaluations: Dict[str, List["Evaluation"]], 696 ): 697 """Initialize BatchEvaluationResult with comprehensive statistics. 698 699 Args: 700 total_items_fetched: Total items fetched from API. 701 total_items_processed: Items successfully evaluated. 702 total_items_failed: Items that failed evaluation. 703 total_scores_created: Scores from item-level evaluators. 704 total_composite_scores_created: Scores from composite evaluator. 705 total_evaluations_failed: Individual evaluator failures. 706 evaluator_stats: Per-evaluator statistics. 707 resume_token: Token for resuming (None if completed). 708 completed: Whether all items were processed. 709 duration_seconds: Total execution time. 710 failed_item_ids: IDs of failed items. 711 error_summary: Error types and counts. 
712 has_more_items: Whether more items exist beyond max_items. 713 item_evaluations: Dictionary mapping item IDs to their evaluation results. 714 715 Note: 716 All arguments must be provided as keywords. 717 """ 718 self.total_items_fetched = total_items_fetched 719 self.total_items_processed = total_items_processed 720 self.total_items_failed = total_items_failed 721 self.total_scores_created = total_scores_created 722 self.total_composite_scores_created = total_composite_scores_created 723 self.total_evaluations_failed = total_evaluations_failed 724 self.evaluator_stats = evaluator_stats 725 self.resume_token = resume_token 726 self.completed = completed 727 self.duration_seconds = duration_seconds 728 self.failed_item_ids = failed_item_ids 729 self.error_summary = error_summary 730 self.has_more_items = has_more_items 731 self.item_evaluations = item_evaluations 732 733 def __str__(self) -> str: 734 """Return a formatted string representation of the batch evaluation results. 735 736 Returns: 737 A multi-line string with a summary of the evaluation results. 
738 """ 739 lines = [] 740 lines.append("=" * 60) 741 lines.append("Batch Evaluation Results") 742 lines.append("=" * 60) 743 744 # Summary statistics 745 lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}") 746 lines.append(f"Duration: {self.duration_seconds:.2f}s") 747 lines.append(f"\nItems fetched: {self.total_items_fetched}") 748 lines.append(f"Items processed: {self.total_items_processed}") 749 750 if self.total_items_failed > 0: 751 lines.append(f"Items failed: {self.total_items_failed}") 752 753 # Success rate 754 if self.total_items_fetched > 0: 755 success_rate = self.total_items_processed / self.total_items_fetched * 100 756 lines.append(f"Success rate: {success_rate:.1f}%") 757 758 # Scores created 759 lines.append(f"\nScores created: {self.total_scores_created}") 760 if self.total_composite_scores_created > 0: 761 lines.append(f"Composite scores: {self.total_composite_scores_created}") 762 763 total_scores = self.total_scores_created + self.total_composite_scores_created 764 lines.append(f"Total scores: {total_scores}") 765 766 # Evaluator statistics 767 if self.evaluator_stats: 768 lines.append("\nEvaluator Performance:") 769 for stats in self.evaluator_stats: 770 lines.append(f" {stats.name}:") 771 if stats.total_runs > 0: 772 success_rate = ( 773 stats.successful_runs / stats.total_runs * 100 774 if stats.total_runs > 0 775 else 0 776 ) 777 lines.append( 778 f" Runs: {stats.successful_runs}/{stats.total_runs} " 779 f"({success_rate:.1f}% success)" 780 ) 781 lines.append(f" Scores created: {stats.total_scores_created}") 782 if stats.failed_runs > 0: 783 lines.append(f" Failed runs: {stats.failed_runs}") 784 785 # Performance metrics 786 if self.total_items_processed > 0 and self.duration_seconds > 0: 787 items_per_sec = self.total_items_processed / self.duration_seconds 788 lines.append("\nPerformance:") 789 lines.append(f" Throughput: {items_per_sec:.2f} items/second") 790 if self.total_scores_created > 0: 791 avg_scores = 
self.total_scores_created / self.total_items_processed 792 lines.append(f" Avg scores per item: {avg_scores:.2f}") 793 794 # Errors and warnings 795 if self.error_summary: 796 lines.append("\nErrors encountered:") 797 for error_type, count in self.error_summary.items(): 798 lines.append(f" {error_type}: {count}") 799 800 # Incomplete run information 801 if not self.completed: 802 lines.append("\nWarning: Evaluation incomplete") 803 if self.resume_token: 804 lines.append( 805 f" Last processed: {self.resume_token.last_processed_timestamp}" 806 ) 807 lines.append(f" Items processed: {self.resume_token.items_processed}") 808 lines.append(" Use resume_from parameter to continue") 809 810 if self.has_more_items: 811 lines.append("\nNote: More items available beyond max_items limit") 812 813 lines.append("=" * 60) 814 return "\n".join(lines)
Complete result structure for batch evaluation execution.
This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.
Attributes:
- total_items_fetched: Total number of items fetched from the API.
- total_items_processed: Number of items successfully evaluated.
- total_items_failed: Number of items that failed during evaluation.
- total_scores_created: Total scores created by all item-level evaluators.
- total_composite_scores_created: Scores created by the composite evaluator.
- total_evaluations_failed: Number of individual evaluator failures across all items.
- evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
- resume_token: Token for resuming if evaluation was interrupted (None if completed).
- completed: True if all items were processed, False if stopped early or failed.
- duration_seconds: Total time taken to execute the batch evaluation.
- failed_item_ids: List of IDs for items that failed evaluation.
- error_summary: Dictionary mapping error types to occurrence counts.
- has_more_items: True if max_items limit was reached but more items exist.
- item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:
Basic result inspection:
```python
result = client.run_batched_evaluation(...)

print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
print(f"Scores created: {result.total_scores_created}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
```
Detailed analysis with evaluator stats:
```python
result = client.run_batched_evaluation(...)

print(f"\n📊 Batch Evaluation Results")
print(f"{'='*50}")
print(f"Items processed: {result.total_items_processed}")
print(f"Items failed: {result.total_items_failed}")
print(f"Scores created: {result.total_scores_created}")

if result.total_composite_scores_created > 0:
    print(f"Composite scores: {result.total_composite_scores_created}")

print(f"\n📈 Evaluator Performance:")
for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
    print(f"\n {stats.name}:")
    print(f" Success rate: {success_rate:.1%}")
    print(f" Scores created: {stats.total_scores_created}")
    if stats.failed_runs > 0:
        print(f" ⚠️ Failures: {stats.failed_runs}")

if result.error_summary:
    print(f"\n⚠️ Errors encountered:")
    for error_type, count in result.error_summary.items():
        print(f" {error_type}: {count}")
```
Handling incomplete runs:
```python
result = client.run_batched_evaluation(...)

if not result.completed:
    print("⚠️ Evaluation incomplete!")

    if result.resume_token:
        print(f"Processed {result.resume_token.items_processed} items before failure")
        print(f"Use resume_from parameter to continue from:")
        print(f" Timestamp: {result.resume_token.last_processed_timestamp}")
        print(f" Last ID: {result.resume_token.last_processed_id}")

if result.has_more_items:
    print(f"ℹ️ More items available beyond max_items limit")
```
Performance monitoring:
```python
result = client.run_batched_evaluation(...)

items_per_second = result.total_items_processed / result.duration_seconds
avg_scores_per_item = result.total_scores_created / result.total_items_processed

print(f"Performance metrics:")
print(f" Throughput: {items_per_second:.2f} items/second")
print(f" Avg scores/item: {avg_scores_per_item:.2f}")
print(f" Total duration: {result.duration_seconds:.2f}s")

if result.total_evaluations_failed > 0:
    failure_rate = result.total_evaluations_failed / (
        result.total_items_processed * len(result.evaluator_stats)
    )
    print(f" Evaluation failure rate: {failure_rate:.1%}")
```
Note:
All arguments must be passed as keywords when instantiating this class.
679 def __init__( 680 self, 681 *, 682 total_items_fetched: int, 683 total_items_processed: int, 684 total_items_failed: int, 685 total_scores_created: int, 686 total_composite_scores_created: int, 687 total_evaluations_failed: int, 688 evaluator_stats: List[EvaluatorStats], 689 resume_token: Optional[BatchEvaluationResumeToken], 690 completed: bool, 691 duration_seconds: float, 692 failed_item_ids: List[str], 693 error_summary: Dict[str, int], 694 has_more_items: bool, 695 item_evaluations: Dict[str, List["Evaluation"]], 696 ): 697 """Initialize BatchEvaluationResult with comprehensive statistics. 698 699 Args: 700 total_items_fetched: Total items fetched from API. 701 total_items_processed: Items successfully evaluated. 702 total_items_failed: Items that failed evaluation. 703 total_scores_created: Scores from item-level evaluators. 704 total_composite_scores_created: Scores from composite evaluator. 705 total_evaluations_failed: Individual evaluator failures. 706 evaluator_stats: Per-evaluator statistics. 707 resume_token: Token for resuming (None if completed). 708 completed: Whether all items were processed. 709 duration_seconds: Total execution time. 710 failed_item_ids: IDs of failed items. 711 error_summary: Error types and counts. 712 has_more_items: Whether more items exist beyond max_items. 713 item_evaluations: Dictionary mapping item IDs to their evaluation results. 714 715 Note: 716 All arguments must be provided as keywords. 
717 """ 718 self.total_items_fetched = total_items_fetched 719 self.total_items_processed = total_items_processed 720 self.total_items_failed = total_items_failed 721 self.total_scores_created = total_scores_created 722 self.total_composite_scores_created = total_composite_scores_created 723 self.total_evaluations_failed = total_evaluations_failed 724 self.evaluator_stats = evaluator_stats 725 self.resume_token = resume_token 726 self.completed = completed 727 self.duration_seconds = duration_seconds 728 self.failed_item_ids = failed_item_ids 729 self.error_summary = error_summary 730 self.has_more_items = has_more_items 731 self.item_evaluations = item_evaluations
Initialize BatchEvaluationResult with comprehensive statistics.
Arguments:
- total_items_fetched: Total items fetched from API.
- total_items_processed: Items successfully evaluated.
- total_items_failed: Items that failed evaluation.
- total_scores_created: Scores from item-level evaluators.
- total_composite_scores_created: Scores from composite evaluator.
- total_evaluations_failed: Individual evaluator failures.
- evaluator_stats: Per-evaluator statistics.
- resume_token: Token for resuming (None if completed).
- completed: Whether all items were processed.
- duration_seconds: Total execution time.
- failed_item_ids: IDs of failed items.
- error_summary: Error types and counts.
- has_more_items: Whether more items exist beyond max_items.
- item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:
All arguments must be provided as keywords.
class RunnerContext:
    """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action builds a
    ``RunnerContext`` before invoking the user's ``experiment(context)``
    function. Defaults set here (dataset, metadata tags) are applied when
    the user omits them on the :meth:`run_experiment` call; users can
    override any default by passing the corresponding argument explicitly.
    """

    def __init__(
        self,
        *,
        client: "Langfuse",
        data: Optional[ExperimentData] = None,
        dataset_version: Optional[datetime] = None,
        metadata: Optional[Dict[str, str]] = None,
    ):
        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.

        Typically called by the ``langfuse/experiment-action`` GitHub Action,
        not by end users directly. Every field except ``client`` is optional:
        fields left as ``None`` simply mean the corresponding argument must be
        supplied on the :meth:`run_experiment` call.

        Args:
            client: Initialized Langfuse SDK client used to execute the
                experiment. The action creates this from the
                ``langfuse_public_key`` / ``langfuse_secret_key`` /
                ``langfuse_base_url`` inputs.
            data: Default dataset items to run the experiment on. Accepts
                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
                Injected by the action when ``dataset_name`` is configured.
                If ``None``, the user must pass ``data=`` to
                :meth:`run_experiment`.
            dataset_version: Optional pinned dataset version. Injected by the
                action when ``dataset_version`` is configured.
            metadata: Default metadata attached to every experiment trace and
                the dataset run. The action injects GitHub-sourced tags (SHA,
                PR link, workflow run link, branch, GH user, etc.). Merged
                with any ``metadata`` passed to :meth:`run_experiment`, with
                user-supplied keys winning on collision.
        """
        self.client = client
        self.data = data
        self.dataset_version = dataset_version
        self.metadata = metadata

    def run_experiment(
        self,
        *,
        name: str,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
        data: Optional[ExperimentData] = None,
        task: TaskFunction,
        evaluators: List[EvaluatorFunction] = [],
        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
        run_evaluators: List[RunEvaluatorFunction] = [],
        max_concurrency: int = 50,
        metadata: Optional[Dict[str, str]] = None,
        _dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Run an experiment through ``self.client`` using the context defaults.

        Arguments given here take precedence over the values stored on the
        context; everything else is forwarded to the client unchanged.

        Args:
            name: Forwarded to the client's ``run_experiment``.
            run_name: Forwarded to the client's ``run_experiment``.
            description: Forwarded to the client's ``run_experiment``.
            data: Items to run on. Falls back to ``self.data`` when ``None``.
            task: Forwarded to the client's ``run_experiment``.
            evaluators: Item-level evaluators; a copy of the list is forwarded.
            composite_evaluator: Forwarded to the client's ``run_experiment``.
            run_evaluators: Run-level evaluators; a copy of the list is
                forwarded.
            max_concurrency: Forwarded to the client's ``run_experiment``.
            metadata: Merged over ``self.metadata``; keys given here win on
                collision. ``None`` is forwarded only when both are ``None``.
            _dataset_version: Falls back to ``self.dataset_version`` when
                ``None``.

        Returns:
            Whatever the client's ``run_experiment`` returns.

        Raises:
            ValueError: If neither this call nor the context provides ``data``.
        """
        resolved_data = data if data is not None else self.data
        if resolved_data is None:
            raise ValueError(
                "`data` must be provided either on the RunnerContext or the run_experiment call"
            )

        resolved_dataset_version = (
            _dataset_version if _dataset_version is not None else self.dataset_version
        )

        merged_metadata: Optional[Dict[str, str]]
        if self.metadata is None and metadata is None:
            merged_metadata = None
        else:
            # Later unpacking wins, so call-level keys override context keys.
            merged_metadata = {**(self.metadata or {}), **(metadata or {})}

        return self.client.run_experiment(
            name=name,
            run_name=run_name,
            description=description,
            data=resolved_data,
            task=task,
            # The `[]` defaults are shared across calls; forward copies so
            # that nothing downstream can mutate them (or the caller's lists).
            evaluators=list(evaluators),
            composite_evaluator=composite_evaluator,
            run_evaluators=list(run_evaluators),
            max_concurrency=max_concurrency,
            metadata=merged_metadata,
            _dataset_version=resolved_dataset_version,
        )
Wraps Langfuse.run_experiment() with CI-injected defaults.
Intended for use with the langfuse/experiment-action GitHub Action
(https://github.com/langfuse/experiment-action). The action builds a
RunnerContext before invoking the user's experiment(context)
function. Defaults set here (dataset, metadata tags) are applied when
the user omits them on the run_experiment() call; users can
override any default by passing the corresponding argument explicitly.
def __init__(
    self,
    *,
    client: "Langfuse",
    data: Optional[ExperimentData] = None,
    dataset_version: Optional[datetime] = None,
    metadata: Optional[Dict[str, str]] = None,
):
    """Store the defaults that ``run_experiment`` falls back to.

    Normally constructed by the ``langfuse/experiment-action`` GitHub
    Action rather than by end users. Only ``client`` is required; any
    other field left as ``None`` has to be supplied on the
    :meth:`run_experiment` call instead.

    Args:
        client: Initialized Langfuse SDK client used to execute the
            experiment. The action creates this from the
            ``langfuse_public_key`` / ``langfuse_secret_key`` /
            ``langfuse_base_url`` inputs.
        data: Default dataset items to run the experiment on. Accepts
            either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
            Injected by the action when ``dataset_name`` is configured.
            If ``None``, the user must pass ``data=`` to
            :meth:`run_experiment`.
        dataset_version: Optional pinned dataset version. Injected by the
            action when ``dataset_version`` is configured.
        metadata: Default metadata attached to every experiment trace and
            the dataset run. The action injects GitHub-sourced tags (SHA,
            PR link, workflow run link, branch, GH user, etc.). Merged
            with any ``metadata`` passed to :meth:`run_experiment`, with
            user-supplied keys winning on collision.
    """
    # Values are stored as given; no copying or validation happens here.
    self.client, self.data = client, data
    self.dataset_version, self.metadata = dataset_version, metadata
Build a RunnerContext populated with defaults for run_experiment.
Typically called by the langfuse/experiment-action GitHub Action,
not by end users directly. Every field except client is optional:
fields left as None simply mean the corresponding argument must be
supplied on the run_experiment() call.
Arguments:
- client: Initialized Langfuse SDK client used to execute the experiment. The action creates this from the langfuse_public_key / langfuse_secret_key / langfuse_base_url inputs.
- data: Default dataset items to run the experiment on. Accepts either List[LocalExperimentItem] or List[DatasetItem]. Injected by the action when dataset_name is configured. If None, the user must pass data= to run_experiment().
- dataset_version: Optional pinned dataset version. Injected by the action when dataset_version is configured.
- metadata: Default metadata attached to every experiment trace and the dataset run. The action injects GitHub-sourced tags (SHA, PR link, workflow run link, branch, GH user, etc.). Merged with any metadata passed to run_experiment(), with user-supplied keys winning on collision.
def run_experiment(
    self,
    *,
    name: str,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    data: Optional[ExperimentData] = None,
    task: TaskFunction,
    evaluators: Optional[List[EvaluatorFunction]] = None,
    composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
    run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
    max_concurrency: int = 50,
    metadata: Optional[Dict[str, str]] = None,
    _dataset_version: Optional[datetime] = None,
) -> ExperimentResult:
    """Resolve context defaults and delegate to ``self.client.run_experiment``.

    Args:
        name: Forwarded unchanged to the client.
        run_name: Forwarded unchanged to the client.
        description: Forwarded unchanged to the client.
        data: Items to run on. Falls back to ``self.data`` when ``None``.
        task: Forwarded unchanged to the client.
        evaluators: Forwarded to the client; ``None`` (the default) is sent
            as a fresh empty list.
        composite_evaluator: Forwarded unchanged to the client.
        run_evaluators: Forwarded to the client; ``None`` (the default) is
            sent as a fresh empty list.
        max_concurrency: Forwarded unchanged to the client.
        metadata: Merged on top of ``self.metadata``; keys given here win on
            collision. ``None`` is forwarded only when both are ``None``.
        _dataset_version: Falls back to ``self.dataset_version`` when
            ``None``.

    Returns:
        Whatever ``self.client.run_experiment`` returns.

    Raises:
        ValueError: If ``data`` is ``None`` both here and on the context.
    """
    resolved_data = data if data is not None else self.data
    if resolved_data is None:
        raise ValueError(
            "`data` must be provided either on the RunnerContext or the run_experiment call"
        )

    resolved_dataset_version = (
        _dataset_version if _dataset_version is not None else self.dataset_version
    )

    merged_metadata: Optional[Dict[str, str]]
    if self.metadata is None and metadata is None:
        merged_metadata = None
    else:
        # Later unpacking wins, so call-site keys override context keys.
        merged_metadata = {**(self.metadata or {}), **(metadata or {})}

    return self.client.run_experiment(
        name=name,
        run_name=run_name,
        description=description,
        data=resolved_data,
        task=task,
        # A new list per call: a shared default list could leak state
        # between runs if anything downstream mutated it.
        evaluators=[] if evaluators is None else evaluators,
        composite_evaluator=composite_evaluator,
        run_evaluators=[] if run_evaluators is None else run_evaluators,
        max_concurrency=max_concurrency,
        metadata=merged_metadata,
        _dataset_version=resolved_dataset_version,
    )
class RegressionError(Exception):
    """Raised by a user's ``experiment`` function to signal a CI gate failure.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action catches this
    exception and, when ``should_fail_on_error`` is enabled, fails the
    workflow run and renders a callout in the PR comment using
    ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.

    Callers choose one of three forms:

    - ``RegressionError(result=r)`` — minimal, generic message.
    - ``RegressionError(result=r, message="...")`` — free-form message.
    - ``RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)`` —
      structured; ``metric`` and ``value`` must be provided together so the
      action can render a targeted callout without ``None`` placeholders.

    Attributes are stored exactly as passed: ``result``, ``metric``,
    ``value`` and ``threshold``.
    """

    @overload
    def __init__(self, *, result: ExperimentResult) -> None: ...
    @overload
    def __init__(self, *, result: ExperimentResult, message: str) -> None: ...
    @overload
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: str,
        value: float,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ) -> None: ...
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: Optional[str] = None,
        value: Optional[float] = None,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ):
        """Store the regression details and build the exception message.

        The message is chosen in this order: an explicit ``message``; a
        structured message when both ``metric`` and ``value`` are given
        (with the threshold appended only when one was supplied); otherwise
        a generic fallback.

        Args:
            result: The experiment result the regression refers to.
            metric: Name of the regressed metric, if any.
            value: Observed value of ``metric``, if any.
            threshold: Threshold the value was compared against, if any.
            message: Free-form message that overrides the generated one.
        """
        self.result = result
        self.metric = metric
        self.value = value
        self.threshold = threshold
        if message is not None:
            formatted = message
        elif metric is not None and value is not None:
            formatted = f"Regression on `{metric}`: {value}"
            # threshold is optional in the structured form; leave the suffix
            # off rather than printing "(threshold None)".
            if threshold is not None:
                formatted += f" (threshold {threshold})"
        else:
            formatted = "Experiment regression detected"
        super().__init__(formatted)
Raised by a user's experiment function to signal a CI gate failure.
Intended for use with the langfuse/experiment-action GitHub Action
(https://github.com/langfuse/experiment-action). The action catches this
exception and, when should_fail_on_error is enabled, fails the
workflow run and renders a callout in the PR comment using
metric/value/threshold if supplied, otherwise str(exc).
Callers choose one of three forms:
- RegressionError(result=r) — minimal, generic message.
- RegressionError(result=r, message="...") — free-form message.
- RegressionError(result=r, metric="acc", value=0.7, threshold=0.9) — structured; metric and value must be provided together so the action can render a targeted callout without None placeholders.
def __init__(
    self,
    *,
    result: ExperimentResult,
    metric: Optional[str] = None,
    value: Optional[float] = None,
    threshold: Optional[float] = None,
    message: Optional[str] = None,
):
    """Store the regression details and build the exception message.

    The message is chosen in this order: an explicit ``message``; a
    structured message when both ``metric`` and ``value`` are given (with
    the threshold appended only when one was supplied); otherwise a
    generic fallback.

    Args:
        result: The experiment result the regression refers to.
        metric: Name of the regressed metric, if any.
        value: Observed value of ``metric``, if any.
        threshold: Threshold the value was compared against, if any.
        message: Free-form message that overrides the generated one.
    """
    self.result = result
    self.metric = metric
    self.value = value
    self.threshold = threshold
    if message is not None:
        formatted = message
    elif metric is not None and value is not None:
        formatted = f"Regression on `{metric}`: {value}"
        # threshold is optional in the structured form; leave the suffix
        # off rather than printing "(threshold None)".
        if threshold is not None:
            formatted += f" (threshold {threshold})"
    else:
        formatted = "Experiment regression detected"
    super().__init__(formatted)
def is_default_export_span(span: ReadableSpan) -> bool:
    """Return whether a span should be exported by default.

    A span qualifies when any one of ``is_langfuse_span``,
    ``is_genai_span`` or ``is_known_llm_instrumentor`` accepts it. The
    checks run in that order and stop at the first match.
    """
    if is_langfuse_span(span):
        return True
    if is_genai_span(span):
        return True
    return is_known_llm_instrumentor(span)
Return whether a span should be exported by default.
def is_langfuse_span(span: ReadableSpan) -> bool:
    """Return whether the span was created by the Langfuse SDK tracer.

    True only when the span has an instrumentation scope whose name equals
    ``LANGFUSE_TRACER_NAME``; a span without a scope is never a match.
    """
    scope = span.instrumentation_scope
    if scope is None:
        return False
    return scope.name == LANGFUSE_TRACER_NAME
Return whether the span was created by the Langfuse SDK tracer.
def is_genai_span(span: ReadableSpan) -> bool:
    """Return whether the span has any ``gen_ai.*`` semantic convention attribute.

    Only attribute keys are inspected: the span matches as soon as one key
    is a string starting with ``gen_ai``. Spans whose ``attributes`` is
    ``None`` never match.
    """
    attributes = span.attributes
    if attributes is None:
        return False

    for attribute_key in attributes:
        # Non-string keys are skipped instead of raising on startswith.
        if isinstance(attribute_key, str) and attribute_key.startswith("gen_ai"):
            return True
    return False
Return whether the span has any gen_ai.* semantic convention attribute.
def is_known_llm_instrumentor(span: ReadableSpan) -> bool:
    """Return whether the span comes from a known LLM instrumentation scope.

    The span's instrumentation scope name is tested with
    ``_matches_scope_prefix`` against each entry of
    ``KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES``; the first match wins.
    Spans without an instrumentation scope never match.
    """
    scope = span.instrumentation_scope
    if scope is None:
        return False

    name = scope.name
    for known_prefix in KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES:
        if _matches_scope_prefix(name, known_prefix):
            return True
    return False
Return whether the span comes from a known LLM instrumentation scope.